# Fixing the filenames
Goal: don't actually move or rename the file but create a json file with structural information 

1. Prepare to fix the file names by checking how many specific punctuation characters (commas, dashes) they contain.
2. Structure them: each file has a description object `FileDescriptor`. A folder of files is held in a `DescriptorCollection` object. The final abstract structure is a dictionary of these collections. This is called a log, which can be stored on the drive.
3. The auto naming function can be improved.
4. A run-through of all files can assign a person to every file that was not auto-named.
5. A manual naming function gives options and an input field.
6. Log files can be (simply) merged.

## DO IT!

In order to skip the examples and go straight to the process, just execute the 6 required cells, which contain the class and function definitions. They are marked like this:

```# <<< 1/6```


In [None]:
# check where the files are first
#! cd ../../../data/NAIL_DATAFIELD_txt/parsed_v3 && pwd && ls

In [None]:
# <<< 1/6
import os
import json
import codecs
import re
from tqdm import tqdm
from random import choice
from copy import copy
from collections import Counter

from IPython.display import clear_output

Create a JSON object with the category folders as keys (e.g. law_legal_theory_prison_ip) and another dict as values:
```
{
    folder_name: <folder_name>,
    folder_path: <folder_path>,
    file_descriptors: {
        <file_key>: {
            file_name: <file_name>,
            auto_named: True|False,
            auto_group_name: <some_auto_name_group_name>,
            assigned_to: "M"|"R",
            manually_named: True|False,
            author_name: <first_name(,last_name)?>,
        },
        ...
    }
}
```


An auto naming algorithm, which will hopefully get better and better, is applied. For file names that are not detected automatically, a person is assigned to set the name manually.
For each auto naming of a file the following decision tree is run through:

### Decision tree:

- auto_name_found:
    - yes: auto_named already?
        - yes: same name?
            - yes: all cool
            - no : warning: don't set the new name, unless the flag is set (override_old_auto)
        - no : manually_named already?
            - yes: same name?
                - yes: all cool
                - no : critical warning. don't set (unless the flag override_manual is set)
            - no : cool. set name
    - no : auto_named already?
        - yes: critical warning. a change of the algo messed the auto naming up
        - no : assign to a person if not assigned already
        

In [None]:
# <<< 2/6

# names of the auto naming rules that FileDescriptor.auto_name_check can report
auto_name_groups = ['1comma_1dash','1comma_>1dash','1dash']

# two leading words, each followed by ONE separator: comma, dash or underscore.
# the dash is escaped on purpose: an unescaped `,-_` inside a character class is a
# RANGE (0x2C-0x5F) which would also accept digits, upper case letters, '.', ':' ...
two_words = re.compile(r"([a-zA-Z]+[,\-_]){2}")

class FileDescriptor:
    """
    Naming state of one text file.

    Attributes (exactly what `to_dict` serialises):
        file_name       : file name as handed to the constructor
        auto_named      : True when `author_name` was set by an auto naming rule
        auto_group_name : name of the auto naming rule that matched (for debugging)
        assigned_to     : person who has to name the file manually (M|R) or None
        manually_named  : True when `author_name` was set by hand
        author_name     : final author, JUST ONE. "" as long as it is unknown
    """

    def __init__(self, file_name, dict_ = None):
        """
        file_name: name of the described file
        dict_: optional dict as produced by `to_dict`. its values override the defaults
        """
        self.file_name = file_name
        self.auto_named = False # flag indicating that auto name applied
        self.auto_group_name = None # auto name rule name for debugging...
        self.assigned_to = None # assigned to person (M|R for splitting :D )
        self.manually_named = False # flag indicating that name was set manually
        self.author_name = "" # final author, JUST ONE

        if dict_:
            self.from_dict(dict_)

    def to_dict(self):
        """
        returns a (shallow) copy of all attributes as a json serialisable dict
        """
        return dict(self.__dict__)

    def from_dict(self, dict_):
        """
        takes over all values of dict_.
        keys which are missing in dict_ keep their current value, so a descriptor
        loaded from an incomplete dict still has every attribute
        """
        self.__dict__.update(dict_)

    def set_auto_name(self, author_name, auto_group_name, override_old_auto = False, override_manual = False, debug = False):
        """
        contains the decision tree for a name that the auto naming found:

        - auto named already: take the new name when it is the same or when
          override_old_auto is set. otherwise print a warning and keep the old one
        - manually named already: same name -> only remember the rule name.
          different name -> only taken when override_manual is set (the descriptor
          then counts as auto named), otherwise print a warning
        - not named at all: set the name and mark the descriptor as auto named
        """
        if debug: print('setting name to',author_name)

        if self.auto_named:
            if debug: print('auto_named already')
            if self.author_name == author_name or override_old_auto:
                # same name (all cool) or explicit override: store name and rule
                self.author_name = author_name
                self.auto_group_name = auto_group_name
            else:
                print('Warning: New Auto name does not match old one.')
                print(self.file_name,'old',self.author_name,'new auto',author_name)
                print('Not gonna take it. check your algo')
            return

        if debug: print('not auto_named yet')

        if not self.manually_named:
            # new find: name that shit!
            self.author_name = author_name
            self.auto_group_name = auto_group_name
            self.auto_named = True
            return

        if self.author_name == author_name:
            # manual and auto name agree. just remember the rule that matched
            self.auto_group_name = auto_group_name
        elif override_manual:
            self.author_name = author_name
            self.auto_group_name = auto_group_name
            self.auto_named = True
            self.manually_named = False
        else:
            print("Warning. Name has been set manually already.")
            print(self.file_name)
            print('old',self.author_name,'new auto',author_name)
            print('Not gonna take it. check your algo')

    def __repr__(self):
        return json.dumps(self.to_dict())

    def auto_name_check(self, debug = False):
        """
        THIS IS THE SMART FUNCTION. IF YOU KNOW SOME GOOD RULES TO FIND THE NAME FROM A TXT FILE PUT IT HERE

        does not change the descriptor. returns a tuple (author_name, rule_name)
        or None when no rule matches. the author is always everything in front
        of the first dash. rules by number of commas / dashes in the file name:

        - 1 comma, 1 dash   -> '1comma_1dash'  (lastname, firstname - title)
        - 1 comma, >1 dash  -> '1comma_>1dash' (title words separated with dashes)
        - 0 commas, 1 dash  -> '1dash'         (lastname - title)
        - everything else   -> None (do them manually. often no author)
        """
        f = self.file_name
        num_commas = f.count(',')
        num_dashes = f.count('-')
        if debug:
            print('checking',self.file_name)
            print(', :',num_commas,'   - :',num_dashes)

        group_name = None
        if num_commas == 1:
            if debug: print('found  1 comma')
            if num_dashes == 1:
                group_name = '1comma_1dash'
            elif num_dashes > 1:
                group_name = '1comma_>1dash'
        elif num_commas == 0:
            if num_dashes == 1:
                if debug: print('found 1dash')
                group_name = '1dash'
            elif num_dashes > 1:
                if debug: print('found  >1 -')

        if group_name is None:
            return None
        author_name = f.partition('-')[0].strip()
        if not author_name:
            # file name starts with the dash: nothing in front of it, so no author found
            return None
        return (author_name, group_name)

    def auto_name(self, override_old_auto = False, override_manual = False, debug= False):
        """
        runs `auto_name_check` and hands a found name over to `set_auto_name`.
        when nothing is found for a descriptor that was auto named before, a warning
        is printed and the old name is kept (a change of the algo messed it up)
        """
        found = self.auto_name_check(debug)
        if found:
            author_name, group_name = found
            self.set_auto_name(author_name, group_name, override_old_auto, override_manual, debug)
        elif self.auto_named:
            print('Warning: no auto name found anymore, but the file has been auto named before.')
            print(self.file_name,'old',self.author_name)
            print('Keeping the old name. check your algo')

    def manual_name_options(self):
        """
        returns the options offered for manual naming as dict:
        'no' is always there, 'guess' only when the file name (spaces removed)
        starts with two words which are each followed by a separator
        """
        file_name_alt = self.file_name.replace(' ','')
        options = {'no':'n: no author'}
        find_name_match = two_words.match(file_name_alt)
        if find_name_match:
            # cut off the separator behind the 2nd word and unify the separators to commas
            potential_name = file_name_alt[:find_name_match.end()-1]
            potential_name = potential_name.replace('-',',').replace('_',',')
            options['guess'] = 'g: ' + potential_name
        return options

    def manual_naming(self):
        """
        asks for the author name on the input field.
        empty input: nothing is changed and False is returned.
        'n' takes the 'no author' option, 'g' the guess (if there is one),
        anything else is taken as the name. returns True in these cases
        """
        options = self.manual_name_options()
        text = ['set first_name(, last_name)? for',self.file_name,'']
        text.extend(options.values())
        name = input('\n'.join(text) + '\n\n')
        if name == '':
            return False

        if name == 'n':
            # the complete option label is stored, so the author is not empty
            self.author_name = options['no']
        elif name == 'g' and 'guess' in options:
            # strip the 'g: ' prefix of the option label
            self.author_name = options['guess'][3:]
        else:
            self.author_name = name
        self.auto_named = False
        # keep auto_name_group so we see, if there was something before
        self.manually_named = True
        return True

    def simple_manual_merge(self, other_desrc):
        """
        returns a copy of this descriptor. when only the other descriptor is
        manually named, the copy takes over its author name
        """
        new_file_descr = copy(self)
        if other_desrc.manually_named and not new_file_descr.manually_named:
            new_file_descr.manually_named = True
            new_file_descr.author_name = other_desrc.author_name
        return new_file_descr
    
class DescriptorCollection:
    """
    File descriptors for a folder of files

    Attributes:
        folder_name      : name of the folder (used as key in a log)
        folder_path      : path to the folder
        file_descriptors : dict file key -> FileDescriptor
    """
    def __init__(self,folder_name,folder_path, dict_ = None):
        """
        dict_: optional dict as produced by `to_dict`. its values override the other parameters
        """
        self.folder_name = folder_name
        self.folder_path = folder_path
        self.file_descriptors = {}

        if dict_:
            self.from_dict(dict_)

    def to_dict(self):
        """
        returns all attributes as json serialisable dict (descriptors as dicts)
        """
        dict_ = self.__dict__.copy()
        dict_['file_descriptors'] = {name: descr.to_dict() for name, descr in self.file_descriptors.items()}
        return dict_

    def from_dict(self, dict_):
        """
        takes over the values of dict_ and turns the stored descriptor dicts into
        FileDescriptor objects. dict_ itself is neither kept nor changed
        """
        # work on a copy: using dict_ directly as __dict__ would overwrite the
        # caller's descriptor dicts with objects and tie later attribute changes to it
        state = dict(dict_)
        stored_descriptors = state.get('file_descriptors') or {}
        state['file_descriptors'] = {name: FileDescriptor('', descr_dict)
                                     for name, descr_dict in stored_descriptors.items()}
        self.__dict__.update(state)

    def __repr__(self):
        return json.dumps(self.to_dict())

    def short_description(self):
        """
        json string like `to_dict`, with the number of descriptors instead of the descriptors
        """
        dict_ = self.__dict__.copy()
        dict_['file_descriptors'] = len(self.file_descriptors)
        return json.dumps(dict_)

    def build(self, file_filter = None, file_name_processor= None):
        """
        for initialisation:
        builds the descriptors for the files in the folder.
        the actual selection of files can be filtered with a passed function
        (file name -> bool). for each selected file an empty descriptor is created.
        file_name_processor (file name -> key) can change the key under which
        the descriptor is stored. the descriptor keeps the original file name
        """
        for file_ in os.listdir(self.folder_path):
            if file_filter and not file_filter(file_):
                continue
            file_key = file_name_processor(file_) if file_name_processor else file_
            self.file_descriptors[file_key] = FileDescriptor(file_)

    def get_descriptor(self, file_name_or_index):
        """
        returns the descriptor for a file key (str) or for a position (int).
        any other type gives None
        """
        if isinstance(file_name_or_index, str):
            return self.file_descriptors[file_name_or_index]
        if isinstance(file_name_or_index, int):
            return list(self.file_descriptors.values())[file_name_or_index]
        return None

    def auto_name_all(self, override_old_auto = False, override_manual = False, debug= False):
        """
        runs the auto naming on every descriptor of the collection
        """
        for file_descr in self.file_descriptors.values():
            file_descr.auto_name(override_old_auto, override_manual, debug)

    def info(self, printIt=True):
        """
        counts the docs, the auto named ones, the assigned ones and how many of the
        ASSIGNED ones are manually named (also per person). prints it when printIt is set.
        returns the numbers as dict
        """
        num_docs = len(self.file_descriptors)
        num_auto_named = sum(1 for text_descr in self.file_descriptors.values() if text_descr.auto_named)
        num_man_named = 0
        num_assigned = 0
        assignment_names_counter = Counter()
        man_named_names_counter = Counter()
        for text_descr in self.file_descriptors.values():
            if not text_descr.assigned_to:
                # manually named descriptors without assignment are not counted
                continue
            assignment_names_counter[text_descr.assigned_to] += 1
            num_assigned += 1
            if text_descr.manually_named:
                man_named_names_counter[text_descr.assigned_to] += 1
                num_man_named += 1
        if printIt:
            print('Collection:',self.folder_name)
            print(num_docs, 'docs')
            print(num_auto_named, 'auto named')
            print(num_man_named,'/',num_assigned, 'manually named/assigned', assignment_names_counter, man_named_names_counter)
        return {
            "folder_name": self.folder_name,
            "num_docs": num_docs,
            "num_auto_named": num_auto_named,
            "num_man_named": num_man_named,
            "num_assigned": num_assigned,
            "assignment_names_counter" : assignment_names_counter,
            "man_named_names_counter": man_named_names_counter
        }

    def file_descr_list(self):
        """
        all descriptors as list
        """
        return list(self.file_descriptors.values())

    def get_all_missing(self):
        """
        all descriptors without an author name
        """
        return [text_descr for text_descr in self.file_descr_list() if not text_descr.author_name]

    def get_all_assigned_to(self, name = None, only_missing = False):
        """
        all descriptors assigned to `name`.
        with only_missing the manually named ones are left out
        """
        return [text_descr for text_descr in self.file_descr_list()
                if text_descr.assigned_to == name and not (only_missing and text_descr.manually_named)]

    def merge(self, other_collection):
        """
        NOT IMPLEMENTED YET: does nothing and returns None.

        plan: merge in another collection. should be the same basis, just with
        different file descriptor values: manually assigned author names... basically
        should create a NEW COLLECTION
        Preference goes to auto_naming, if it doesn't exist in the 2nd collection.
        that means the other has a smarter algo
        """
        # TODO
#         for file_descr_name in self.file_descriptors:
#             # file descriptor not in other collection? weird... ignore
#             if file_descr_name not in other_collection:
#                 print(file_descr, 'is not in the 2nd collection. not gonna take it')
#             file_descr1 = self.file_descriptors[file_descr_name]
#             file_descr2 = other_collection.file_descriptors[file_descr_name]
#             new_file_descr = FileDescriptor(file_descr1.file_name)
#             # we need to cover 4 cases. auto-auto, auto-man, man-auto, man-man,
#             # if file_descr1.auto_named and not file_descr2.auto_named

    def simple_manual_merge(self, other_collection):
        """
        just slam the file descriptors together to complete the authors.
        assumption is that auto naming is the same and just differently assigned
        descriptors are merged.
        a descriptor which is missing in the other collection is copied as it is.
        Returns a new collection
        """
        new_col_desrc = DescriptorCollection(self.folder_name, self.folder_path)
        for file_descr_name, file_descr1 in self.file_descriptors.items():
            file_descr2 = other_collection.file_descriptors.get(file_descr_name)
            if file_descr2 is None:
                print(file_descr_name, 'is not in the 2nd collection. keeping it unchanged')
                new_file_descr = copy(file_descr1)
            else:
                new_file_descr = file_descr1.simple_manual_merge(file_descr2)
            new_col_desrc.file_descriptors[file_descr_name] = new_file_descr
        return new_col_desrc

    def get_all_auto_named(self):
        """
        all auto named descriptors
        """
        return [file_descr for file_descr in self.file_descr_list() if file_descr.auto_named]

    def get_all_manualy_named(self):
        """
        all manually named descriptors that have an author name
        """
        return [file_descr for file_descr in self.file_descr_list() if file_descr.manually_named and file_descr.author_name]

Next part is simple.
Initiate the description files for our corpus.

This is only for initialisation. If it's done you can just **load a log file**...
At the end we get a dict, where the keys are foldernames and the values are DescriptorCollections
```
{
    folder_name: {
        <DescriptorCollections>: as_json:
        folder_name: <folder_name>
        folder_path: <folder_path>,
        file_descriptors: <dict of file key -> FileDescriptor>
    }
},
...
```

In [None]:
# <<< 3/6

def build_descr_folder(base_path, folder_names, file_filter, file_name_processor):
    """
    builds a DescriptorCollection for each of the given folders in one base folder.
    file_filter and file_name_processor are handed over to the build of each collection.
    returns a dict {key: folder_name ; value: DescriptorCollection of that folder}
    """
    log = {}
    for folder_name in folder_names:
        # join instead of string concatenation: works with and without trailing slash
        descr_folder = DescriptorCollection(folder_name, os.path.join(base_path, folder_name))
        descr_folder.build(file_filter, file_name_processor)
        log[folder_name] = descr_folder
    return log

def load_descr_folder_from_dict(dict_):
    """
    turns a dict {folder_name: collection as dict} into a log,
    i.e. a dict {folder_name: DescriptorCollection}
    """
    return {col_folder: DescriptorCollection(None, None, col_dict)
            for col_folder, col_dict in dict_.items()}

Next we define some methods to initiate a set of collections, read and dump them to drive

In [None]:
# <<< 4/6

def init_log(base_path, file_filter = None, file_name_processor = None):
    """
    initialises a log: one collection for each sub folder of base_path.
    file_filter and file_name_processor are handed over to the build of the collections.
    prints the number of files per folder and the total. returns the log
    """
    # make sure the path ends with a separator, so folder names can be appended
    base_path = os.path.join(base_path, '')
    folder_names = [obj for obj in os.listdir(base_path) if os.path.isdir(base_path + obj)]
    # use the passed base_path, not the global main_path
    log = build_descr_folder(base_path, folder_names, file_filter, file_name_processor)

    total_size = 0
    for folder, collection in log.items():
        num_files = len(collection.file_descriptors)
        print(folder)
        print(num_files,'log files')
        total_size += num_files
    print('TOTAL SIZE',total_size)
    return log

        
def read_log(log_file_name):
    """
    reads a log (dict of collections) from a json file.
    when the file does not exist a message is printed and an empty log is returned.
    nothing is written: the file gets created with the next write_log_file
    """
    try:
        with codecs.open(log_file_name,encoding='utf-8') as fin:
            log_dict = json.load(fin)
    except FileNotFoundError:
        print('log file',log_file_name,'not found. starting with an empty log')
        return {}
    return load_descr_folder_from_dict(log_dict)


def write_log_file(log, log_file_name):
    """
    dumps a log (dict of collections) as utf-8 json file to the drive.
    an existing file is overwritten
    """
    serialisable = {}
    for col_name in log:
        serialisable[col_name] = log[col_name].to_dict()
    with codecs.open(log_file_name,'w', encoding='utf-8') as fout:
        as_json = json.dumps(serialisable, indent=2, ensure_ascii=False)
        fout.write(as_json)
        
def sum_info(log, printIt=True):
    """
    sums up the info of all collections of a log.
    prints the totals when printIt is set and returns them as dict.
    each key is the key of the collection info with the prefix 'sum_'
    """
    totals = {
        "sum_num_docs": 0,
        "sum_num_auto_named": 0,
        "sum_num_man_named": 0,
        "sum_num_assigned": 0,
        "sum_assignment_names_counter": Counter(),
        "sum_man_named_names_counter": Counter()
    }
    prefix_length = len("sum_")

    for collection in log.values():
        info = collection.info(False)
        for total_key in totals:
            totals[total_key] += info[total_key[prefix_length:]]

    if printIt:
        print('TOTAL NUM OF DOCS:',totals["sum_num_docs"])
        print("Auto named:",totals["sum_num_auto_named"])
        print("Manually named",totals["sum_num_man_named"])
        print("Done/Assigned:",totals["sum_man_named_names_counter"],totals["sum_assignment_names_counter"])
    return totals
        

In [None]:
# root folder of the corpus: each sub folder becomes one collection. keep the trailing slash
main_path = '../../../data/NAIL_DATAFIELD_txt/parsed_v3/'

def valid_file_filter(file_name):
    """
    True for file names which contain the marker '_valid'
    """
    return file_name.find('_valid') >= 0

def valid_file_name_processor(file_name):
    """
    file key for a valid file: the file name without the ending '_valid.txt'.
    names with another ending are returned unchanged (instead of blindly
    cutting off their last 10 characters)
    """
    suffix = '_valid.txt'
    if file_name.endswith(suffix):
        return file_name[:-len(suffix)]
    return file_name

# build a fresh log for all collection folders below main_path and store it on the drive
log = init_log(main_path, valid_file_filter, valid_file_name_processor)
write_log_file(log,'log.json')

# now we can grab a file descriptor either by some index or by its file key
# (the key is what valid_file_name_processor returns for the file name):
file_descr = log['arts_arthistory_aesthetics'].get_descriptor(0)
print(json.dumps(file_descr.to_dict(), indent = 2))
file_descr = log['arts_arthistory_aesthetics'].get_descriptor('Batsford - Gwen.White-Perspective.A.Guide.For.Artists,.Architects.and.Designers')
print(json.dumps(file_descr.to_dict(), indent = 2))

In [None]:
# testing read
# log = read_log('log.json')

Checking a single file if it would find a name:

In [None]:
# dry run: only returns the found (author name, rule name) or None. the descriptor is not changed
file_descr.auto_name_check(debug = True)
# that looks good...

Let's test the auto naming function: if it finds a name and the decision tree rules allow it, it sets the author name on the file descriptor.

In [None]:
# test auto_name: unlike auto_name_check this stores a found name in the descriptor
file_descr.auto_name(debug= True)
print(json.dumps(file_descr.to_dict(), indent = 2))

manual naming could work like this

In [None]:
#file_descr.manual_naming()
#print(json.dumps(file_descr.to_dict(), indent = 2))

Now we go through all files in bulk. Instead of `auto_name_check` we use `auto_name`,
which will call `set_auto_name` in case we found something.
The complete check also allows us to assign a random 'person name' to each text document which has not been auto-named. At the end, we get an overview.


In [None]:
# <<< 5/6

def complete_check(dict_of_collections, assign_equally_to = None,
                   override_old_auto = False, override_manual = False, debug= False):
    """
    runs the auto naming on all collections of a log.
    override_old_auto, override_manual and debug are handed over to the auto naming.

    when assign_equally_to contains a list, it will randomly choose one of its
    entries for each file that is not auto-named and not assigned to anybody yet.
    existing assignments are kept, so the check can be repeated on a loaded log
    """
    for collection in dict_of_collections.values():
        collection.auto_name_all(override_old_auto, override_manual, debug)
        if not assign_equally_to:
            continue
        for file_descr in collection.file_descriptors.values():
            if not file_descr.auto_named and not file_descr.assigned_to:
                file_descr.assigned_to = choice(assign_equally_to)


In [None]:
# auto name everything and spread the not auto named files randomly over M and R
complete_check(log,['M','R'])
print()
# overview for each collection
for col in log:
    log[col].info()
    print()

Let's have a look at all assigned descriptors of own_mixed_collection

In [None]:
# all descriptors of the collection which are assigned to somebody
[file_descr for file_descr in list(log['own_mixed_collection'].file_descriptors.values()) if file_descr.assigned_to]
# we could also call `get_all_missing` which returns all file descriptors which don't have an author yet
# log['own_mixed_collection'].get_all_missing()

nice... now lets get all of them assigned to 'R'. I added a function for that, to have it handy

In [None]:
# all descriptors assigned to 'R', no matter if they are named already
assigned_to_R = log['own_mixed_collection'].get_all_assigned_to('R')
assigned_to_R

Now let's manually name them and get the info of that collection again.

In [None]:
# ask for the author of each descriptor. an empty input skips the descriptor here,
# since the return value of manual_naming is ignored
for file_descr in assigned_to_R:
    file_descr.manual_naming()
    clear_output()
print('cool all done')
log['own_mixed_collection'].info()

If you are tired you can write the log now. Later you can also just grab those which are not set yet.
The second parameter `only_missing` (default `False`) does just that...

In [None]:
# only_missing = True: leave out the descriptors which are manually named already
assigned_to_R = log['own_mixed_collection'].get_all_assigned_to('R', True)
assigned_to_R
# EMPTY SINCE R did all of his files...

we want to merge 2 logs. Let's not rely on git with that... :)


In [None]:
# <<< 6/6

def simple_manual_merge_logs(log1, log2):
    """
    merges two logs collection by collection with the simple manual merge.
    a collection of log1 which is missing in log2 is reported and taken over
    unchanged (the same object, no copy), so it does not get lost.
    collections which are only in log2 are ignored.
    returns a new log
    """
    final_log = {}
    for col_name, collection in log1.items():
        if col_name in log2:
            final_log[col_name] = collection.simple_manual_merge(log2[col_name])
        else:
            print('collection', col_name,'is missing')
            final_log[col_name] = collection
    return final_log
# simple_manual_merge_logs(log,log)

Testing the merge...
A bit hacky...
- initiate a second log `log2`,
- do auto naming without assignments
- copy the assignments of `own_mixed_collection` from `log` to `log2`
- manually name `own_mixed_collection` of `log2` for M
- now `log` has all files of R manually named and `log2` all files of M
- merge them

In [None]:
# still a bit strange...

# initiate a second log
log2 = init_log(main_path, valid_file_filter, valid_file_name_processor)
# do auto naming without assignments (no list of persons is passed)
complete_check(log2)

# copy the assignments of own_mixed_collection from log to log2
# (file_descr is the file key here, not a descriptor)
for file_descr in log['own_mixed_collection'].file_descriptors:
   assigned_to = log['own_mixed_collection'].file_descriptors[file_descr].assigned_to
   log2['own_mixed_collection'].file_descriptors[file_descr].assigned_to = assigned_to

# manually name log2 own_mixed_collection for M
assigned_to_M = log2['own_mixed_collection'].get_all_assigned_to('M')
print(assigned_to_M)
for file_descr in assigned_to_M:
    file_descr.manual_naming()
    clear_output()

log2['own_mixed_collection'].info()

# merge: the manual names of log2 complete the descriptors of log
newLog = simple_manual_merge_logs(log,log2)
newLog['own_mixed_collection'].info() 
newLog['own_mixed_collection'].get_all_manualy_named()

**Last final part!**
*Run all the required 6 cells before*

1. initiate or load

2. load a second and merge it in

3. run through all collections and manually fill in the missing file descriptions

4. save it

In [None]:
# THE MAIN PROCESS!

# the person who is naming: only files assigned to this name are asked for below
I_AM = 'R' # alternative 'M'

# root folder of the corpus: each sub folder is one collection. keep the trailing slash
main_path = '../../../data/NAIL_DATAFIELD_txt/parsed_v3/'

def valid_file_filter(file_name):
    """
    True for file names which contain the marker '_valid'
    """
    return file_name.find('_valid') >= 0

def valid_file_name_processor(file_name):
    """
    file key for a valid file: the file name without the ending '_valid.txt'.
    names with another ending are returned unchanged (instead of blindly
    cutting off their last 10 characters)
    """
    suffix = '_valid.txt'
    if file_name.endswith(suffix):
        return file_name[:-len(suffix)]
    return file_name

# 1. initiate or load
# init
# log = init_log(main_path, valid_file_filter, valid_file_name_processor)
# complete_check(log,['M','R'])

# load
log = read_log('log.json')

for col in log.values():
    col.info()

print()
sum_info(log)
print()

# 2. load a second and merge it in (if exists)
if os.path.isfile('alt_log.json'):
    alt_log = read_log('alt_log.json')
    log = simple_manual_merge_logs(log,alt_log)

# 3. run through all collections and manually fill in missing file descriptions
# an empty input stops the whole run. the flag is not called `quit`,
# so the builtin of that name stays usable in the notebook
stop_requested = False
for collection in log.values():
    # only my files which are not manually named yet
    assigned_to_me = collection.get_all_assigned_to(I_AM, True)
    for file_descr in assigned_to_me:
        if not file_descr.manual_naming():
            stop_requested = True
            break
        clear_output()
    if stop_requested:
        break

clear_output()

sum_info(log)

# 4. save it (also after stopping early, so nothing named so far gets lost)
write_log_file(log,'log.json')