# Make Stylometrie Function version

In [2]:
%matplotlib inline

In [16]:
# make the Vasari files
datapath = ('./data/results/Vasari_letters_vite')

In [89]:
# several functions to compiles all of the text files  
# associated with a single author or unkown into a single string
# 
import os
import nltk
import math
import matplotlib.pyplot as plt
import numpy as np

def read_files_into_string(datapath, filenames):
    strings = []
    for file in filenames:
        with open(datapath + '/' + file) as f:
            strings.append(f.read())
    return '\n'.join(strings)


# function to build list of filename per author
def get_filenames_per_author_into_dict(datapath):
    list_of_files = os.listdir(datapath)
    store = list()
    for i in list_of_files:
        store.append(i.split('_')[0])
    authors = list(set(store))
    storage = {}
    count = 0
    for a in authors:
        store2 = list()
        for f in list_of_files:
            if f.startswith(a):
                store2.append(f)
            else:
                continue
        storage[authors[count]] = store2
        count += 1
        
    return storage

# put every author into a single string in a dict
def get_single_string_per_author_into_dict(datapath):
    x = get_filenames_per_author_into_dict(datapath)
    authors = list(x.keys())
    strings_by_author = {}
    
    for author in authors:
        strings_by_author[author] = read_files_into_string(datapath, x[author])

    return strings_by_author


# transform authors' corpora into lists of word tokens
def get_tokens_by_author(datapath):
    # call function to process data first
    strings = get_single_string_per_author_into_dict(datapath)
    authors = strings.keys()
    
    # storage variables
    length_distribution = []
    sorted_length = []
    author_tokens = {}
    author_length_distributions = {}
    
    for author in authors:
        tokens = nltk.word_tokenize(strings[author], language = 'italian')
    
    # Filter out punctuation
        author_tokens[author] = ([token for token in tokens if any(c.isalpha() for c in token)])

    # Get a distribution of token lengths
        token_lengths = [len(token) for token in author_tokens[author]]
        author_length_distributions[author] = nltk.FreqDist(token_lengths)
    #author_length_distributions[author].plot(15,title=author)
        sorted_length.append((dict(sorted((dict(author_length_distributions[author])).items()))))
        length_distribution.append(author_length_distributions[author])
    
    return author_tokens




def get_tokens_by_author_extended(datapath):
    # call function to process data first
    strings = get_single_string_per_author_into_dict(datapath)
    authors = strings.keys()
    
    # storage variables
    length_distribution = []
    sorted_length = []
    author_tokens = {}
    author_length_distributions = {}
    
    for author in authors:
        tokens = nltk.word_tokenize(strings[author], language = 'italian')
    
    # Filter out punctuation
        author_tokens[author] = ([token for token in tokens if any(c.isalpha() for c in token)])

    # Get a distribution of token lengths
        token_lengths = [len(token) for token in author_tokens[author]]
        author_length_distributions[author] = nltk.FreqDist(token_lengths)
    #author_length_distributions[author].plot(15,title=author)
        sorted_length.append((dict(sorted((dict(author_length_distributions[author])).items()))))
        length_distribution.append(author_length_distributions[author])
    
    return {"length_distribution": length_distribution, "sorted_length": sorted_length, "author_tokens": author_tokens, "author_length_distributions": author_length_distributions}

In [99]:
# build Deltascore function

def get_delta_score(datapath,
                    author_candidates = ('Vasari', 'Borghini'),
                   disputed = 'unkown',
                   X = 500, 
                   language = 'italian'):
    """input variables: 
    datapath, 
    author_candidates(default = ('Vasari', 'Borghini')), 
    disputed_text(default = 'unkown') named'unkown'
    X(default = 500) = number of the most common words to use as features
    language(default = 'italian')"""
 
    # set all tokens lower case
    tokens_by_author = get_tokens_by_author(datapath)
    for author in author_candidates:
        tokens_by_author[author] = ([tok.lower() for tok in tokens_by_author[author]])
        
    # Combine every paper except our test case into a single corpus
    whole_corpus = []
    for author in author_candidates:
        whole_corpus += tokens_by_author[author]
        
    # Get a frequency distribution
    whole_corpus_freq_dist = list(nltk.FreqDist(whole_corpus).most_common(X)) # ! # X VARIABLE here! <--------- !

    # Calculating features for each subcorpus
    # The main data structure
    features = [word for word,freq in whole_corpus_freq_dist]
    feature_freqs = {}

    for author in author_candidates:
    # A dictionary for each candidate's features
        feature_freqs[author] = {}

    # A helper value containing the number of tokens in the author's subcorpus
        overall = len(tokens_by_author[author])

    # Calculate each feature's presence in the subcorpus
        for feature in features:
            presence = tokens_by_author[author].count(feature)
            feature_freqs[author][feature] = presence / overall
            
    # Calculating feature averages and standard deviations

    # The data structure into which we will be storing the "corpus standard" statistics
    corpus_features = {}

    # For each feature...
    for feature in features:
    # Create a sub-dictionary that will contain the feature's mean
    # and standard deviation
        corpus_features[feature] = {}
    
    # Calculate the mean of the frequencies expressed in the subcorpora
        feature_average = 0
        for author in author_candidates:
            feature_average += feature_freqs[author][feature]
        feature_average /= len(author_candidates)
        corpus_features[feature]["Mean"] = feature_average

    # Calculate the standard deviation using the basic formula for a sample
        feature_stdev = 0
        for author in author_candidates:
            diff = feature_freqs[author][feature] - corpus_features[feature]["Mean"]
            feature_stdev += diff*diff
        feature_stdev /= (len(author_candidates) - 1)
        feature_stdev = math.sqrt(feature_stdev)
        corpus_features[feature]["StdDev"] = feature_stdev
    
    # Calculating z-scores

    feature_zscores = {}
    for author in author_candidates:
        feature_zscores[author] = {}
        for feature in features:

        # Z-score definition = (value - mean) / stddev
        # We use intermediate variables to make the code easier to read
            feature_val = feature_freqs[author][feature]
            feature_mean = corpus_features[feature]["Mean"]
            feature_stdev = corpus_features[feature]["StdDev"]
            feature_zscores[author][feature] = ((feature_val-feature_mean) /
                                            feature_stdev)
    
    
    # Calculating features and z-scores for the "unkown" = disputed biographies

    # Tokenize the test case
    testcase_tokens = tokens_by_author[disputed]

    # Calculate the test case's features
    overall = len(testcase_tokens)
    testcase_freqs = {}
    for feature in features:
        presence = testcase_tokens.count(feature)
        testcase_freqs[feature] = presence / overall

    # Calculate the test case's feature z-scores
    testcase_zscores = {}
    for feature in features:
        feature_val = testcase_freqs[feature]
        feature_mean = corpus_features[feature]["Mean"]
        feature_stdev = corpus_features[feature]["StdDev"]
        testcase_zscores[feature] = (feature_val - feature_mean) / feature_stdev
        #print("Test case z-score for feature", feature, "is", testcase_zscores[feature])
    
    print('the number of features is: ', len(features))  
    
    # Calculate Delta
    storage = { "X": X}
    for author in author_candidates:
        delta = 0
        for feature in features:
            delta += math.fabs((testcase_zscores[feature] -
                                feature_zscores[author][feature]))
        delta /= len(features)
        storage[author] = delta
        print( "- for ", disputed, " the Delta score for candidate", author, "is", round(delta,4) )
    
    print('\nthe nltk.FreqDist(whole_corpus).most_common(X) value was: ', X, '; tokenizer: language =', language) 
    return storage

In [103]:
get_delta_score(datapath,('Bartoli', 'Borghini', 'Giambullari', 'Vcopy'), disputed = 'Vasari', X = 500)

the number of features is:  500
- for  Vasari  the Delta score for candidate Bartoli is 0.9357
- for  Vasari  the Delta score for candidate Borghini is 0.914
- for  Vasari  the Delta score for candidate Giambullari is 1.2889
- for  Vasari  the Delta score for candidate Vcopy is 0.6136

the nltk.FreqDist(whole_corpus).most_common(X) value was:  500 ; tokenizer: language = italian


{'X': 500,
 'Bartoli': 0.9357027495520863,
 'Borghini': 0.9139842885525475,
 'Giambullari': 1.2888761965891056,
 'Vcopy': 0.6135980418770737}

In [102]:
get_delta_score(datapath,('Bartoli', 'Borghini', 'Giambullari', 'Vcopy'), disputed = 'Vasari', X = 50)

the number of features is:  50
- for  Vasari  the Delta score for candidate Bartoli is 1.1419
- for  Vasari  the Delta score for candidate Borghini is 1.1717
- for  Vasari  the Delta score for candidate Giambullari is 1.4253
- for  Vasari  the Delta score for candidate Vcopy is 0.575

the nltk.FreqDist(whole_corpus).most_common(X) value was:  50 ; tokenizer: language = italian


{'X': 50,
 'Bartoli': 1.1418627181352425,
 'Borghini': 1.1717097713405429,
 'Giambullari': 1.4252578269243714,
 'Vcopy': 0.5749756716639481}

In [101]:
get_delta_score(datapath,('Bartoli', 'Borghini', 'Giambullari', 'Vasari', 'Vcopy'), X = 50)

the number of features is:  50
- for  unkown  the Delta score for candidate Bartoli is 1.4788
- for  unkown  the Delta score for candidate Borghini is 1.845
- for  unkown  the Delta score for candidate Giambullari is 2.125
- for  unkown  the Delta score for candidate Vasari is 1.6158
- for  unkown  the Delta score for candidate Vcopy is 1.7602

the nltk.FreqDist(whole_corpus).most_common(X) value was:  50 ; tokenizer: language = italian


{'X': 50,
 'Bartoli': 1.4788289405455124,
 'Borghini': 1.845040173049742,
 'Giambullari': 2.1250284323044517,
 'Vasari': 1.6158125561561507,
 'Vcopy': 1.7601945555582637}

In [100]:
get_delta_score(datapath, X = 50)

the number of features is:  50
- for  unkown  the Delta score for candidate Vasari is 4.0188
- for  unkown  the Delta score for candidate Borghini is 4.1429

the nltk.FreqDist(whole_corpus).most_common(X) value was:  50 ; tokenizer: language = italian


{'X': 50, 'Vasari': 4.0187661559418855, 'Borghini': 4.142939558936843}

In [94]:
#?????test = get_tokens_p_author_into_dict(datapath)

- for  unkown  the Delta score for candidate Vasari is 6.39789960050083
- for  unkown  the Delta score for candidate Borghini is 6.427139171358765

the nltk.FreqDist(whole_corpus).most_common(X) value was:  500 ; tokenizer: language = italian


{'X': 500, 'Vasari': 6.39789960050083, 'Borghini': 6.427139171358765}

In [96]:
get_delta_score(datapath, author_candidates = ('Bartoli', 'Borghini', 'Giambullari', 'Vasari', 'Vcopy'), X = 7000)

- for  unkown  the Delta score for candidate Bartoli is 1.2377323231396937
- for  unkown  the Delta score for candidate Borghini is 1.2635584021906843
- for  unkown  the Delta score for candidate Giambullari is 1.0613820175750421
- for  unkown  the Delta score for candidate Vasari is 1.1595709261708145
- for  unkown  the Delta score for candidate Vcopy is 1.2607271753521416

the nltk.FreqDist(whole_corpus).most_common(X) value was:  7000 ; tokenizer: language = italian


{'X': 7000,
 'Bartoli': 1.2377323231396937,
 'Borghini': 1.2635584021906843,
 'Giambullari': 1.0613820175750421,
 'Vasari': 1.1595709261708145,
 'Vcopy': 1.2607271753521416}

In [97]:
get_delta_score(datapath, author_candidates = ('Bartoli', 'Borghini', 'Giambullari', 'Vasari', 'Vcopy', 'unkown'), X = 100)

- for  unkown  the Delta score for candidate Bartoli is 1.4220630660309481
- for  unkown  the Delta score for candidate Borghini is 1.4885549299182328
- for  unkown  the Delta score for candidate Giambullari is 1.8512282543585277
- for  unkown  the Delta score for candidate Vasari is 1.3195617334711442
- for  unkown  the Delta score for candidate Vcopy is 1.4504749606798273
- for  unkown  the Delta score for candidate unkown is 0.0

the nltk.FreqDist(whole_corpus).most_common(X) value was:  100 ; tokenizer: language = italian


{'X': 100,
 'Bartoli': 1.4220630660309481,
 'Borghini': 1.4885549299182328,
 'Giambullari': 1.8512282543585277,
 'Vasari': 1.3195617334711442,
 'Vcopy': 1.4504749606798273,
 'unkown': 0.0}

In [98]:
help(get_delta_score)

Help on function get_delta_score in module __main__:

get_delta_score(datapath, author_candidates=('Vasari', 'Borghini'), disputed='unkown', X=500, language='italian')
    input variables: 
    datapath, 
    author_candidates(default = ('Vasari', 'Borghini')), 
    disputed_text(default = 'unkown') named'unkown'
    X(default = 500) = number of the most common words to use as features
    language(default = 'italian')



## count 'disegn' and 'rinasci'

In [87]:
storage = {}
for i in x['author_tokens']:
    disegno = []
    xdisegn = []
    rinascita = []
    for t in x['author_tokens'][i]:
        if t.startswith('disegn'):
            disegno.append(t)
        elif 'disegn' in t:
            xdisegn.append(t)
        elif t.startswith('rinasc'):
            rinascita.append(t)
        else:
            continue
    storage[i] = {'disegno': disegno, 'rinascita': rinascita }
    print(str(i).upper() + ' mentioned \'disegn\'' + str(len(disegno)) + ' times.\n Relative to textlength:' + str((len(disegno)/len(x['author_tokens'][i]))))
    print(str(i) + ' mentioned \'rinasc\'' + str(len(rinascita)) + ' times.\n'  + str(rinascita))
        

VASARI mentioned 'disegn'235 times.
 Relative to textlength:0.0016193271867807775
Vasari mentioned 'rinasc'1 times.
['rinascere']
BORGHINI mentioned 'disegn'138 times.
 Relative to textlength:0.0016928153481924903
Borghini mentioned 'rinasc'1 times.
['rinascente']
BARTOLI mentioned 'disegn'16 times.
 Relative to textlength:0.0007379733407130668
Bartoli mentioned 'rinasc'0 times.
[]
UNKOWN mentioned 'disegn'1831 times.
 Relative to textlength:0.0025917260102564974
unkown mentioned 'rinasc'6 times.
['rinascimento', 'rinasceva', 'rinascere', 'rinascita', 'rinascita', 'rinascita']
GIAMBULLARI mentioned 'disegn'5 times.
 Relative to textlength:0.002508780732563974
Giambullari mentioned 'rinasc'0 times.
[]
VCOPY mentioned 'disegn'23 times.
 Relative to textlength:0.0017074981440237565
Vcopy mentioned 'rinasc'0 times.
[]


In [88]:
storage = {}
for i in x['author_tokens']:
    disegno = []
    for t in x['author_tokens'][i]:
        if t.startswith('disegno'):
            disegno.append(t)
        else:
            continue
    
    storage[i] = disegno    
    print(i, len(disegno),  '\n')

Vasari 15 

Borghini 62 

Bartoli 3 

unkown 948 

Giambullari 3 

Vcopy 1 

