In [27]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [28]:
with open('stopwords.txt', 'r') as f:
    stopwords = f.read().splitlines()
print(stopwords)

['\ufeff.', ',', ';', ':', '/', '?', '!', 'à', 'puis', 'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'être', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'avoir', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 

In [51]:
vectorizer = CountVectorizer(input='filename', stop_words=stopwords)
dfTransformer = TfidfTransformer()
subLinearTransformer = TfidfTransformer(sublinear_tf=True)
limitedVectorizer = TfidfVectorizer(input='filename', stop_words=stopwords, max_features=1000)

In [147]:
filenames = ['chaps-lastChaps_adj_rest.txt','chaps-All_but_last_adj_rest.txt']

In [148]:
colLabels = [fn[6:-4] for fn in filenames]
colLabels

['lastChaps_adj_rest', 'All_but_last_adj_rest']

In [149]:
docTermMatrix = vectorizer.fit_transform(filenames)

In [150]:
tfidfMatrix = dfTransformer.fit_transform(docTermMatrix)

In [151]:
subTfidfMatrix = subLinearTransformer.fit_transform(docTermMatrix)

In [152]:
limTfidfMatrix = limitedVectorizer.fit_transform(filenames)

In [153]:
rowLabels = vectorizer.get_feature_names()

In [154]:
df = pd.DataFrame(docTermMatrix.todense().T, columns=colLabels, index=rowLabels)

In [155]:
dfTfidf = pd.DataFrame(tfidfMatrix.todense().T, columns=colLabels, index=rowLabels)

In [156]:
dfSubTfidf = pd.DataFrame(subTfidfMatrix.todense().T, columns=colLabels, index=rowLabels)

In [157]:
def addDistinctivenessScores(df): 
    df['firstDistinct'] = df['lastChaps_adj_rest'] - df['All_but_last_adj_rest']
    df['lastDistinct'] = df['All_but_last_adj_rest'] - df['lastChaps_adj_rest']

In [158]:
for dfI in [df, dfTfidf, dfSubTfidf]: 
    addDistinctivenessScores(dfI)

# Raw Counts

In [69]:
df.sort_values('firstDistinct', ascending=False).head(15)

Unnamed: 0,firstChaps_sub_rest,All_but_first_sub_rest,firstDistinct,lastDistinct
reubeu,38,4,34,-34
cyclope,86,67,19,-19
grappa,17,6,11,-11
scopolamine,7,0,7,-7
plectre,7,1,6,-6
métaphysicien,17,11,6,-6
géode,14,10,4,-4
contrevérité,6,2,4,-4
naturalisme,12,8,4,-4
gyroscope,8,5,3,-3


In [70]:
df.sort_values('lastDistinct', ascending=False).head(15)

Unnamed: 0,firstChaps_sub_rest,All_but_first_sub_rest,firstDistinct,lastDistinct
monsieur,4478,99032,-94554,94554
homme,5449,93689,-88240,88240
peu,4702,71489,-66787,66787
jour,3983,67408,-63425,63425
main,3858,66098,-62240,62240
oeil,4218,64852,-60634,60634
femme,3989,62347,-58358,58358
heure,3412,59835,-56423,56423
temps,3488,55553,-52065,52065
chose,3017,52618,-49601,49601


# Using TFIDF

In [71]:
dfTfidf.sort_values('firstDistinct', ascending=False).head(n=20)

Unnamed: 0,firstChaps_sub_rest,All_but_first_sub_rest,firstDistinct,lastDistinct
an,0.120705,0.080938,0.039768,-0.039768
année,0.061332,0.041365,0.019967,-0.019967
mère,0.115435,0.095666,0.019769,-0.019769
peu,0.219303,0.200213,0.01909,-0.01909
père,0.128214,0.11273,0.015484,-0.015484
oeil,0.196729,0.181625,0.015104,-0.015104
enfant,0.127188,0.114744,0.012444,-0.012444
monde,0.10606,0.094134,0.011926,-0.011926
âge,0.03596,0.02404,0.011919,-0.011919
maison,0.100883,0.089135,0.011748,-0.011748


In [72]:
dfTfidf.sort_values('lastDistinct', ascending=False).head(n=20)

Unnamed: 0,firstChaps_sub_rest,All_but_first_sub_rest,firstDistinct,lastDistinct
monsieur,0.208855,0.27735,-0.068495,0.068495
madame,0.077283,0.105922,-0.028639,0.028639
roi,0.029803,0.058205,-0.028402,0.028402
coeur,0.071266,0.089785,-0.018518,0.018518
ami,0.072106,0.090286,-0.01818,0.01818
lettre,0.032228,0.050296,-0.018068,0.018068
comte,0.024346,0.041359,-0.017013,0.017013
amour,0.046734,0.061574,-0.014841,0.014841
moment,0.091975,0.106143,-0.014168,0.014168
reine,0.009468,0.022853,-0.013385,0.013385


# TFIDF Limited to Top 1000 MFW

In [159]:
rowLabels_limited = limitedVectorizer.get_feature_names()

In [160]:
dfLimTfidf = pd.DataFrame(limTfidfMatrix.todense().T, columns=colLabels, index=rowLabels_limited)

In [161]:
addDistinctivenessScores(dfLimTfidf)

In [162]:
dfLimTfidf.sort_values('firstDistinct', ascending=False).head(n=20)

Unnamed: 0,lastChaps_adj_rest,All_but_last_adj_rest,firstDistinct,lastDistinct
dernier,0.172279,0.13009,0.04219,-0.04219
blanc,0.134705,0.106257,0.028447,-0.028447
seul,0.246114,0.227638,0.018476,-0.018476
long,0.161195,0.143014,0.018181,-0.018181
vide,0.056832,0.04035,0.016482,-0.016482
noir,0.136865,0.122643,0.014223,-0.014223
humain,0.048753,0.034532,0.014221,-0.014221
autre,0.323517,0.310964,0.012553,-0.012553
nu,0.045653,0.034293,0.011361,-0.011361
vivant,0.04152,0.031101,0.010419,-0.010419


In [163]:
dfLimTfidf.sort_values('lastDistinct', ascending=False).head(n=20)

Unnamed: 0,lastChaps_adj_rest,All_but_last_adj_rest,firstDistinct,lastDistinct
jeune,0.196985,0.251786,-0.054801,0.054801
bon,0.235968,0.281069,-0.045101,0.045101
joli,0.032596,0.045002,-0.012406,0.012406
cher,0.074022,0.086243,-0.012221,0.012221
large,0.033535,0.044147,-0.010612,0.010612
certain,0.044902,0.053167,-0.008265,0.008265
pauvre,0.108121,0.116244,-0.008123,0.008123
royal,0.007703,0.015204,-0.007502,0.007502
petit,0.419614,0.426557,-0.006943,0.006943
mauvais,0.05392,0.060751,-0.006832,0.006832
