In [26]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [27]:
# Load the custom stop-word list, one entry per line.
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark when present;
# with the platform default encoding the BOM stays glued to the first entry
# (it showed up as '\ufeff.' in the printed list).
with open('stopwords.txt', 'r', encoding='utf-8-sig') as f:
    stopwords = f.read().splitlines()
print(stopwords)

['\ufeff.', ',', ';', ':', '/', '?', '!', 'à', 'puis', 'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'être', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'avoir', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 

In [28]:
# Raw term counts. input='filename' means the items passed to fit/transform
# are paths that the vectorizer opens and reads itself.
vectorizer = CountVectorizer(
    stop_words=stopwords,
    input='filename',
)

# Two TF-IDF re-weightings meant to be applied to the count matrix:
# default settings, and the same with sublinear_tf=True.
dfTransformer = TfidfTransformer()
subLinearTransformer = TfidfTransformer(sublinear_tf=True)

# One-step counting + TF-IDF, with the vocabulary capped at 1000 features.
limitedVectorizer = TfidfVectorizer(
    stop_words=stopwords,
    input='filename',
    max_features=1000,
)

In [29]:
# The two corpus files to compare. Order matters: column labels are derived
# from this list in the same order.
filenames = [
    'chaps-first_sub_glob.txt',
    'chaps-All-but-first_sub_glob.txt',
]

In [30]:
# Column labels: each file name with its first len('chaps-') characters and
# its last len('.txt') characters sliced off.
colLabels = [name[len('chaps-'):-len('.txt')] for name in filenames]
colLabels

['first_sub_glob', 'All-but-first_sub_glob']

In [31]:
# Learn the vocabulary from the files and count terms: the result has one row
# per entry in filenames and one column per vocabulary term.
docTermMatrix = vectorizer.fit_transform(filenames)

In [32]:
# Re-weight the raw counts with TF-IDF (transformer built with default settings).
tfidfMatrix = dfTransformer.fit_transform(docTermMatrix)

In [33]:
# Same count matrix, re-weighted by the transformer built with sublinear_tf=True.
subTfidfMatrix = subLinearTransformer.fit_transform(docTermMatrix)

In [34]:
# Count and TF-IDF-weight the files in one step with the max_features=1000 vectorizer.
# Note this reads the files from disk again, independently of docTermMatrix.
limTfidfMatrix = limitedVectorizer.fit_transform(filenames)

In [35]:
# Vocabulary terms, in matrix column order, used as row labels below.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps rowLabels a plain list, as it was before.
if hasattr(vectorizer, 'get_feature_names_out'):
    rowLabels = list(vectorizer.get_feature_names_out())
else:
    rowLabels = vectorizer.get_feature_names()

In [36]:
# Raw counts as a frame. The matrix is transposed so that terms are rows and
# the two files are columns.
df = pd.DataFrame(
    data=docTermMatrix.toarray().T,
    index=rowLabels,
    columns=colLabels,
)

In [37]:
# TF-IDF weights as a frame, transposed so terms are rows and files are columns.
dfTfidf = pd.DataFrame(
    data=tfidfMatrix.toarray().T,
    index=rowLabels,
    columns=colLabels,
)

In [38]:
# Sublinear TF-IDF weights as a frame, transposed so terms are rows and files
# are columns.
dfSubTfidf = pd.DataFrame(
    data=subTfidfMatrix.toarray().T,
    index=rowLabels,
    columns=colLabels,
)

In [39]:
def addDistinctivenessScores(df, firstCol='first_sub_glob', restCol='All-but-first_sub_glob'):
    """Add two difference columns to ``df`` in place.

    ``firstDistinct`` is ``df[firstCol] - df[restCol]`` and ``lastDistinct``
    is the opposite difference, so the two columns are negatives of each
    other. Sorting descending on either one ranks terms by how much more
    weight they carry in one column than in the other.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one score column per document. Modified in place.
    firstCol, restCol : str, optional
        Names of the two columns to compare. The defaults match the column
        labels this notebook derives from its input file names; pass other
        names to reuse the function on a different pair of documents.

    Returns
    -------
    None
        Deliberately nothing, so a bare call at the end of a cell does not
        render the whole frame.

    Raises
    ------
    KeyError
        If either column is absent from ``df``.
    """
    missing = [col for col in (firstCol, restCol) if col not in df.columns]
    if missing:
        raise KeyError(
            'addDistinctivenessScores: column(s) not in frame: {}'.format(missing)
        )
    df['firstDistinct'] = df[firstCol] - df[restCol]
    df['lastDistinct'] = df[restCol] - df[firstCol]

In [40]:
# Attach firstDistinct / lastDistinct to each of the three weighting variants.
# The helper mutates each frame in place, so nothing is reassigned here.
for dfI in (df, dfTfidf, dfSubTfidf):
    addDistinctivenessScores(dfI)

# Raw Counts

In [41]:
# Top 15 terms by raw count in 'first_sub_glob' minus raw count in
# 'All-but-first_sub_glob'.
(
    df
    .sort_values(by='firstDistinct', ascending=False)
    .head(n=15)
)

Unnamed: 0,first_sub_glob,All-but-first_sub_glob,firstDistinct,lastDistinct
orque,32,8,24,-24
miton,18,4,14,-14
troll,24,11,13,-13
bernache,20,8,12,-12
martelet,12,1,11,-11
turgotine,12,2,10,-10
grappa,17,10,7,-7
scopolamine,7,1,6,-6
doryphore,23,19,4,-4
plectre,7,3,4,-4


In [42]:
# Top 15 terms by raw count in 'All-but-first_sub_glob' minus raw count in
# 'first_sub_glob'.
(
    df
    .sort_values(by='lastDistinct', ascending=False)
    .head(n=15)
)

Unnamed: 0,first_sub_glob,All-but-first_sub_glob,firstDistinct,lastDistinct
monsieur,16663,345265,-328602,328602
homme,17677,271319,-253642,253642
jour,11861,192896,-181035,181035
main,11669,192016,-180347,180347
peu,13115,191129,-178014,178014
femme,13028,190986,-177958,177958
oeil,12394,184050,-171656,171656
heure,11212,179952,-168740,168740
temps,10315,153584,-143269,143269
fois,8843,143654,-134811,134811


# Using TF-IDF

In [19]:
# Top 20 terms by TF-IDF weight difference, first column minus second.
# NOTE(review): the saved output for this cell and the cells after it appears
# to come from an earlier run (out-of-order execution counts, different column
# names) - TODO re-run the notebook top to bottom to refresh it.
(
    dfTfidf
    .sort_values(by='firstDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,lastChaps_adj_rest,All_but_last_adj_rest,firstDistinct,lastDistinct
dernier,0.1721,0.129992,0.042108,-0.042108
blanc,0.134565,0.106178,0.028387,-0.028387
seul,0.245858,0.227467,0.018391,-0.018391
long,0.161027,0.142906,0.018121,-0.018121
vide,0.056772,0.04032,0.016453,-0.016453
humain,0.048702,0.034506,0.014196,-0.014196
noir,0.136723,0.122551,0.014172,-0.014172
autre,0.323181,0.310731,0.01245,-0.01245
nu,0.045606,0.034267,0.011339,-0.011339
vivant,0.041477,0.031078,0.010399,-0.010399


In [20]:
# Top 20 terms by TF-IDF weight difference, second column minus first.
(
    dfTfidf
    .sort_values(by='lastDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,lastChaps_adj_rest,All_but_last_adj_rest,firstDistinct,lastDistinct
jeune,0.19678,0.251597,-0.054817,0.054817
bon,0.235723,0.280859,-0.045136,0.045136
joli,0.032562,0.044968,-0.012406,0.012406
cher,0.073945,0.086178,-0.012233,0.012233
large,0.0335,0.044114,-0.010614,0.010614
certain,0.044855,0.053127,-0.008272,0.008272
pauvre,0.108008,0.116157,-0.008149,0.008149
royal,0.007695,0.015193,-0.007498,0.007498
petit,0.419178,0.426237,-0.007059,0.007059
mauvais,0.053863,0.060706,-0.006842,0.006842


# TF-IDF Limited to the Top 1000 Most Frequent Words (MFW)

In [21]:
# Terms kept by the max_features=1000 vectorizer, in matrix column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
# list(...) keeps rowLabels_limited a plain list, as it was before.
if hasattr(limitedVectorizer, 'get_feature_names_out'):
    rowLabels_limited = list(limitedVectorizer.get_feature_names_out())
else:
    rowLabels_limited = limitedVectorizer.get_feature_names()

In [22]:
# Limited-vocabulary TF-IDF weights as a frame, transposed so terms are rows
# and files are columns.
dfLimTfidf = pd.DataFrame(
    data=limTfidfMatrix.toarray().T,
    index=rowLabels_limited,
    columns=colLabels,
)

In [23]:
# Adds the firstDistinct / lastDistinct columns to dfLimTfidf in place.
addDistinctivenessScores(dfLimTfidf)

In [24]:
# Top 20 terms of the limited vocabulary by TF-IDF weight difference, first
# column minus second.
(
    dfLimTfidf
    .sort_values(by='firstDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,lastChaps_adj_rest,All_but_last_adj_rest,firstDistinct,lastDistinct
dernier,0.172279,0.13009,0.04219,-0.04219
blanc,0.134705,0.106257,0.028447,-0.028447
seul,0.246114,0.227638,0.018476,-0.018476
long,0.161195,0.143014,0.018181,-0.018181
vide,0.056832,0.04035,0.016482,-0.016482
noir,0.136865,0.122643,0.014223,-0.014223
humain,0.048753,0.034532,0.014221,-0.014221
autre,0.323517,0.310964,0.012553,-0.012553
nu,0.045653,0.034293,0.011361,-0.011361
vivant,0.04152,0.031101,0.010419,-0.010419


In [25]:
# Top 20 terms of the limited vocabulary by TF-IDF weight difference, second
# column minus first.
(
    dfLimTfidf
    .sort_values(by='lastDistinct', ascending=False)
    .head(n=20)
)

Unnamed: 0,lastChaps_adj_rest,All_but_last_adj_rest,firstDistinct,lastDistinct
jeune,0.196985,0.251786,-0.054801,0.054801
bon,0.235968,0.281069,-0.045101,0.045101
joli,0.032596,0.045002,-0.012406,0.012406
cher,0.074022,0.086243,-0.012221,0.012221
large,0.033535,0.044147,-0.010612,0.010612
certain,0.044902,0.053167,-0.008265,0.008265
pauvre,0.108121,0.116244,-0.008123,0.008123
royal,0.007703,0.015204,-0.007502,0.007502
petit,0.419614,0.426557,-0.006943,0.006943
mauvais,0.05392,0.060751,-0.006832,0.006832
