In [1]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

In [2]:
# Load the custom stop-word list, one entry per line.
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark, so the first
# entry is read as '.' instead of '\ufeff.'. Naming the encoding explicitly also
# keeps the accented entries independent of the platform's default encoding.
with open('stopwords.txt', 'r', encoding='utf-8-sig') as f:
    stopwords = f.read().splitlines()
print(stopwords)

['\ufeff.', ',', ';', ':', '/', '?', '!', 'à', 'puis', 'au', 'aux', 'avec', 'ce', 'ces', 'dans', 'de', 'des', 'du', 'elle', 'en', 'et', 'eux', 'il', 'je', 'la', 'le', 'leur', 'lui', 'ma', 'mais', 'me', 'même', 'mes', 'moi', 'mon', 'ne', 'nos', 'notre', 'nous', 'on', 'ou', 'par', 'pas', 'pour', 'qu', 'que', 'qui', 'sa', 'se', 'ses', 'son', 'sur', 'ta', 'te', 'tes', 'toi', 'ton', 'tu', 'un', 'une', 'vos', 'votre', 'vous', 'c', 'd', 'j', 'l', 'à', 'm', 'n', 's', 't', 'y', 'été', 'étée', 'étées', 'étés', 'étant', 'suis', 'es', 'est', 'sommes', 'êtes', 'être', 'sont', 'serai', 'seras', 'sera', 'serons', 'serez', 'seront', 'serais', 'serait', 'serions', 'seriez', 'seraient', 'étais', 'était', 'étions', 'étiez', 'étaient', 'fus', 'fut', 'fûmes', 'fûtes', 'furent', 'sois', 'soit', 'soyons', 'soyez', 'soient', 'fusse', 'fusses', 'fût', 'fussions', 'fussiez', 'fussent', 'ayant', 'avoir', 'eu', 'eue', 'eues', 'eus', 'ai', 'as', 'avons', 'avez', 'ont', 'aurai', 'auras', 'aura', 'aurons', 'aurez', 

In [3]:
# Both file-reading vectorizers share the same input mode and stop-word list.
_fileReaderOptions = {'input': 'filename', 'stop_words': stopwords}

# Raw term counts from the files.
vectorizer = CountVectorizer(**_fileReaderOptions)

# Two TF-IDF re-weightings meant to be applied to an existing count matrix:
# default settings, and sublinear term-frequency scaling.
dfTransformer = TfidfTransformer()
subLinearTransformer = TfidfTransformer(sublinear_tf=True)

# Stand-alone TF-IDF vectorizer whose vocabulary is capped at 500 features.
limitedVectorizer = TfidfVectorizer(max_features=500, **_fileReaderOptions)

In [4]:
# Paths of the two input text files handed to the vectorizers
# (which are configured with input='filename', so they open the files themselves).
filenames = ['chaps-firstChaps_restreint.txt','chaps-lastChaps_restreint.txt']

In [5]:
# Column labels: each filename minus its leading 'chaps-' (6 characters)
# and its trailing '.txt' (4 characters).
colLabels = [path[len('chaps-'):-len('.txt')] for path in filenames]
colLabels

['firstChaps_restreint', 'lastChaps_restreint']

In [6]:
# Learn the vocabulary from the files and count term occurrences.
# The result has one row per file and one column per vocabulary term.
docTermMatrix = vectorizer.fit_transform(filenames)

In [7]:
# Re-weight the raw counts with TF-IDF, using the transformer built with default settings.
tfidfMatrix = dfTransformer.fit_transform(docTermMatrix)

In [8]:
# Re-weight the raw counts with TF-IDF, using the transformer built with sublinear_tf=True.
subTfidfMatrix = subLinearTransformer.fit_transform(docTermMatrix)

In [9]:
# TF-IDF computed directly from the files by the vectorizer limited to max_features=500.
# It learns its own vocabulary, so its columns do not line up with docTermMatrix.
limTfidfMatrix = limitedVectorizer.fit_transform(filenames)

In [10]:
# Vocabulary terms in matrix-column order, used as row labels for the tables.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
# list() keeps the result a plain Python list, as the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    rowLabels = list(vectorizer.get_feature_names_out())
else:
    rowLabels = list(vectorizer.get_feature_names())

In [11]:
# Raw counts as a table: one row per vocabulary term, one column per file.
df = pd.DataFrame(
    data=docTermMatrix.toarray().T,
    index=rowLabels,
    columns=colLabels,
)

In [12]:
# TF-IDF weights as a table: one row per vocabulary term, one column per file.
dfTfidf = pd.DataFrame(
    data=tfidfMatrix.toarray().T,
    index=rowLabels,
    columns=colLabels,
)

In [13]:
# Sublinear-TF TF-IDF weights as a table: one row per vocabulary term, one column per file.
dfSubTfidf = pd.DataFrame(
    data=subTfidfMatrix.toarray().T,
    index=rowLabels,
    columns=colLabels,
)

In [14]:
def addDistinctivenessScores(df, firstCol='firstChaps_restreint', lastCol='lastChaps_restreint'):
    """Add two signed-difference columns to ``df``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one score column per text. It is modified in place.
    firstCol, lastCol : str, optional
        Names of the two columns to compare. The defaults are the column
        labels used by this notebook, so existing one-argument calls keep
        working unchanged.

    Returns
    -------
    None
        ``df`` gains (or has overwritten) the columns
        ``firstDistinct`` = ``df[firstCol] - df[lastCol]`` and
        ``lastDistinct`` = ``df[lastCol] - df[firstCol]``.

    Raises
    ------
    KeyError
        If ``firstCol`` or ``lastCol`` is not a column of ``df``.
    """
    # Read both inputs before writing anything, so a missing column leaves df untouched.
    firstScores, lastScores = df[firstCol], df[lastCol]
    df['firstDistinct'] = firstScores - lastScores
    df['lastDistinct'] = lastScores - firstScores

In [15]:
# Add the firstDistinct / lastDistinct columns to each of the three tables.
# The function mutates its argument, so the frames are updated in place.
for dfI in [df, dfTfidf, dfSubTfidf]: 
    addDistinctivenessScores(dfI)

# Raw Counts

In [16]:
# Terms with the largest raw-count surplus in firstChaps_restreint.
# NOTE(review): these are plain count differences, not normalized by text length —
# confirm the two files are of comparable size before reading much into the ranking.
df.sort_values('firstDistinct', ascending=False).head(15)

Unnamed: 0,firstChaps_restreint,lastChaps_restreint,firstDistinct,lastDistinct
faire,21119,15945,5174,-5174
dire,17069,13381,3688,-3688
deux,8369,5724,2645,-2645
bien,10456,7897,2559,-2559
grand,7416,5022,2394,-2394
petit,7604,5249,2355,-2355
pouvoir,10295,8093,2202,-2202
homme,7025,4994,2031,-2031
monsieur,6096,4128,1968,-1968
peu,6027,4267,1760,-1760


In [17]:
# Terms with the largest raw-count surplus in lastChaps_restreint.
# NOTE(review): these are plain count differences, not normalized by text length —
# confirm the two files are of comparable size before reading much into the ranking.
df.sort_values('lastDistinct', ascending=False).head(15)

Unnamed: 0,firstChaps_restreint,lastChaps_restreint,firstDistinct,lastDistinct
mitsou,75,289,-214,214
lion,93,299,-206,206
françoise,50,238,-188,188
julien,92,266,-174,174
lupin,42,192,-150,150
moine,84,232,-148,148
douleur,423,566,-143,143
jacques,401,543,-142,142
tuer,681,808,-127,127
corps,1611,1733,-122,122


# Using TF-IDF

In [18]:
# Top 20 terms whose TF-IDF weight is higher in firstChaps_restreint than in lastChaps_restreint.
dfTfidf.sort_values('firstDistinct', ascending=False).head(n=20)

Unnamed: 0,firstChaps_restreint,lastChaps_restreint,firstDistinct,lastDistinct
répondre,0.060476,0.043248,0.017228,-0.017228
an,0.050703,0.034978,0.015725,-0.015725
jeune,0.067354,0.052456,0.014898,-0.014898
capitaine,0.020516,0.005766,0.01475,-0.01475
deux,0.142489,0.127933,0.014556,-0.014556
grand,0.126263,0.112243,0.01402,-0.01402
petit,0.129464,0.117317,0.012147,-0.012147
monsieur,0.103789,0.092262,0.011527,-0.011527
mère,0.049205,0.037929,0.011276,-0.011276
michel,0.017145,0.006727,0.010418,-0.010418


In [19]:
# Top 20 terms whose TF-IDF weight is higher in lastChaps_restreint than in firstChaps_restreint.
dfTfidf.sort_values('lastDistinct', ascending=False).head(n=20)

Unnamed: 0,firstChaps_restreint,lastChaps_restreint,firstDistinct,lastDistinct
vouloir,0.103023,0.120647,-0.017624,0.017624
aller,0.157506,0.172902,-0.015397,0.015397
mort,0.040811,0.054334,-0.013523,0.013523
corps,0.027429,0.038733,-0.011305,0.011305
vie,0.058875,0.070046,-0.011171,0.011171
voir,0.157029,0.167047,-0.010017,0.010017
maintenant,0.030136,0.040097,-0.009961,0.009961
croire,0.05862,0.068414,-0.009795,0.009795
encore,0.100537,0.109897,-0.009359,0.009359
amour,0.024415,0.03357,-0.009155,0.009155


# TF-IDF Limited to the Top 500 Most Frequent Words (MFW)

In [20]:
# Vocabulary terms of the 500-feature vectorizer, in matrix-column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
# list() keeps the result a plain Python list, as the old method returned.
if hasattr(limitedVectorizer, 'get_feature_names_out'):
    rowLabels_limited = list(limitedVectorizer.get_feature_names_out())
else:
    rowLabels_limited = list(limitedVectorizer.get_feature_names())

In [21]:
# TF-IDF weights from the 500-feature vectorizer as a table:
# one row per retained term, one column per file.
dfLimTfidf = pd.DataFrame(
    data=limTfidfMatrix.toarray().T,
    index=rowLabels_limited,
    columns=colLabels,
)

In [22]:
# Add the firstDistinct / lastDistinct columns to the limited-vocabulary table (in place).
addDistinctivenessScores(dfLimTfidf)

In [23]:
# Top 20 terms (500-feature vocabulary) whose TF-IDF weight is higher in firstChaps_restreint.
dfLimTfidf.sort_values('firstDistinct', ascending=False).head(n=20)

Unnamed: 0,firstChaps_restreint,lastChaps_restreint,firstDistinct,lastDistinct
répondre,0.06215,0.044338,0.017812,-0.017812
an,0.052106,0.03586,0.016246,-0.016246
jeune,0.069218,0.053778,0.01544,-0.01544
deux,0.146433,0.131158,0.015275,-0.015275
capitaine,0.021084,0.005912,0.015172,-0.015172
grand,0.129758,0.115073,0.014686,-0.014686
petit,0.133048,0.120274,0.012774,-0.012774
monsieur,0.106662,0.094588,0.012074,-0.012074
mère,0.050567,0.038885,0.011682,-0.011682
michel,0.01762,0.006897,0.010723,-0.010723


In [24]:
# Top 20 terms (500-feature vocabulary) whose TF-IDF weight is higher in lastChaps_restreint.
dfLimTfidf.sort_values('lastDistinct', ascending=False).head(n=20)

Unnamed: 0,firstChaps_restreint,lastChaps_restreint,firstDistinct,lastDistinct
vouloir,0.105875,0.123688,-0.017813,0.017813
aller,0.161865,0.17726,-0.015395,0.015395
mort,0.04194,0.055703,-0.013763,0.013763
corps,0.028188,0.039709,-0.011522,0.011522
vie,0.060505,0.071811,-0.011307,0.011307
maintenant,0.03097,0.041107,-0.010137,0.010137
croire,0.060242,0.070139,-0.009896,0.009896
voir,0.161375,0.171257,-0.009881,0.009881
encore,0.10332,0.112667,-0.009346,0.009346
amour,0.025091,0.034416,-0.009326,0.009326
