In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly_express as px

In [2]:
# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()

In [3]:
import configparser

# Read local path settings from env.ini in the working directory.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means env.ini was not found.
# Fail here with a clear message instead of a later KeyError on 'data_home'.
if not config.read("env.ini"):
    raise FileNotFoundError(
        "env.ini not found; it must define data_home and output_dir under [DEFAULT]"
    )
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']
# File-name prefix shared by every table this notebook reads and writes.
data_prefix = 'entrepreneur'

In [4]:
# Index levels of the token table, ordered from coarsest to finest.
OHCO = ['screenplay_id', 'scene_id', 'para_num', 'sent_num', 'token_num']

# Each bag is the OHCO prefix that identifies one "document" at that level.
bags = {
    'SENTS': OHCO[:4],
    'PARAS': OHCO[:3],
    'SCENES': OHCO[:2],
    'SCREENPLAYS': OHCO[:1],
}

In [5]:
# Active bag: the key into `bags` that selects which OHCO levels define a
# document for the rest of the notebook (here, paragraphs).
bag = 'PARAS'

In [13]:
# Load the input tables. All three follow the same
# `{output_dir}/{data_prefix}-<TABLE>.csv` naming scheme, which is also the
# path the save cell writes LIB back to.
LIB = pd.read_csv(f'{output_dir}/{data_prefix}-LIB.csv').set_index('screenplay_id')
# dropna() runs after set_index, so it discards rows that have a missing value
# in any non-index column.
# NOTE(review): read_csv's default NA parsing turns literal tokens such as
# "null" or "nan" into NaN, so those words would be dropped here — confirm
# whether that is acceptable for this corpus.
TOKEN = pd.read_csv(f'{output_dir}/{data_prefix}-TOKEN.csv').set_index(OHCO).dropna()
VOCAB = pd.read_csv(f'{output_dir}/{data_prefix}-VOCAB_RANKED.csv').set_index('term_str').dropna()

In [15]:
# Number of token rows per screenplay, listed in screenplay_id order.
TOKEN.reset_index().screenplay_id.value_counts().sort_index()

screenplay_id
joy                   18921
steve_jobs            32226
the_big_short         25737
the_founder           19789
the_help              25425
the_social_network    28390
Name: count, dtype: int64

## BOW Table

In [16]:
# Bag of words: occurrences `n` of each term within each document, where a
# document is identified by the OHCO levels of the active bag.
BOW = TOKEN.groupby([*bags[bag], 'term_str'])['term_str'].count().to_frame(name='n')

In [17]:
# Preview the first rows of the bag-of-words table.
BOW.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n
screenplay_id,scene_id,para_num,term_str,Unnamed: 4_level_1
joy,1,0,a,1
joy,1,0,drive,1
joy,1,0,in,1
joy,1,0,its,1
joy,1,0,kitchen,1


## DTM

In [18]:
# Pivot the innermost index level (term_str) into columns: one row per
# document, one column per term, 0 where a term does not occur.
DTCM = BOW['n'].unstack(level=-1, fill_value=0)

In [19]:
# First ten document rows of the count matrix.
DTCM.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,000,031915,05,07,0c,0x0400,1,10,100,1000,...,ﬁwhich,ﬁwont,ﬁyesﬂ,ﬁyour,ﬂ,ﬂcalifornia,ﬂfriendsterﬂ,ﬂi,ﬂis,ﬂwhy
screenplay_id,scene_id,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
joy,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joy,2,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joy,4,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joy,6,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joy,7,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joy,9,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joy,10,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joy,11,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joy,11,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
joy,13,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Column sums: total occurrences of each term across all documents.
DTCM.sum()

term_str
000               1
031915          188
05                4
07                4
0c                1
               ... 
ﬂcalifornia       1
ﬂfriendsterﬂ      1
ﬂi                2
ﬂis               1
ﬂwhy              1
Length: 10400, dtype: int64

In [22]:
# Per-document summary: total tokens, distinct terms, their ratio (pkr), and
# the screenplay title joined in from LIB.
DOC = (
    DTCM.sum(axis=1)
    .to_frame(name='n_tokens')
    .assign(
        n_types=DTCM.astype('bool').sum(axis=1),
        pkr=lambda doc: doc['n_types'] / doc['n_tokens'],
    )
    .join(LIB[['raw_title']])
)

In [23]:
# The 20 documents with the lowest pkr (n_types / n_tokens), rendered with a
# colour gradient on the numeric columns.
DOC.sort_values('pkr').head(20).style.background_gradient(cmap='YlGnBu')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n_tokens,n_types,pkr,raw_title
screenplay_id,scene_id,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
the_social_network,3,1,209,85,0.406699,The Social Network
the_social_network,542,1,56,24,0.428571,The Social Network
steve_jobs,753,1,149,68,0.456376,Steve Jobs
steve_jobs,732,1,220,104,0.472727,Steve Jobs
the_social_network,295,1,165,82,0.49697,The Social Network
the_help,7,1,28,14,0.5,The Help
steve_jobs,442,1,176,88,0.5,Steve Jobs
the_big_short,131,0,2,1,0.5,The Big Short
the_social_network,5,1,181,93,0.513812,The Social Network
steve_jobs,93,1,202,104,0.514851,Steve Jobs


In [24]:
# Echo the active bag so the output records which granularity was used.
print(bag)
# NOTE(review): 'CHAPS' is not a key of `bags` in this notebook, so a bag value
# of 'CHAPS' would already fail at `bags[bag]` when BOW is built. The integer
# label 105 and the "Persuasion" title look like leftovers from a different
# corpus — confirm, then remove or adapt this branch.
if bag == 'CHAPS':
    DOC.loc[105, 'pkr'].plot(title="PKR by Chapter in Persuasion", label='title');

PARAS


# TFIDF

In [25]:
# Weighting options for the TFIDF computation that follows.
tf_method = 'sum'         # one of: sum, max, log, double_norm, raw, binary
tf_norm_k = .5            # constant K intended for the double_norm TF method only
idf_method = 'standard'   # one of: standard, max, smooth
gradient_cmap = 'YlGnBu'  # matplotlib colormap name for tables (e.g. YlGn, GnBu, YlGnBu); see https://matplotlib.org/3.1.0/tutorials/colors/colormaps.html

In [26]:
print('TF method:', tf_method)

# Work on the transpose (terms x documents): dividing by a Series indexed by
# document then broadcasts across the document columns.
term_doc = DTCM.T

if tf_method == 'sum':
    # Relative frequency: count divided by the document's total token count.
    TF = term_doc / term_doc.sum()

elif tf_method == 'max':
    # Count divided by the count of the document's most frequent term.
    TF = term_doc / term_doc.max()

elif tf_method == 'log':
    TF = np.log2(1 + term_doc)

elif tf_method == 'raw':
    TF = term_doc

elif tf_method == 'double_norm':
    # Double normalization K: K + (1 - K) * tf / max_tf. Terms that do not
    # occur in a document stay at 0 so the matrix remains sparse in meaning.
    max_scaled = term_doc / term_doc.max()
    TF = (tf_norm_k + (1 - tf_norm_k) * max_scaled).where(term_doc > 0, 0.0)

elif tf_method == 'binary':
    # 1 if the term occurs in the document at all, else 0.
    TF = term_doc.astype('bool').astype('int')

else:
    # Fail loudly instead of leaving TF undefined or stale from an earlier run.
    raise ValueError(f"Unknown tf_method: {tf_method!r}")

# Back to documents x terms, matching the orientation of DTCM.
TF = TF.T

TF method: sum


In [27]:
# Preview the first rows of the term-frequency matrix.
TF.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,000,031915,05,07,0c,0x0400,1,10,100,1000,...,ﬁwhich,ﬁwont,ﬁyesﬂ,ﬁyour,ﬂ,ﬂcalifornia,ﬂfriendsterﬂ,ﬂi,ﬂis,ﬂwhy
screenplay_id,scene_id,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
joy,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
joy,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
joy,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
joy,6,0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
joy,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Document frequency: the number of documents in which each term has a
# non-zero count.
DF = DTCM.ne(0).sum(axis=0)

In [29]:
# Display the document frequency of each term.
DF

term_str
000               1
031915          188
05                4
07                4
0c                1
               ... 
ﬂcalifornia       1
ﬂfriendsterﬂ      1
ﬂi                2
ﬂis               1
ﬂwhy              1
Length: 10400, dtype: int64

In [30]:
# Number of documents, i.e. rows of the count matrix.
N = len(DTCM.index)

In [31]:
print('IDF method:', idf_method)

# Inverse document frequency per term. N is the number of documents (rows of
# DTCM) and DF the number of documents each term occurs in.
if idf_method == 'standard':
    IDF = np.log2(N / DF)

elif idf_method == 'max':
    # Uses the largest observed document frequency in place of N.
    IDF = np.log2(DF.max() / DF)

elif idf_method == 'smooth':
    # Add-one smoothing of numerator and denominator, then +1 so that no term
    # ends up with a weight of 0.
    IDF = np.log2((1 + N) / (1 + DF)) + 1

else:
    # Fail loudly instead of leaving IDF undefined or stale from an earlier run.
    raise ValueError(f"Unknown idf_method: {idf_method!r}")

IDF method: standard


In [32]:
# Display the inverse document frequency of each term.
IDF

term_str
000             12.068106
031915           4.513518
05              10.068106
07              10.068106
0c              12.068106
                  ...    
ﬂcalifornia     12.068106
ﬂfriendsterﬂ    12.068106
ﬂi              11.068106
ﬂis             12.068106
ﬂwhy            12.068106
Length: 10400, dtype: float64

In [33]:
# Weight each term column of TF by that term's IDF (aligned on term_str).
TFIDF = TF.mul(IDF, axis='columns')

In [34]:
# Display the TFIDF matrix (the rendering of a large frame is truncated).
TFIDF

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,000,031915,05,07,0c,0x0400,1,10,100,1000,...,ﬁwhich,ﬁwont,ﬁyesﬂ,ﬁyour,ﬂ,ﬂcalifornia,ﬂfriendsterﬂ,ﬂi,ﬂis,ﬂwhy
screenplay_id,scene_id,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
joy,1,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
joy,2,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
joy,4,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
joy,6,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.252838,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
joy,7,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
the_social_network,573,1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.038956,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
the_social_network,574,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
the_social_network,574,1,0.0,0.0,0.0,0.044158,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.05293,0.0,0.0,0.0,0.0,0.0,0.0
the_social_network,575,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0


Add df and idf to VOCAB, then tf and tfidf to BOW

In [35]:
# Attach document frequency and IDF to the vocabulary table. The assignment
# aligns on the term_str index, so VOCAB terms absent from DF / IDF get NaN.
VOCAB['df'] = DF
VOCAB['idf'] = IDF

In [36]:
# Reshape the wide TF and TFIDF matrices to long form (one row per
# document/term pair) and align them onto BOW's index; pairs that do not
# occur in BOW are discarded by the alignment.
BOW['tf'] = TF.stack()
BOW['tfidf'] = TFIDF.stack()

In [46]:
# Display BOW with its new tf and tfidf columns.
BOW

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,n,tf,tfidf
screenplay_id,scene_id,para_num,term_str,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
joy,1,0,a,1,0.090909,0.134575
joy,1,0,drive,1,0.090909,0.603804
joy,1,0,in,1,0.090909,0.177906
joy,1,0,its,1,0.090909,0.302422
joy,1,0,kitchen,1,0.090909,0.525700
...,...,...,...,...,...,...
the_social_network,575,1,waits,2,0.040000,0.330430
the_social_network,575,1,we,1,0.020000,0.058725
the_social_network,575,1,world,1,0.020000,0.125735
the_social_network,575,1,youngest,1,0.020000,0.241362


# L2 Norm

In [49]:
from numpy.linalg import norm
from scipy.spatial.distance import pdist
# creates hierarchical clustering

import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
# NOTE(review): pdist, sch and plt are not used in this cell; presumably they
# are needed by clustering code elsewhere — confirm, and consider moving all
# imports to the first cell of the notebook.

# Three per-document (row-wise) normalizations of the TFIDF matrix, computed
# with vectorized operations rather than a Python-level apply over every row.
# Rows whose weights are all zero divide by zero and come out as NaN.
L0 = TFIDF.astype('bool').astype('int')                  # Binary (Pseudo L): 1 where the weight is non-zero
L1 = TFIDF.div(TFIDF.sum(axis=1), axis=0)                # Probabilistic: each row sums to 1
L2 = TFIDF.div(norm(TFIDF.to_numpy(), axis=1), axis=0)   # Pythagorean, AKA Euclidean: each row has unit length

In [53]:
# Display the Euclidean-normalized TFIDF matrix.
L2

Unnamed: 0_level_0,Unnamed: 1_level_0,term_str,000,031915,05,07,0c,0x0400,1,10,100,1000,...,ﬁwhich,ﬁwont,ﬁyesﬂ,ﬁyour,ﬂ,ﬂcalifornia,ﬂfriendsterﬂ,ﬂi,ﬂis,ﬂwhy
screenplay_id,scene_id,para_num,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
joy,1,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
joy,2,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
joy,4,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
joy,6,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.180611,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
joy,7,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
the_social_network,573,1,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.053056,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
the_social_network,574,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
the_social_network,574,1,0.0,0.0,0.0,0.080651,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.096672,0.0,0.0,0.0,0.0,0.0,0.0
the_social_network,575,0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


The matrix contains 10,400 distinct terms (columns), with one row per paragraph.

In [47]:
# Persist every table under the shared `{output_dir}/{data_prefix}-` prefix.
# LIB carries no bag suffix; the other tables are tagged with the active bag.
save_path = f"{output_dir}/{data_prefix}"
LIB.to_csv(f'{save_path}-LIB.csv')
for table_name, table in [
    ('VOCAB', VOCAB),
    ('BOW', BOW),
    ('DTCM', DTCM),
    ('DOC', DOC),
    ('TFIDF', TFIDF),
]:
    table.to_csv(f"{save_path}-{table_name}-{bag}.csv")