In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import string

import nltk
from nltk.tokenize import word_tokenize

from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

font_path = 'font.ttf'

### Retrieve and prepare data

In [3]:
raw = pd.read_csv('Input/the_office_lines.csv')
raw = raw.head(30000)
raw.drop(columns=['id','season','episode','scene','deleted'], inplace = True)
raw = raw.drop(raw[raw['speaker'] == 'Creed'].index)
raw_count = raw['speaker'].value_counts()
raw_count = raw_count[raw_count >= 230].index
raw = raw[raw['speaker'].isin(raw_count)]
print(raw['speaker'].value_counts())

Michael     8541
Dwight      3902
Jim         3472
Pam         2782
Andy        1077
Jan          899
Ryan         852
Angela       830
Kevin        707
Oscar        578
Kelly        533
Phyllis      499
Toby         497
Stanley      411
Darryl       341
Karen        340
Holly        305
Meredith     287
Roy          244
Name: speaker, dtype: int64


### Aggregate by speaker

In [4]:
raw_agg = raw.groupby('speaker')['line_text'].agg(lambda x: ' '.join(x))
raw_agg = raw_agg.reset_index()
raw_agg

Unnamed: 0,speaker,line_text
0,Andy,"Hey, Big Tuna! You're single right? She's pre..."
1,Angela,I bet it's gonna be me. Probably gonna be me. ...
2,Darryl,It's not my real name. Darryl Philbin. Then Re...
3,Dwight,[singing] Shall I play for you? Pa rum pump um...
4,Holly,"Hi. Yes, uh, I'm Holly Flax. I was told to ask..."
5,Jan,[on her cell phone] Just before lunch. That wo...
6,Jim,"Oh, I told you. I couldn't close it. So... Act..."
7,Karen,Jim's nice enough. I dont... I don't know how...
8,Kelly,I have a customer meeting. I just had the long...
9,Kevin,"Yeah, it'll be you. I don't wanna be laid off...."


### Labellise the speakers MBTI types and mental functions

In [5]:
# Ajout des colonnes MBTI
def map_personality(speaker):
    if speaker == 'Michael':
        return 'ENFP'
    elif speaker == 'Jim':
        return 'ENTP'
    elif speaker == 'Dwight':
        return 'ESTJ'
    elif speaker == 'Pam':
        return 'ISFJ'
    elif speaker == 'Kevin':
        return 'ISFP'
    elif speaker == 'Ryan':
        return 'ESTP'
    elif speaker == 'Stanley':
        return 'ISTJ'
    elif speaker == 'Andy':
        return 'ESFP'
    elif speaker == 'Creed':
        return 'INTP'
    elif speaker == 'Kelly':
        return 'ESFP'
    elif speaker == 'Angela':
        return 'ISTJ'
    elif speaker == 'Toby':
        return 'ISFJ'
    elif speaker == 'Oscar':
        return 'INTJ'
    elif speaker == 'Meredith':
        return 'ESTP'
    elif speaker == 'Phyllis':
        return 'ISFJ'
    elif speaker == 'Jan':
        return 'ENTJ'
    elif speaker == 'Darryl':
        return 'ISTP'
    elif speaker == 'Roy':
        return 'ESFP'
    elif speaker == 'Katy':
        return 'ESFJ'
    elif speaker == 'Karen':
        return 'ISTJ'
    elif speaker == 'Charles':
        return 'ISTJ'
    elif speaker == 'David':
        return 'INFJ'
    elif speaker == 'Holly':
        return 'INFP'
    elif speaker == 'Carol':
        return 'ISFJ'
    else:
        return None


def map_f1(MBTI):
    if MBTI in ['INFP', 'ISFP']:
        return 'Fi'
    elif MBTI in ['INFJ', 'INTJ']:
        return 'Ni'
    elif MBTI in ['INTP', 'ISTP']:
        return 'Ti'
    elif MBTI in ['ISTJ', 'ISFJ']:
        return 'Si'
    elif MBTI in ['ENFJ', 'ESFJ']:
        return 'Fe'
    elif MBTI in ['ESFP', 'ESTP']:
        return 'Se'
    elif MBTI in ['ENFP', 'ENTP']:
        return 'Ne'
    elif MBTI in ['ESTJ', 'ENTJ']:
        return 'Te'
    else:
        return None

def map_f2(MBTI):
    if MBTI in ['INFP', 'INTP']:
        return 'Ne'
    elif MBTI in ['INFJ', 'ISFJ']:
        return 'Fe'
    elif MBTI in ['ISFP', 'ISTP']:
        return 'Se'
    elif MBTI in ['ISTJ', 'INTJ']:
        return 'Te'
    elif MBTI in ['ESTP', 'ENTP']:
        return 'Ti'
    elif MBTI in ['ESFP', 'ENFP']:
        return 'Fi'
    elif MBTI in ['ESFJ', 'ESTJ']:
        return 'Si'
    elif MBTI in ['ENTJ', 'ENFJ']:
        return 'Ni'
    else:
        return None
    
raw_agg['MBTI'] = raw_agg['speaker'].apply(map_personality)
raw_agg['F1'] = raw_agg['MBTI'].apply(map_f1)
raw_agg['F2'] = raw_agg['MBTI'].apply(map_f2)


MBTI = (raw_agg['MBTI'].value_counts() / len(raw_agg) * 100)
F1 = (raw_agg['F1'].value_counts() / len(raw_agg) * 100)
F2 = (raw_agg['F2'].value_counts() / len(raw_agg) * 100)
Fs_col = pd.concat([raw_agg['F1'], raw_agg['F2']])
Fs = (Fs_col.value_counts() / len(Fs_col) * 100)

nb_i = 0
nb_n = 0
nb_f = 0
nb_p = 0
for types in raw_agg['MBTI']:
    nb_i += types.count('I')
    nb_n += types.count('N')
    nb_f += types.count('F')
    nb_p += types.count('P')
prop_i = 100 * nb_i / len(raw_agg)
prop_n = 100 * nb_n / len(raw_agg)
prop_f = 100 * nb_f / len(raw_agg)
prop_p = 100 * nb_p/ len(raw_agg)

mbti_df = pd.DataFrame({'Type':MBTI.index, 'Type_%': MBTI.values})
F1_df = pd.DataFrame({'F1':F1.index, 'F1_%': F1.values})
F2_df = pd.DataFrame({'F2':MBTI.index, 'F2_%': MBTI.values})
Fs_df = pd.DataFrame({'Fs':MBTI.index, 'Fs_%': MBTI.values})
prop = pd.DataFrame({'Axis': ['I/E', 'N/S', 'F/T', 'P/J'], 'Axis_%': [prop_i, prop_n, prop_f, prop_p]})

# Concatenate DataFrames along columns
meta_data = pd.concat([mbti_df, F1_df, F2_df, Fs_df, prop], axis=1)

meta_data.to_csv('Input/meta_data.csv', index=False)
meta_data.to_csv('Output/meta_data.csv', index=False)
raw_agg.to_csv('Input/raw_agg.csv', index=False)
raw_agg

Unnamed: 0,speaker,line_text,MBTI,F1,F2
0,Andy,"Hey, Big Tuna! You're single right? She's pre...",ESFP,Se,Fi
1,Angela,I bet it's gonna be me. Probably gonna be me. ...,ISTJ,Si,Te
2,Darryl,It's not my real name. Darryl Philbin. Then Re...,ISTP,Ti,Se
3,Dwight,[singing] Shall I play for you? Pa rum pump um...,ESTJ,Te,Si
4,Holly,"Hi. Yes, uh, I'm Holly Flax. I was told to ask...",INFP,Fi,Ne
5,Jan,[on her cell phone] Just before lunch. That wo...,ENTJ,Te,Ni
6,Jim,"Oh, I told you. I couldn't close it. So... Act...",ENTP,Ne,Ti
7,Karen,Jim's nice enough. I dont... I don't know how...,ISTJ,Si,Te
8,Kelly,I have a customer meeting. I just had the long...,ESFP,Se,Fi
9,Kevin,"Yeah, it'll be you. I don't wanna be laid off....",ISFP,Fi,Se


### Bag speeches

In [6]:
bags_agg = raw_agg.copy()

bags_agg['line_text'] = bags_agg['line_text'].astype(str)

bags_agg['line_text'] = bags_agg['line_text'].str.lower()

def supprimer_ponctuation(texte):
    return ''.join([char for char in texte if char not in string.punctuation])
bags_agg['line_text'] = bags_agg['line_text'].apply(supprimer_ponctuation)

bags_agg['line_token'] = bags_agg['line_text'].apply(word_tokenize)
bags_agg['words_counted'] = bags_agg['line_token'].apply(lambda bag: dict(Counter(bag)))
bags_agg = bags_agg.drop(columns=['line_text', 'line_token'])
bags_agg = bags_agg.reset_index(drop=True)
bags_agg

Unnamed: 0,speaker,MBTI,F1,F2,words_counted
0,Andy,ESFP,Se,Fi,"{'hey': 33, 'big': 25, 'tuna': 50, 'youre': 48..."
1,Angela,ISTJ,Si,Te,"{'i': 323, 'bet': 1, 'its': 59, 'gon': 8, 'na'..."
2,Darryl,ISTP,Ti,Se,"{'its': 26, 'not': 23, 'my': 21, 'real': 2, 'n..."
3,Dwight,ESTJ,Te,Si,"{'singing': 9, 'shall': 5, 'i': 1341, 'play': ..."
4,Holly,INFP,Fi,Ne,"{'hi': 3, 'yes': 4, 'uh': 4, 'im': 23, 'holly'..."
5,Jan,ENTJ,Te,Ni,"{'on': 89, 'her': 9, 'cell': 4, 'phone': 24, '..."
6,Jim,ENTP,Ne,Ti,"{'oh': 326, 'i': 1088, 'told': 23, 'you': 1142..."
7,Karen,ISTJ,Si,Te,"{'jims': 6, 'nice': 5, 'enough': 3, 'i': 85, '..."
8,Kelly,ESFP,Se,Fi,"{'i': 288, 'have': 41, 'a': 118, 'customer': 1..."
9,Kevin,ISFP,Fi,Se,"{'yeah': 47, 'itll': 2, 'be': 28, 'you': 179, ..."


### Compare statistics : thematic Bags VS speakers Bags by MBTI (types and mental functions)

In [7]:
#Hypothesis
df_stopwords = pd.read_csv('Input/Bags/stopwords.csv')
stop = df_stopwords['0'].tolist()
df_very = pd.read_csv('Input/Bags/very.csv')
very = df_very['0'].tolist()
df_names = pd.read_csv('Input/Bags/names.csv')
names = df_names['0'].tolist()
df_yes = pd.read_csv('Input/Bags/yes.csv')
yes = df_yes['0'].tolist()
df_no = pd.read_csv('Input/Bags/no.csv')
no = df_no['0'].tolist()
df_me = pd.read_csv('Input/Bags/me.csv')
me = df_me['0'].tolist()
df_you = pd.read_csv('Input/Bags/you.csv')
you = df_you['0'].tolist()

df_emotions = pd.read_csv('Input/Bags/emotions.csv')
emotions = df_emotions['0'].tolist()
df_think = pd.read_csv('Input/Bags/think.csv')
think = df_think['0'].tolist()
df_perception = pd.read_csv('Input/Bags/perception.csv')
perception = df_perception['0'].tolist()
df_judge = pd.read_csv('Input/Bags/judgement.csv')
judge = df_judge['0'].tolist()

df_introv = pd.read_csv('Input/Bags/introv.csv')
introv = df_introv['0'].tolist()
df_extrav = pd.read_csv('Input/Bags/extrav.csv')
extrav = df_extrav['0'].tolist()
df_intui = pd.read_csv('Input/Bags/intui.csv')
intui = df_intui['0'].tolist()
df_sens = pd.read_csv('Input/Bags/sens.csv')
sens = df_sens['0'].tolist()



# Copier le DataFrame
bag_raw = bags_agg.copy()

# Calcul du nombre total de mots par ligne
bag_raw['total_words'] = bag_raw['words_counted'].apply(lambda x: sum(x.values()))

# Initialisation des listes pour stocker les nombres d'occurrences
nombres_very = []
nombres_names = []
nombres_me = []
nombres_you = []
nombres_yes = []
nombres_no = []
nombres_stop = []
nombres_emotions = []
nombres_think = []
nombres_perception = []
nombres_judge = []
nombres_introv = []
nombres_extrav = []
nombres_intui = []
nombres_sens = []

# Parcourir chaque locuteur
for index, row in bag_raw.iterrows():
    # Initialisation des compteurs pour ce locuteur
    total_very = 1
    total_names = 1
    total_me = 1
    total_you = 1
    total_yes = 1
    total_no = 1
    total_stop = 1
    total_emotions = 1
    total_think = 1
    total_perception = 1
    total_judge = 1
    total_introv = 1
    total_extrav = 1
    total_intui = 1
    total_sens = 1
    
    
    # Compter les occurrences des mots pour ce locuteur
    for word, count in row['words_counted'].items():
        if word in very:
            total_very += count
        if word in names:
            total_names += count
        if word in stop:
            total_stop += count
        if word in me:
            total_me += count
        if word in you:
            total_you += count
        if word in yes:
            total_yes += count
        if word in no:
            total_no += count
        
        if word in emotions:
            total_emotions += count
        if word in think:
            total_think += count
        if word in perception:
            total_perception += count
        if word in judge:
            total_judge += count
        if word in introv:
            total_introv += count
        if word in extrav:
            total_extrav += count
        if word in intui:
            total_intui += count
        if word in sens:
            total_sens += count
        
    
    # Ajouter les nombres d'occurrences à la liste respective
    nombres_very.append(total_very)
    nombres_names.append(total_names)
    nombres_stop.append(total_stop)
    nombres_me.append(total_me)
    nombres_you.append(total_you)
    nombres_yes.append(total_yes)
    nombres_no.append(total_no)
    nombres_emotions.append(total_emotions)
    nombres_think.append(total_think)
    nombres_perception.append(total_perception)
    nombres_judge.append(total_judge)
    nombres_introv.append(total_introv)
    nombres_extrav.append(total_extrav)
    nombres_intui.append(total_intui)
    nombres_sens.append(total_sens)
    
# Ajouter les colonnes de nombres d'occurrences au DataFrame bag_raw
bag_raw['Nb Very'] = nombres_very
bag_raw['Nb Names'] = nombres_names
#bag_raw['Nb Stop'] = nombres_stop
bag_raw['Nb Me'] = nombres_me
bag_raw['Nb You'] = nombres_you
bag_raw['Nb Yes'] = nombres_yes
bag_raw['Nb No'] = nombres_no
bag_raw['Nb Emotions'] = nombres_emotions
bag_raw['Nb Think'] = nombres_think
bag_raw['Nb Perception'] = nombres_perception
bag_raw['Nb Judge'] = nombres_judge
bag_raw['Nb Introv'] = nombres_introv
bag_raw['Nb Extrav'] = nombres_extrav
bag_raw['Nb Intui'] = nombres_intui
bag_raw['Nb Sens'] = nombres_sens



# Calcul des pourcentages de yes et no par rapport au nombre total de mots
bag_raw['Superlat'] = (bag_raw['Nb Very'] / bag_raw['total_words']) * 100
bag_raw['Names'] = (bag_raw['Nb Names'] / bag_raw['total_words']) * 100
#bag_raw['Stop'] = (bag_raw['Nb Stop'] / bag_raw['total_words']) * 100
bag_raw['No/Yes'] = bag_raw['Nb No'] / (bag_raw['Nb No'] + bag_raw['Nb Yes']) * 100
bag_raw['Me/You'] = bag_raw['Nb Me'] / (bag_raw['Nb Me'] + bag_raw['Nb You']) * 100
bag_raw['Bags I/E'] = bag_raw['Nb Introv'] / (bag_raw['Nb Introv'] + bag_raw['Nb Extrav']) * 100
bag_raw['Bags N/S'] = bag_raw['Nb Intui'] / (bag_raw['Nb Intui'] + bag_raw['Nb Sens']) * 100
bag_raw['Bags F/T'] = bag_raw['Nb Emotions'] / (bag_raw['Nb Emotions'] + bag_raw['Nb Think']) * 100
bag_raw['Bags P/J'] = bag_raw['Nb Perception'] / (bag_raw['Nb Perception'] + bag_raw['Nb Judge']) * 100


# Supprimer les colonnes de nombres d'occurrences non nécessaires
bag_raw = bag_raw.drop(columns=['Nb Me', 'Nb You', 'Nb Yes', 'Nb No','Nb Very','total_words', 'Nb Names'])
bag_raw = bag_raw.drop(columns=['Nb Perception','Nb Judge','Nb Emotions','Nb Think','Nb Intui','Nb Sens'])
bag_raw.to_csv('Output/bag_statistics.csv', index=False)
bag_raw.to_csv('Input/bag_statistics.csv', index=False)
bag_raw.sort_values(by='Bags I/E', ascending = False)
bag_raw

Unnamed: 0,speaker,MBTI,F1,F2,words_counted,Nb Introv,Nb Extrav,Superlat,Names,No/Yes,Me/You,Bags I/E,Bags N/S,Bags F/T,Bags P/J
0,Andy,ESFP,Se,Fi,"{'hey': 33, 'big': 25, 'tuna': 50, 'youre': 48...",49,90,1.145096,1.340967,48.258706,33.668342,35.251799,33.333333,78.571429,68.148148
1,Angela,ISTJ,Si,Te,"{'i': 323, 'bet': 1, 'its': 59, 'gon': 8, 'na'...",38,87,0.665161,2.171185,63.255034,22.888889,30.4,50.0,75.862069,59.770115
2,Darryl,ISTP,Ti,Se,"{'its': 26, 'not': 23, 'my': 21, 'real': 2, 'n...",12,17,0.505732,0.74174,62.5,20.0,41.37931,40.0,92.857143,71.428571
3,Dwight,ESTJ,Te,Si,"{'singing': 9, 'shall': 5, 'i': 1341, 'play': ...",106,208,0.885321,1.511468,53.140265,29.886982,33.757962,40.298507,75.598086,69.69697
4,Holly,INFP,Fi,Ne,"{'hi': 3, 'yes': 4, 'uh': 4, 'im': 23, 'holly'...",13,19,1.43457,1.154654,52.674897,24.637681,40.625,71.428571,72.727273,80.434783
5,Jan,ENTJ,Te,Ni,"{'on': 89, 'her': 9, 'cell': 4, 'phone': 24, '...",44,84,1.166937,0.929227,51.091703,23.501577,34.375,48.484848,75.0,70.909091
6,Jim,ENTP,Ne,Ti,"{'oh': 326, 'i': 1088, 'told': 23, 'you': 1142...",172,222,1.434514,2.052364,48.411142,22.490185,43.654822,62.5,78.321678,64.069264
7,Karen,ISTJ,Si,Te,"{'jims': 6, 'nice': 5, 'enough': 3, 'i': 85, '...",23,29,1.442469,2.31466,40.096618,29.411765,44.230769,44.444444,76.190476,65.853659
8,Kelly,ESFP,Se,Fi,"{'i': 288, 'have': 41, 'a': 118, 'customer': 1...",29,41,1.283106,2.20431,57.432432,34.375,41.428571,57.142857,88.095238,74.647887
9,Kevin,ISFP,Fi,Se,"{'yeah': 47, 'itll': 2, 'be': 28, 'you': 179, ...",32,42,1.141062,2.53018,53.90625,31.629393,43.243243,40.0,77.272727,75.757576
