In [1]:
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import itertools as it
from statsmodels.sandbox.stats.multicomp import multipletests
import sklearn.preprocessing as pp
import statsmodels.api as sm
#import nltk
import scipy.stats as st
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
import seaborn as sns
import time
import Helper as hp

#Load Data
data = pd.read_csv("CHI_2019_FULL.csv")

#Set Labels
emotion_label = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Emo_Neutral', 'Sadness']
affect_label = ['Aggressiv', 'Cheerful', 'Intoxicated', 'Nervous', 'Aff_Neutral', 'Tired']
loi_label = ['Disinterest', 'Normal', 'High Interest']

#Get specific data and save it into new data frames (one per feature group, each keeping the filename)
# Each slice is an explicit deep copy, so it is independent of `data`. This matters because we add a
# Char_ID column to every frame below; writing to a plain slice would raise a SettingWithCopyWarning.
df_emotion = data[emotion_label + ['Filename']].copy(deep=True)
df_affect = data[affect_label + ['Filename']].copy(deep=True)
df_loi = data[loi_label + ['Filename']].copy(deep=True)
df_ar_val = data[['Arousal', 'Valence', 'Filename']].copy(deep=True)

#For further usage, we want to append the CharacterID as a column, which is saved with other information in the filename
#Since we only want the digits, we remove all non-digit characters of the filename column and append the result to the df
# The pattern is a raw string: '\D' in a normal string literal is an invalid escape sequence, which Python
# reports as a DeprecationWarning/SyntaxWarning. The values stay strings here; they are cast to int later.
for frame in (df_emotion, df_affect, df_loi, df_ar_val):
    frame['Char_ID'] = frame['Filename'].replace(r'\D+', '', regex=True)

#Load data about speakers and recode the values 'Male'/'Female' as 0.0/1.0 wherever they occur in the table
char_data = pd.read_csv("CHI_2019_CharacterData.csv")
char_data = char_data.replace({'Male': 0.0, 'Female': 1.0})

#Join above tables and Character Tables
#Both join columns must have the same data type, so the speaker ID and every Char_ID column are cast to int
char_data['ID'] = char_data['ID'].astype(int)
for frame in (df_ar_val, df_emotion, df_affect, df_loi):
    frame['Char_ID'] = frame['Char_ID'].astype(int)


def _with_speaker_info(frame):
    """Left-join the speaker table onto `frame`, matching frame.Char_ID against char_data.ID."""
    return pd.merge(frame, char_data, how='left', left_on='Char_ID', right_on='ID')


#Save new data frames
df_ar_val_char = _with_speaker_info(df_ar_val)
df_emotion_char = _with_speaker_info(df_emotion)
df_affect_char = _with_speaker_info(df_affect)
df_loi_char = _with_speaker_info(df_loi)

#Now, we only want to have data containing information about the answers
#For that we need to extract from the filename column, whether the file was part of an answer
#a = answer, p = presentation, q = question
def _sentence_type(filenames):
    """Derive the sentence-type code from a Series of filenames.

    All digit runs are removed first; what remains between the first three and the last four
    characters is returned (slice [3:-4]).

    regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` treats the pattern as a
    literal string by default, in which case no digit would be removed and the slice would be wrong.
    """
    return filenames.str.replace(r'\d+', '', regex=True).str[3:-4]


#sentence_type should be the same for all tables, but just to be sure we derive it per table
arval_sentence_type = _sentence_type(df_ar_val_char.Filename)
df_ar_val_char['SentenceType'] = arval_sentence_type
emo_sentence_type = _sentence_type(df_emotion_char.Filename)
df_emotion_char['SentenceType'] = emo_sentence_type
aff_sentence_type = _sentence_type(df_affect_char.Filename)
df_affect_char['SentenceType'] = aff_sentence_type
loi_sentence_type = _sentence_type(df_loi_char.Filename)
df_loi_char['SentenceType'] = loi_sentence_type

#For affect, the intoxication column is dropped; the remaining affect values are then re-normalized
#row-wise with the L1 norm
affect_label.remove('Intoxicated')
df_affect_char = df_affect_char.drop(columns=['Intoxicated'])
df_affect_char[affect_label] = pp.normalize(df_affect_char[affect_label], norm='l1')

#For level of interest, 'Disinterest' and 'Normal' are summed into a single 'Normal Interest' column,
#which replaces the two original columns
normal_interest = df_loi_char['Disinterest'].add(df_loi_char['Normal'])
df_loi_char = df_loi_char.assign(**{'Normal Interest': normal_interest}).drop(columns=['Disinterest', 'Normal'])
loi_label = ['Normal Interest', 'High Interest']

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


In [2]:
def _rows_of_type(frame, sentence_type):
    """Return an independent copy of the rows of `frame` whose SentenceType equals `sentence_type`."""
    return frame.loc[frame['SentenceType'] == sentence_type].copy()


#Rows that belong to answers (SentenceType == 'a')
df_ar_val_char_a = _rows_of_type(df_ar_val_char, 'a')
df_emotion_char_a = _rows_of_type(df_emotion_char, 'a')
df_affect_char_a = _rows_of_type(df_affect_char, 'a')
df_loi_char_a = _rows_of_type(df_loi_char, 'a')

#Rows that belong to presentations (SentenceType == 'p')
df_ar_val_char_p = _rows_of_type(df_ar_val_char, 'p')
df_emotion_char_p = _rows_of_type(df_emotion_char, 'p')
df_affect_char_p = _rows_of_type(df_affect_char, 'p')
df_loi_char_p = _rows_of_type(df_loi_char, 'p')

#Rows that belong to questions (SentenceType == 'q')
df_ar_val_char_q = _rows_of_type(df_ar_val_char, 'q')
df_emotion_char_q = _rows_of_type(df_emotion_char, 'q')
df_affect_char_q = _rows_of_type(df_affect_char, 'q')
df_loi_char_q = _rows_of_type(df_loi_char, 'q')

In [3]:
# One entry per presenter. drop_duplicates() returns a new Series, so its result has to be kept;
# otherwise the loop below would redo the aggregation once per presentation row instead of once per speaker.
IDs = df_emotion_char_p['Char_ID'].drop_duplicates()

# Presentation frames paired with the columns that are aggregated in each of them
presentation_frames = [
    (df_emotion_char_p, emotion_label),
    (df_affect_char_p, affect_label),
    (df_loi_char_p, loi_label),
    (df_ar_val_char_p, ['Arousal', 'Valence']),
]

# For presentations we need an aggregate, e.g. median for the presenters
for i in IDs:
    for frame, labels in presentation_frames:
        is_speaker = frame.Char_ID == i
        for l in labels:
            # hp.constructMedianSeries is defined in Helper.py; presumably it returns a series holding the
            # median of its input, one entry per input row -- TODO confirm against the helper
            frame.loc[is_speaker, l] = hp.constructMedianSeries(frame.loc[is_speaker, l])

#Now let's drop the duplicate values, since we only need one row now per person
for frame, _ in presentation_frames:
    frame.drop_duplicates(subset=['ID'], inplace=True)

In [4]:
# Speaker attributes are prefixed together with the feature columns, so they are added to every label list
speaker_columns = ['Sex', 'Academic Status', 'IsNativeSpeaker']
emotion_label.extend(speaker_columns)
affect_label.extend(speaker_columns)
loi_label.extend(speaker_columns)
arval_columns = ['Arousal', 'Valence'] + speaker_columns

# Columns that are removed from every frame after the renaming
bookkeeping_columns = ['Name', 'Char_ID', 'VideoTitle', 'SentenceType', 'Filename']


def _prefix_and_trim(frame, prefix, labels):
    """Prefix every column of `frame` that is listed in `labels` with `prefix` (renaming in place),
    then return a new frame without the bookkeeping columns."""
    frame.columns = frame.columns.map(lambda col: prefix + col if col in labels else col)
    return frame.drop(bookkeeping_columns, axis=1)


# Presentations -> 'P_'
df_emotion_char_p = _prefix_and_trim(df_emotion_char_p, 'P_', emotion_label)
df_affect_char_p = _prefix_and_trim(df_affect_char_p, 'P_', affect_label)
df_loi_char_p = _prefix_and_trim(df_loi_char_p, 'P_', loi_label)
df_ar_val_char_p = _prefix_and_trim(df_ar_val_char_p, 'P_', arval_columns)

# Answers -> 'A_'
df_emotion_char_a = _prefix_and_trim(df_emotion_char_a, 'A_', emotion_label)
df_affect_char_a = _prefix_and_trim(df_affect_char_a, 'A_', affect_label)
df_loi_char_a = _prefix_and_trim(df_loi_char_a, 'A_', loi_label)
df_ar_val_char_a = _prefix_and_trim(df_ar_val_char_a, 'A_', arval_columns)

# Questions -> 'Q_'
df_emotion_char_q = _prefix_and_trim(df_emotion_char_q, 'Q_', emotion_label)
df_affect_char_q = _prefix_and_trim(df_affect_char_q, 'Q_', affect_label)
df_loi_char_q = _prefix_and_trim(df_loi_char_q, 'Q_', loi_label)
df_ar_val_char_q = _prefix_and_trim(df_ar_val_char_q, 'Q_', arval_columns)

In [5]:
#To do: Merge Data Frames; First Question and Presentation, then answer and question
def _attach_presentation(questions, presentations):
    """Left-join the presenter rows onto the question rows via their shared VideoID column."""
    return pd.merge(questions, presentations, how='left', on='VideoID')


ar_val_char = _attach_presentation(df_ar_val_char_q, df_ar_val_char_p)
loi_char = _attach_presentation(df_loi_char_q, df_loi_char_p)
affect_char = _attach_presentation(df_affect_char_q, df_affect_char_p)
emotion_char = _attach_presentation(df_emotion_char_q, df_emotion_char_p)
emotion_char

Unnamed: 0,Q_Anger,Q_Boredom,Q_Disgust,Q_Fear,Q_Happiness,Q_Emo_Neutral,Q_Sadness,ID_x,Q_Sex,Q_Academic Status,...,P_Boredom,P_Disgust,P_Fear,P_Happiness,P_Emo_Neutral,P_Sadness,ID_y,P_Sex,P_Academic Status,P_IsNativeSpeaker
0,0.002253,0.005161,0.028762,0.002877,0.001428,0.000320,0.959199,193,0.0,,...,0.026247,0.225678,0.010919,0.009425,0.003056,0.550885,192,1.0,Grad Student,Europ. Non-Native
1,0.007885,0.004149,0.152280,0.004555,0.003764,0.000246,0.827121,194,1.0,,...,0.026247,0.225678,0.010919,0.009425,0.003056,0.550885,192,1.0,Grad Student,Europ. Non-Native
2,0.003784,0.006456,0.014762,0.002672,0.001569,0.000624,0.970133,195,0.0,,...,0.026247,0.225678,0.010919,0.009425,0.003056,0.550885,192,1.0,Grad Student,Europ. Non-Native
3,0.004143,0.000882,0.151633,0.000632,0.002108,0.000065,0.840536,196,0.0,,...,0.026247,0.225678,0.010919,0.009425,0.003056,0.550885,192,1.0,Grad Student,Europ. Non-Native
4,0.008246,0.043610,0.119756,0.005309,0.007007,0.012636,0.803436,198,0.0,,...,0.016413,0.046260,0.003839,0.004142,0.002316,0.921682,197,1.0,Grad Student,Native Speaker
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,0.001621,0.052537,0.003183,0.011076,0.002419,0.003676,0.925487,460,0.0,,...,0.210241,0.174924,0.037075,0.049350,0.028904,0.413051,457,0.0,Grad Student,Native Speaker
191,0.002563,0.004989,0.018547,0.005339,0.001827,0.000377,0.966359,461,0.0,,...,0.210241,0.174924,0.037075,0.049350,0.028904,0.413051,457,0.0,Grad Student,Native Speaker
192,0.001638,0.009019,0.005033,0.020095,0.002618,0.000631,0.960965,462,0.0,,...,0.210241,0.174924,0.037075,0.049350,0.028904,0.413051,457,0.0,Grad Student,Native Speaker
193,0.000195,0.004939,0.001564,0.000372,0.000364,0.000408,0.992159,463,0.0,,...,0.210241,0.174924,0.037075,0.049350,0.028904,0.413051,457,0.0,Grad Student,Native Speaker


In [6]:
# Columns that are removed from every combined question/presentation/answer frame
unused_columns = ['ID_x', 'ID_y', 'VideoID', 'ID', 'Q_Academic Status', 'Q_IsNativeSpeaker',
                  'A_IsNativeSpeaker', 'A_Academic Status', 'P_Academic Status', 'P_IsNativeSpeaker', 'P_Sex']

# Give the answer and question frames a fresh 0..n-1 index (in place), so that the answers can be
# placed next to the merged question/presentation rows purely by position
for frame in (df_ar_val_char_a, df_ar_val_char_q,
              df_loi_char_a, df_loi_char_q,
              df_affect_char_a, df_affect_char_q,
              df_emotion_char_a, df_emotion_char_q):
    frame.reset_index(drop=True, inplace=True)


def _attach_answers(question_presentation, answers):
    """Put the answer columns next to the question/presentation columns and drop the unused columns."""
    combined = pd.concat([question_presentation, answers], axis=1)
    return combined.drop(unused_columns, axis=1)


arval = _attach_answers(ar_val_char, df_ar_val_char_a)
loi = _attach_answers(loi_char, df_loi_char_a)
affect = _attach_answers(affect_char, df_affect_char_a)
emotion = _attach_answers(emotion_char, df_emotion_char_a)
#Now we have our dataframe ready for logistic regression:
#Each row represents an interaction between one question, the corresponding answer and the median of the presenter
#The positional concatenation relies on both frames (e.g. emotion_char and df_emotion_char_a) being
#sorted by filename by default, so that each question is in the same row as the corresponding answer

#Since we only need the labels of gender once, we keep them in `emotion` and drop them from the other
#frames before concatenating all data frames together for the omnibus model
sex_columns = ['Q_Sex', 'A_Sex']
test_affect = affect.drop(sex_columns, axis=1)
test_loi = loi.drop(sex_columns, axis=1)
test_arval = arval.drop(sex_columns, axis=1)

#Concat all our data together; Since it's sorted in the same way, we can just use pd.concat()
data = pd.concat([emotion, test_affect, test_loi, test_arval], axis=1)

In [7]:
#Only the continuous columns are standardized; the two sex label columns are left as they are
label_columns = ['A_Sex', 'Q_Sex']
test = data.loc[:, ~data.columns.isin(label_columns)]
continuous_columns = test.columns

#Fit a standard scaler on the continuous columns and write the scaled values back into `data`
scaler = pp.StandardScaler()
data[continuous_columns] = scaler.fit_transform(data[continuous_columns])
#data.describe() #Calling this we see that the std is ~1 and mean is very close to 0

In [8]:
# Run the VIF-based feature elimination from Helper.py on the standardized continuous columns.
# NOTE(review): the second argument (5) is presumably the VIF threshold above which a feature is
# dropped -- confirm against hp.calculate_vif.
vif_input = data[test.columns]
X = hp.calculate_vif(vif_input, 5)

48
Dropping 'Q_Aggressiv' at index: 21
47


  vif = 1. / (1. - r_squared_i)


Dropping 'A_Aggressiv' at index: 30
46
Dropping 'A_High Interest' at index: 38
45
Dropping 'P_High Interest' at index: 36
44
Dropping 'Q_High Interest' at index: 34
43
Dropping 'A_Sadness' at index: 20
42
Dropping 'Q_Sadness' at index: 6
41
Dropping 'P_Happiness' at index: 10
40
Dropping 'P_Boredom' at index: 7
39
Dropping 'P_Tired' at index: 25
38
Dropping 'A_Tired' at index: 28
37
Dropping 'P_Sadness' at index: 10
36
Dropping 'A_Boredom' at index: 11
35
Dropping 'A_Happiness' at index: 13
34
Dropping 'Q_Tired' at index: 17
33
Dropping 'Q_Happiness' at index: 4
32
Dropping 'P_Aggressiv' at index: 16
31
Dropping 'Q_Boredom' at index: 1
30
Dropping 'P_Fear' at index: 6
29
Remaining variables:
[['Q_Anger', 'Q_Disgust', 'Q_Fear', 'Q_Emo_Neutral', 'P_Anger', 'P_Disgust', 'P_Emo_Neutral', 'A_Anger', 'A_Disgust', 'A_Fear', 'A_Emo_Neutral', 'Q_Cheerful', 'Q_Nervous', 'Q_Aff_Neutral', 'P_Cheerful', 'P_Nervous', 'P_Aff_Neutral', 'A_Cheerful', 'A_Nervous', 'A_Aff_Neutral', 'Q_Normal Interest', '

In [9]:
# Columns of the reduced data set: everything returned by the VIF step, plus the two sex labels,
# plus the three 'High Interest' columns. The latter were previously discarded by VIF, but we use
# them as reference variable, so we include them in the data, but not in the model.
sex_labels = ['A_Sex', 'Q_Sex']
reference_columns = ['P_High Interest', 'A_High Interest', 'Q_High Interest']
t = list(X.columns) + sex_labels + reference_columns
df_vif2 = data[t]
df_vif2

Unnamed: 0,Q_Anger,Q_Disgust,Q_Fear,Q_Emo_Neutral,P_Anger,P_Disgust,P_Emo_Neutral,A_Anger,A_Disgust,A_Fear,...,Q_Valence,P_Arousal,P_Valence,A_Arousal,A_Valence,A_Sex,Q_Sex,P_High Interest,A_High Interest,Q_High Interest
0,-0.336285,-0.537413,-0.252757,-0.399605,-0.430552,-0.202080,-0.337672,-0.479038,-0.279274,-0.351458,...,-0.109221,1.624868,-1.316756,1.583623,-1.072335,1.0,0.0,1.594492,1.332803,1.646681
1,-0.155717,0.048516,-0.184362,-0.402044,-0.430552,-0.202080,-0.337672,-0.425289,-0.200572,-0.352068,...,-1.026426,1.624868,-1.316756,1.277500,-0.012624,1.0,1.0,1.594492,1.321897,0.140860
2,-0.287199,-0.603825,-0.261113,-0.389589,-0.430552,-0.202080,-0.337672,-0.603274,-0.586883,-0.422374,...,-1.018831,1.624868,-1.316756,2.626954,0.045308,1.0,0.0,1.594492,1.377899,0.828516
3,-0.275690,0.045447,-0.344262,-0.408008,-0.430552,-0.202080,-0.337672,-0.052876,-0.166518,-0.153466,...,0.195554,1.624868,-1.316756,1.411346,-1.638982,1.0,0.0,1.594492,0.920352,0.449306
4,-0.144143,-0.105767,-0.153629,0.006197,-0.450753,-0.912372,-0.386864,-0.384093,-0.512082,-0.298347,...,-0.135704,-1.104231,-0.606353,-0.877265,-0.913776,1.0,0.0,-1.160816,-0.635318,0.698191
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,-0.356547,-0.658752,0.081432,-0.289028,1.819272,-0.403008,1.380592,-0.556303,-0.481645,-0.360483,...,-0.397165,-0.464602,-0.415888,0.361207,-1.354555,0.0,0.0,0.682264,1.303107,1.653794
191,-0.326346,-0.585870,-0.152406,-0.397727,1.819272,-0.403008,1.380592,-0.483753,-0.580928,-0.357739,...,-1.232549,-0.464602,-0.415888,0.923056,-0.712541,0.0,0.0,0.682264,1.320710,1.550033
192,-0.356002,-0.649976,0.449044,-0.389358,1.819272,-0.403008,1.380592,-0.535734,-0.512548,-0.363471,...,-0.936234,-0.464602,-0.415888,-0.636406,-0.871200,0.0,0.0,0.682264,1.092611,0.524793
193,-0.402266,-0.666432,-0.354860,-0.396706,1.819272,-0.403008,1.380592,-0.028771,-0.067705,-0.191394,...,0.992200,-0.464602,-0.415888,1.040047,0.274062,0.0,0.0,0.682264,1.277822,0.732213


## Now that we have constructed our data frame, let's do Logistic Regression!
Naively, let's start by plugging in all our data, each feature group in relation to its reference variable (except for Arousal and Valence, since they are not linearly dependent).

In [10]:
#With the default optimizer, the code below throws a LinAlgError: Singular Matrix. Furthermore, we get runtime
#warnings concerning overflow and division by zero (which aligns with the inf values reported above for the
#affect of question and answer).
#To prevent this error we use the bfgs method
#(https://stackoverflow.com/questions/20703733/logit-regression-and-singular-matrix-error-in-python)
#Using this, we get results, but the model does not converge
# The formula is split over adjacent string literals for readability only; the resulting string is unchanged.
omnibus_formula = (
    'A_Sex ~ Q_Sex * ('
    'Q_Anger + Q_Boredom + Q_Disgust + Q_Sadness + Q_Fear + Q_Happiness + '
    'Q_Arousal + Q_Valence + Q("Q_High Interest") + '
    'Q_Aggressiv + Q_Cheerful + Q_Nervous + Q_Tired) + '
    'A_Anger + A_Boredom + A_Disgust + A_Fear + A_Happiness + A_Sadness + '
    'A_Arousal + A_Valence +Q("A_High Interest") + '
    'A_Aggressiv + A_Cheerful + A_Nervous + A_Tired + '
    'P_Anger + P_Boredom + P_Disgust + P_Fear + P_Happiness + P_Sadness + '
    'P_Arousal + P_Valence + Q("P_High Interest") + '
    'P_Aggressiv + P_Cheerful + P_Tired + P_Nervous'
)
emo_sex_model = smf.logit(omnibus_formula, data=data)
emotion_res = emo_sex_model.fit(method='bfgs')
emotion_res.summary()

         Current function value: 0.136129
         Iterations: 35
         Function evaluations: 36
         Gradient evaluations: 36




0,1,2,3
Dep. Variable:,A_Sex,No. Observations:,195.0
Model:,Logit,Df Residuals:,141.0
Method:,MLE,Df Model:,53.0
Date:,"Fri, 20 Nov 2020",Pseudo R-squ.:,0.8035
Time:,13:31:33,Log-Likelihood:,-26.545
converged:,False,LL-Null:,-135.1
,,LLR p-value:,9.619e-22

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2079,1.044,-0.199,0.842,-2.255,1.839
Q_Sex,2.1242,3.102,0.685,0.493,-3.955,8.203
Q_Anger,-1.5968,7.914,-0.202,0.840,-17.107,13.914
Q_Boredom,0.3673,40.798,0.009,0.993,-79.595,80.329
Q_Disgust,-0.9788,60.496,-0.016,0.987,-119.548,117.590
Q_Sadness,0.7060,82.025,0.009,0.993,-160.060,161.472
Q_Fear,-0.9302,5.894,-0.158,0.875,-12.482,10.621
Q_Happiness,1.3813,11.863,0.116,0.907,-21.871,24.633
Q_Arousal,4.1939,1.893,2.215,0.027,0.484,7.904


It is no surprise that the above model does not converge, since we used all features, even the collinear ones. We therefore continue with only the features that were not dropped by the VIF analysis.

In [11]:
# Logistic regression on the VIF-reduced data set: the sex of the answering speaker is modelled from the
# question features (interacting with the sex of the questioner) plus the answer and presentation features.
# The formula is split over adjacent string literals for readability only; the resulting string is unchanged.
reduced_formula = (
    'A_Sex ~ Q_Sex * ('
    'Q_Anger + Q_Disgust + Q_Fear + Q_Arousal + Q_Valence + '
    'Q_Cheerful + Q_Nervous + Q("Q_Normal Interest")) + '
    'A_Anger + A_Disgust + A_Fear + A_Arousal + A_Valence + '
    'A_Cheerful + A_Nervous + Q("A_Normal Interest") + '
    'P_Anger + P_Disgust + P_Arousal + P_Valence + '
    'Q("P_Normal Interest")+ P_Cheerful + P_Nervous'
)
model = smf.logit(reduced_formula, data=df_vif2)
res = model.fit()
res.summary()

Optimization terminated successfully.
         Current function value: 0.250974
         Iterations 10


0,1,2,3
Dep. Variable:,A_Sex,No. Observations:,195.0
Model:,Logit,Df Residuals:,162.0
Method:,MLE,Df Model:,32.0
Date:,"Fri, 20 Nov 2020",Pseudo R-squ.:,0.6377
Time:,13:31:33,Log-Likelihood:,-48.94
converged:,True,LL-Null:,-135.1
,,LLR p-value:,3.769e-21

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0510,0.360,0.142,0.887,-0.655,0.757
Q_Sex,0.4246,0.720,0.590,0.555,-0.986,1.835
Q_Anger,0.0213,0.472,0.045,0.964,-0.903,0.946
Q_Disgust,-0.4828,0.514,-0.940,0.347,-1.489,0.524
Q_Fear,0.1891,0.646,0.293,0.770,-1.077,1.455
Q_Arousal,2.2794,0.805,2.830,0.005,0.701,3.858
Q_Valence,0.2009,0.403,0.498,0.619,-0.590,0.992
Q_Cheerful,-1.4154,0.484,-2.922,0.003,-2.365,-0.466
Q_Nervous,-0.6492,0.455,-1.428,0.153,-1.540,0.242
