In [1]:
import json
import pandas
import re
import string
import pickle
import time
from sys import stdout
import sys
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from operator import itemgetter
from statsmodels.formula.api import ols
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
from sklearn import linear_model
import math
from scipy.spatial.distance import pdist,squareform
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
def createDataFrame(path):
    return pd.read_excel(path,
    header=0,
    index_col= 0,
    keep_default_na=True)

def calculate_pvalues(df):
    df = df._get_numeric_data()
    dfcols = pd.DataFrame(columns=df.columns)
    pvalues = dfcols.transpose().join(dfcols, how='outer')
    for r in df.columns:
        for c in df.columns:
            if c == r:
                df_corr = df[[r]].dropna()
            else:
                df_corr = df[[r,c]].dropna()
            pvalues[r][c] = stats.spearmanr(df_corr[r], df_corr[c])[1]
    return pvalues

def flattenList(alist):
    flat_list = []
    for sublist in alist:
        if sublist is None:
            flat_list.append(None)
        else: 
            tempList = []
            for item in sublist:    
                tempList.append(item[0])
            flat_list.append(tempList)
    return flat_list

def corrAnalysis(df1, df2):
    newList = []
    coeffmat = np.zeros((df1.shape[1], df2.shape[1]))
    pvalmat = np.zeros((df1.shape[1], df2.shape[1]))

    for i in range(df1.shape[1]):    
        for j in range(df2.shape[1]):        
            corrtest = stats.spearmanr(df1[df1.columns[i]], df2[df2.columns[j]])  

            coeffmat[i,j] = corrtest[0]
            pvalmat[i,j] = corrtest[1]
    newList.append(pd.DataFrame(coeffmat, columns=df2.columns, index=df1.columns))
    newList.append(pd.DataFrame(pvalmat, columns=df2.columns, index=df1.columns))
    return newList

def returnValueList(clist):
    return clist[1]

def checkIfDuplicates_1(listOfElems):
    if len(listOfElems) == len(set(listOfElems)):
        return False
    else:
        return True

def getHighCorrelationByPvalues(dfList, pvalue):
    highCorrelationList = []
    for i in range(0, len(dfList[0].index)):
        for j in range(0, len(dfList[0].columns)):
            tempList = []
            if(dfList[0].values[i,j] >= 0.1 and dfList[1].values[i,j] < pvalue):    
                tempList.append(dfList[0].index[i])
                tempList.append(dfList[0].columns[j])
                tempList.append(dfList[0].values[i,j])
                tempList.append(dfList[1].values[i,j])
                highCorrelationList.append(tempList)
    return highCorrelationList

def getAllCorrelations(dfList):
    correlationList = []
    for i in range(0, len(dfList[0].index)):
        for j in range(0, len(dfList[0].columns)):
            tempList = []
            tempList.append(dfList[0].index[i])
            tempList.append(dfList[0].columns[j])
            tempList.append(dfList[0].values[i,j])
            tempList.append(dfList[1].values[i,j])
            correlationList.append(tempList)
    return correlationList

def mergeCorrPDataFrames(dfList):
    df1 = dfList[0].copy(deep=True)
    df2 = dfList[1].copy(deep=True)

    df1.index = pd.MultiIndex.from_tuples([tuple(('corr', i)) for i in df1.index])
    df2.index = pd.MultiIndex.from_tuples([tuple(('p-value', i)) for i in df2.index])
    df = pd.concat([df1, df2])
    df.index.rename(['Type','ResponseValue'], inplace=True)
    return df
    

def getLowCorrelationByPvalues(dfList, pvalue):
    highCorrelationList = []
    for i in range(0, len(dfList[0].index)):
        for j in range(0, len(dfList[0].columns)):
            tempList = []
            if(dfList[0].values[i,j] <= -0.1 and dfList[1].values[i,j] < pvalue):    
                tempList.append(dfList[0].index[i])
                tempList.append(dfList[0].columns[j])
                tempList.append(dfList[0].values[i,j])
                tempList.append(dfList[1].values[i,j])
                highCorrelationList.append(tempList)
    return highCorrelationList

def getComparisonList(conListSorted, comparison):
    newCorrList = []
    for corr in conListSorted:
        for corrAll in getAllCorrelations(comparison):
            if corr[0] == corrAll[0] and corr[1] == corrAll[1]:
                newCorrList.append(corrAll)
    return newCorrList

def getComparison(corr1,corr2comp):
    newList = []
    df1 = pd.DataFrame.from_records(corr1, columns = ['ResponseValue','ExplanatoryConcept', 'Correlation', 'p-value'])
    df2 = pd.DataFrame.from_records(getComparisonList(corr1, corr2comp), columns = ['ResponseValue','ExplanatoryConcept', 'Correlation', 'p-value'])
    df1.columns = ['ResponseValue','ExplanatoryConcept', "ubs_correlation", "ubs_p_value"]
    df2.columns = ['ResponseValue','ExplanatoryConcept', "wf_correlation", "wf_p_value"]
    fin_df = pd.merge(df1,df2,on=['ResponseValue', 'ExplanatoryConcept'],how='left')
    
    return fin_df

In [3]:
pickle_off = open("..\\ProcessedData\\MultiVariateBase.pickle","rb")
multiVariateBase = pickle.load(pickle_off)
concDummyId = pickle.load(pickle_off)
groupAlloc = pickle.load(pickle_off)
pickle_off.close()

In [8]:
#Get Overall occurences
occDummy = concDummyId.drop(columns = "Identifier")
occDummy = occDummy.replace(0,np.nan).count().sort_values(ascending = False)
mostOccuredConcepts = pd.DataFrame(occDummy.head(20), columns = ['Occurence'])
mostOccuredConcepts

Unnamed: 0,Occurence
no person,544
nature,514
travel,334
desktop,308
landscape,289
people,270
sky,237
water,232
outdoors,222
summer,216


In [9]:
#Communication Strategy
comBase = multiVariateBase[['Identifier','CommentsEnabled','CaptionPolarity','CaptionSubjectivity','CaptionWordCount','9-12','12-15','15-18','18-21','21-24','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']]
#Content Strategy
conBase = multiVariateBase[['Identifier','Video','G1','G2','G3','G4','G5','G6','G7','G8','G9','G10','G11','G12','G13','G14','G15','G16','G17','G18','G19','G20','G21','G22','G23','G24']]
#User Engagement
usEng = multiVariateBase[['Identifier','Likes','Comments','AvgCommentPolarity']]

In [23]:
#Poisson Regression Analysis
regrComL1 = sm.GLM(usEng.query('Identifier == 1')['Likes'], comBase.query('Identifier == 1').loc[:,'CommentsEnabled':], family = sm.families.Poisson(),divide='ignore', invalid='ignore').fit()
regrComL0 = sm.GLM(usEng.query('Identifier == 0')['Likes'], comBase.query('Identifier == 0').loc[:,'CommentsEnabled':], family = sm.families.Poisson(),divide='ignore', invalid='ignore').fit()

regrComC1 = sm.GLM(usEng.query('Identifier == 1')['Comments'], comBase.query('Identifier == 1').loc[:,'CommentsEnabled':], family = sm.families.Poisson(),divide='ignore', invalid='ignore').fit()
regrComC0 = sm.GLM(usEng.query('Identifier == 0')['Comments'], comBase.query('Identifier == 0').loc[:,'CommentsEnabled':], family = sm.families.Poisson(),divide='ignore', invalid='ignore').fit()

regrConL1 = sm.GLM(usEng.query('Identifier == 1')['Likes'], conBase.query('Identifier == 1').loc[:,'Video':], family = sm.families.Poisson(),divide='ignore', invalid='ignore').fit()
regrConL0 = sm.GLM(usEng.query('Identifier == 0')['Likes'], conBase.query('Identifier == 0').loc[:,'Video':], family = sm.families.Poisson(),divide='ignore', invalid='ignore').fit()

regrConC1 = sm.GLM(usEng.query('Identifier == 1')['Comments'], conBase.query('Identifier == 1').loc[:,'Video':], family = sm.families.Poisson(),divide='ignore', invalid='ignore').fit()
regrConC0 = sm.GLM(usEng.query('Identifier == 0')['Comments'], conBase.query('Identifier == 0').loc[:,'Video':], family = sm.families.Poisson(),divide='ignore', invalid='ignore').fit()

#Switched in thesis
print(regrComL0.summary())
print(regrComC0.summary())
print(regrComL1.summary())
print(regrComC1.summary())

print(regrConL0.summary())
print(regrConC0.summary())
print(regrConL1.summary())
print(regrConC1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  Likes   No. Observations:                  746
Model:                            GLM   Df Residuals:                      732
Model Family:                 Poisson   Df Model:                           13
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -7.7487e+06
Date:                Thu, 17 Dec 2020   Deviance:                   1.5491e+07
Time:                        10:32:34   Pearson chi2:                 5.33e+07
No. Iterations:                     7                                         
Covariance Type:            nonrobust                                         
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
CommentsEnabled         7.6241    

In [24]:
# OLS
regrComA1 = sm.regression.linear_model.OLS(usEng.query('Identifier == 1')['AvgCommentPolarity'], comBase.query('Identifier == 1').loc[:,'CommentsEnabled':],divide='ignore', invalid='ignore').fit()
regrComA0 = sm.regression.linear_model.OLS(usEng.query('Identifier == 0')['AvgCommentPolarity'], comBase.query('Identifier == 0').loc[:,'CommentsEnabled':],divide='ignore', invalid='ignore').fit()

regrConA1 = sm.regression.linear_model.OLS(usEng.query('Identifier == 1')['AvgCommentPolarity'], conBase.query('Identifier == 1').loc[:,'Video':],divide='ignore', invalid='ignore').fit()
regrConA0 = sm.regression.linear_model.OLS(usEng.query('Identifier == 0')['AvgCommentPolarity'], conBase.query('Identifier == 0').loc[:,'Video':],divide='ignore', invalid='ignore').fit()

print(regrComA0.summary())
print(regrConA0.summary())

print(regrComA1.summary())
print(regrConA1.summary())

                            OLS Regression Results                            
Dep. Variable:     AvgCommentPolarity   R-squared:                       0.008
Model:                            OLS   Adj. R-squared:                 -0.010
Method:                 Least Squares   F-statistic:                    0.4399
Date:                Thu, 17 Dec 2020   Prob (F-statistic):              0.955
Time:                        10:41:13   Log-Likelihood:                -120.69
No. Observations:                 746   AIC:                             269.4
Df Residuals:                     732   BIC:                             334.0
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
CommentsEnabled         0.0791    

  return np.sqrt(eigvals[0]/eigvals[-1])


In [30]:
# Load saved correlations
pickle_off = open("..\\ProcessedData\\CorrelationLists.pickle","rb")
b0rvImage = pickle.load(pickle_off)
b0rvRest = pickle.load(pickle_off)
b1rvImage = pickle.load(pickle_off)
b1rvRest = pickle.load(pickle_off)
pickle_off.close()

# # Calculate Correlations
# b0rvImage = corrAnalysis(usEng.query('Identifier == 0').loc[:,'Likes':],concDummyId.query('Identifier == 0').loc[:,'beach':])
# b0rvRest = corrAnalysis(usEng.query('Identifier == 0').loc[:,'Likes':],multiVariateBase.query('Identifier == 0').loc[:,'CommentsEnabled':])

# b1rvImage = corrAnalysis(usEng.query('Identifier == 1').loc[:,'Likes':],concDummyId.query('Identifier == 1').loc[:,'beach':])
# b1rvRest = corrAnalysis(usEng.query('Identifier == 1').loc[:,'Likes':],multiVariateBase.query('Identifier == 1').loc[:,'CommentsEnabled':])

# # Save calculated Correlations
# pickling_on = open("..\\ProcessedData\\CorrelationLists.pickle","wb")
# pickle.dump(b0rvImage, pickling_on)
# pickle.dump(b0rvRest, pickling_on)
# pickle.dump(b1rvImage, pickling_on)
# pickle.dump(b1rvRest, pickling_on)
# pickling_on.close()

In [33]:
# concept <--> response variable
# Single Variable/Concept Analysis with spearmans rank correlation using defined significance

#Gets sorted correlation and p value lists
b0ImageCorrHighSorted = sorted(getHighCorrelationByPvalues(b0rvImage, 0.05), key=itemgetter(3))
b1ImageCorrHighSorted = sorted(getHighCorrelationByPvalues(b1rvImage, 0.05), key=itemgetter(3))

b0RestCorrHighSorted = sorted(getHighCorrelationByPvalues(b0rvRest, 0.05), key=itemgetter(3))
b1RestCorrHighSorted = sorted(getHighCorrelationByPvalues(b1rvRest, 0.05), key=itemgetter(3))

b0ImageCorrLowSorted = sorted(getLowCorrelationByPvalues(b0rvImage, 0.05), key=itemgetter(3))
b1ImageCorrLowSorted = sorted(getLowCorrelationByPvalues(b1rvImage, 0.05), key=itemgetter(3))

b0RestCorrHighSorted = sorted(getLowCorrelationByPvalues(b0rvRest, 0.05), key=itemgetter(3))
b1RestCorrHighSorted = sorted(getLowCorrelationByPvalues(b1rvRest, 0.05), key=itemgetter(3))

In [35]:
#Creates dataFrames from Lists
b0ImageCorrHigh = pd.DataFrame.from_records(b0ImageCorrHighSorted, columns = ['ResponseValue','ExplanatoryConcept', 'Correlation', 'p-value'])
b1ImageCorrHigh = pd.DataFrame.from_records(b1ImageCorrHighSorted, columns = ['ResponseValue','ExplanatoryConcept', 'Correlation', 'p-value'])

b0ImageCorrLow = pd.DataFrame.from_records(b0ImageCorrLowSorted, columns = ['ResponseValue','ExplanatoryConcept', 'Correlation', 'p-value'])
b1ImageCorrLow = pd.DataFrame.from_records(b1ImageCorrLowSorted, columns = ['ResponseValue','ExplanatoryConcept', 'Correlation', 'p-value'])

In [37]:
#Correlation Data sorted by response variable
b0LikesLow = b0ImageCorrLow[b0ImageCorrLow.ResponseValue == 'Likes'].loc[:,'ExplanatoryConcept':]
b0CommentsLow = b0ImageCorrLow[b0ImageCorrLow.ResponseValue == 'Comments'].loc[:,'ExplanatoryConcept':]
b0CommentPolarityLow = b0ImageCorrLow[b0ImageCorrLow.ResponseValue == 'totalCommentPolarity'].loc[:,'ExplanatoryConcept':]
b0CommentSubjectivityLow = b0ImageCorrLow[b0ImageCorrLow.ResponseValue == 'totalCommentSubjectivity'].loc[:,'ExplanatoryConcept':]

b0LikesHigh = b0ImageCorrHigh[b0ImageCorrHigh.ResponseValue == 'Likes'].loc[:,'ExplanatoryConcept':]
b0CommentsHigh = b0ImageCorrHigh[b0ImageCorrHigh.ResponseValue == 'Comments'].loc[:,'ExplanatoryConcept':]
b0CommentPolarityHigh = b0ImageCorrHigh[b0ImageCorrHigh.ResponseValue == 'totalCommentPolarity'].loc[:,'ExplanatoryConcept':]
b0CommentSubjectivityHigh = b0ImageCorrHigh[b0ImageCorrHigh.ResponseValue == 'totalCommentSubjectivity'].loc[:,'ExplanatoryConcept':]

b1LikesLow = b1ImageCorrLow[b1ImageCorrLow.ResponseValue == 'Likes'].loc[:,'ExplanatoryConcept':]
b1CommentsLow = b1ImageCorrLow[b1ImageCorrLow.ResponseValue == 'Comments'].loc[:,'ExplanatoryConcept':]
b1CommentPolarityLow = b1ImageCorrLow[b1ImageCorrLow.ResponseValue == 'totalCommentPolarity'].loc[:,'ExplanatoryConcept':]
b1CommentSubjectivityLow = b1ImageCorrLow[b1ImageCorrLow.ResponseValue == 'totalCommentSubjectivity'].loc[:,'ExplanatoryConcept':]

b1LikesHigh = b1ImageCorrHigh[b1ImageCorrHigh.ResponseValue == 'Likes'].loc[:,'ExplanatoryConcept':]
b1CommentsHigh = b1ImageCorrHigh[b1ImageCorrHigh.ResponseValue == 'Comments'].loc[:,'ExplanatoryConcept':]
b1CommentPolarityHigh = b1ImageCorrHigh[b1ImageCorrHigh.ResponseValue == 'totalCommentPolarity'].loc[:,'ExplanatoryConcept':]
b1CommentSubjectivityHigh = b1ImageCorrHigh[b1ImageCorrHigh.ResponseValue == 'totalCommentSubjectivity'].loc[:,'ExplanatoryConcept':]

b0CorrRest = mergeCorrPDataFrames(b0rvRest)
b1CorrRest = mergeCorrPDataFrames(b1rvRest)



In [38]:
#Correlation Data comparison UBS <--> WF
b0HighCorrComp = getComparison(b0ImageCorrHighSorted,b1rvImage)
b0LowCorrComp = getComparison(b0ImageCorrLowSorted,b1rvImage)

b1HighCorrComp = getComparison(b1ImageCorrHighSorted,b0rvImage)
b1LowCorrComp = getComparison(b1ImageCorrLowSorted,b0rvImage)

In [42]:
b0HighCorrComp.head(20).rename(columns = {"ubs_correlation": "Id0", "ubs_p_value":"Id0_pvalue", "wf_correlation": "Id1", "wf_p_value":"Id1_pvalue"})

Unnamed: 0,ResponseValue,ExplanatoryConcept,Id0,Id0_pvalue,Id1,Id1_pvalue
0,Likes,nature,0.310151,4.257631e-18,0.051886,0.39401
1,Likes,travel,0.302086,3.3154530000000004e-17,-0.04477,0.46214
2,Likes,landscape,0.298952,7.234853e-17,-0.022497,0.711856
3,Likes,water,0.286395,1.500895e-15,-0.028007,0.645613
4,Likes,wildlife,0.259548,5.981099e-13,-0.087423,0.150455
5,Likes,mammal,0.234956,8.151484e-11,-0.088691,0.144603
6,Likes,sky,0.231692,1.504188e-10,-0.021079,0.72928
7,Likes,ocean,0.229438,2.284291e-10,-0.011509,0.850132
8,Likes,snow,0.22521,4.941888e-10,0.048935,0.421497
9,Likes,mountain,0.222607,7.887889e-10,-0.063281,0.298391


In [43]:
b1HighCorrComp.head(20).rename(columns = {"ubs_correlation": "Id0", "ubs_p_value":"Id0_pvalue", "wf_correlation": "Id1", "wf_p_value":"Id1_pvalue"})

Unnamed: 0,ResponseValue,ExplanatoryConcept,Id0,Id0_pvalue,Id1,Id1_pvalue
0,Comments,horizontal,0.349118,3.247705e-09,-0.064822,0.076831
1,Comments,creativity,0.284557,1.838589e-06,-0.044758,0.222067
2,Likes,horizontal,0.271691,5.467398e-06,-0.116184,0.001479
3,Comments,vertical,0.231046,0.0001204958,0.010068,0.783683
4,Comments,business,0.227896,0.0001498893,-0.074652,0.041511
5,Likes,business,0.217784,0.0002960015,-0.154169,2.3e-05
6,Likes,vertical,0.209357,0.0005099241,-0.059167,0.106371
7,Comments,indoors,0.193353,0.001353171,-0.041548,0.257055
8,Likes,creativity,0.192612,0.00141315,-0.014506,0.692436
9,Likes,no person,0.181809,0.002614331,0.00496,0.892426


In [44]:
b0LowCorrComp.head(20).rename(columns = {"ubs_correlation": "Id1", "ubs_p_value":"Id1_pvalue", "wf_correlation": "Id0", "wf_p_value":"Id0_pvalue"})

Unnamed: 0,ResponseValue,ExplanatoryConcept,Id1,Id1_pvalue,Id0,Id0_pvalue
0,Likes,isolated,-0.27035,5.822759e-14,0.008957,0.883091
1,Likes,food,-0.237511,5.014771e-11,0.056427,0.35389
2,Likes,disjunct,-0.224897,5.229209e-10,-0.004933,0.935461
3,Likes,design,-0.220165,1.216886e-09,0.097357,0.10914
4,Likes,pattern,-0.198909,4.28884e-08,0.004932,0.935474
5,Likes,fabric,-0.196956,5.839695e-08,-0.011218,0.853884
6,Likes,wear,-0.19242,1.181494e-07,0.017673,0.771706
7,Likes,textile,-0.186584,2.855311e-07,-0.011218,0.853884
8,Likes,fashion,-0.17322,1.943174e-06,0.023502,0.699586
9,Likes,cooking,-0.166785,4.652473e-06,0.033258,0.584974


In [45]:
b1LowCorrComp.head(20).rename(columns = {"ubs_correlation": "Id1", "ubs_p_value":"Id1_pvalue", "wf_correlation": "Id0", "wf_p_value":"Id0_pvalue"})

Unnamed: 0,ResponseValue,ExplanatoryConcept,Id1,Id1_pvalue,Id0,Id0_pvalue
0,Comments,architecture,-0.198228,0.001013,0.019817,0.5889116
1,Comments,group together,-0.181427,0.00267,,
2,AvgCommentPolarity,desktop,-0.179615,0.00295,-0.032306,0.3782454
3,Comments,building,-0.179578,0.002956,0.00152,0.9669425
4,AvgCommentPolarity,motley,-0.164509,0.006544,0.007688,0.8339635
5,Likes,group together,-0.164445,0.006565,,
6,Likes,two,-0.158717,0.008736,0.004423,0.9040148
7,Comments,two,-0.156792,0.009597,-0.030413,0.4068378
8,AvgCommentPolarity,banner,-0.154906,0.010513,0.033875,0.3555238
9,Likes,success,-0.152206,0.011959,-0.068195,0.06265191


In [49]:
# Save calculated Correlations
pickling_on = open("..\\ProcessedData\\FinalAnalysisData.pickle","wb")

#Communication Strategy
pickle.dump(regrComL0, pickling_on)
pickle.dump(regrComC0, pickling_on)
pickle.dump(regrComA0, pickling_on)
pickle.dump(regrComL1, pickling_on)
pickle.dump(regrComC1, pickling_on)
pickle.dump(regrComA1, pickling_on)

#Content Strategy
pickle.dump(regrConL0, pickling_on)
pickle.dump(regrConC0, pickling_on)
pickle.dump(regrConA0, pickling_on)
pickle.dump(regrConL1, pickling_on)
pickle.dump(regrConC1, pickling_on)
pickle.dump(regrConA1, pickling_on)

#Correlations
pickle.dump(b0rvImage, pickling_on)
pickle.dump(b0rvRest, pickling_on)
pickle.dump(b1rvImage, pickling_on)
pickle.dump(b1rvRest, pickling_on)

pickling_on.close()