In [361]:
import pymongo
import pandas as pd
import pickle
from data_prep import DataPrep
from model import Model
from sklearn.preprocessing import MinMaxScaler

In [362]:
mc = pymongo.MongoClient()
db = mc['my-facebook-webscrape']
fb_statuses = db['fb-statuses']



In [363]:
def anonymize_name(name):
    output = ""
    for i in name.split():
        output += i[0] + '.'
    return output

In [364]:
names_and_statuses = list(fb_statuses.find({'friends_dict': {'$exists': False}}, {'statuses':1, 'name':1, '_id':0}))

df_dict = {'NAME': [], 'DATE': [], 'STATUS': []}
for entry in names_and_statuses:
    
    name = entry['name']
#     name = anonymize_name(name)
    
    statuses = entry['statuses']
    
    for date, status in statuses.items():
        df_dict['NAME'].append(name)
        df_dict['DATE'].append(date)
        df_dict['STATUS'].append(status)

df = pd.DataFrame(df_dict)
df['STATUS_COUNT'] = df.groupby("NAME")["STATUS"].transform('count')

In [365]:
df

Unnamed: 0,DATE,NAME,STATUS,STATUS_COUNT
0,08/03/2009 9:20pm,Kellian Valenti,I'm done (╯°□°）╯︵ ┻━┻),3
1,08/02/2010 4:32am,Kellian Valenti,Summer,3
2,08/03/2010 4:19am,Kellian Valenti,Summer,3
3,05/03/2018 12:17pm,Chen Chi,Kind of healing especially in these tough days...,94
4,04/14/2018 4:03pm,Chen Chi,A rainy week -_-\nDimmed ocean view,94
5,04/13/2018 9:48am,Chen Chi,I'm fine\nSmiley face:) @ Downtown Seattle,94
6,03/03/2018 9:22pm,Chen Chi,I'm fine\nSmiley face:) @ Downtown Seattle,94
7,02/20/2018 12:41pm,Chen Chi,想去的地方终于打了卡，相聚离开都有时候，但我们都走在一条叫做努力的路上.See u when...,94
8,01/12/2018 7:18pm,Chen Chi,Amazing Samsung~,94
9,01/12/2018 7:11pm,Chen Chi,"油腻少女范儿get! @ Chelsea, Manhattan",94


In [376]:
traits = ['OPN', 'CON', 'EXT', 'AGR', 'NEU']

for trait in traits:
    pkl_model = pickle.load(open(trait + '_model.pkl', "rb"))
    dp = DataPrep()
    X = df['STATUS']
    
    trait_scores = pkl_model.predict(X, regression=True)
    df['pred_s'+trait] = trait_scores
    
    trait_categories = pkl_model.predict(X, regression=False)
    df['pred_c'+trait] = trait_categories
    
    trait_categories_probs = pkl_model.predict_proba(X)
    df['pred_prob_c'+trait] = trait_categories_probs[:, 1]

KeyboardInterrupt: 

In [377]:
df['STATUS']

0                                 I'm done (╯°□°）╯︵ ┻━┻) 
1                                                 Summer 
2                                                 Summer 
3       Kind of healing especially in these tough days...
4                    A rainy week -_-\nDimmed ocean view 
5             I'm fine\nSmiley face:) @ Downtown Seattle 
6             I'm fine\nSmiley face:) @ Downtown Seattle 
7       想去的地方终于打了卡，相聚离开都有时候，但我们都走在一条叫做努力的路上.See u when...
8                                       Amazing Samsung~ 
9                        油腻少女范儿get! @ Chelsea, Manhattan 
10                       油腻少女范儿get! @ Chelsea, Manhattan 
11                                                  板栗烧鸡 
12      May we be at peace.\nBegin New Year with stand...
13                             和每一个角落说一个不够正式的再见 再见再见再见再见 
14                             和每一个角落说一个不够正式的再见 再见再见再见再见 
15                   Sunset on Hudson River\n对港口，船只，蜜汁热爱 
16                   Sunset on Hudson River\n对港口，船只，蜜汁热爱 
17            

In [367]:
trait_categories = []
trait_categories_probabilities = []
trait_scores = [
'pred_sOPN', 'pred_sCON', 'pred_sEXT', 'pred_sAGR', 'pred_sNEU',
]
trait_categories_scores_percentiles = []

In [368]:
scaler = MinMaxScaler(feature_range=(0, 50))
scores = df[trait_scores]
scaled_scores = scaler.fit_transform(scores)

df_scaled = pd.DataFrame(scaled_scores)
df_scaled.columns = trait_scores
df_scaled

for col in trait_scores:
    df[col] = df_scaled[col]

df

Unnamed: 0,DATE,NAME,STATUS,STATUS_COUNT,pred_sOPN,pred_cOPN,pred_prob_cOPN,pred_sCON,pred_cCON,pred_prob_cCON,pred_sEXT,pred_cEXT,pred_prob_cEXT,pred_sAGR,pred_cAGR,pred_prob_cAGR,pred_sNEU,pred_cNEU,pred_prob_cNEU
0,08/03/2009 9:20pm,Kellian Valenti,I'm done (╯°□°）╯︵ ┻━┻),3,31.860802,True,0.680632,27.761192,False,0.451479,28.885957,False,0.366726,23.597127,False,0.420443,22.304670,False,0.465721
1,08/02/2010 4:32am,Kellian Valenti,Summer,3,24.962801,False,0.273929,26.220598,False,0.340480,29.976178,False,0.115958,36.138516,True,0.897117,10.518579,False,0.031585
2,08/03/2010 4:19am,Kellian Valenti,Summer,3,24.962801,False,0.273929,26.220598,False,0.340480,29.976178,False,0.115958,36.138516,True,0.897117,10.518579,False,0.031585
3,05/03/2018 12:17pm,Chen Chi,Kind of healing especially in these tough days...,94,31.413932,True,0.530702,28.287230,False,0.449099,26.826149,False,0.393263,32.312904,True,0.794428,22.823460,False,0.457878
4,04/14/2018 4:03pm,Chen Chi,A rainy week -_-\nDimmed ocean view,94,32.709772,True,0.965541,24.251671,False,0.168348,29.571759,False,0.337807,26.582994,True,0.630551,16.377962,False,0.331671
5,04/13/2018 9:48am,Chen Chi,I'm fine\nSmiley face:) @ Downtown Seattle,94,46.031905,True,0.703691,30.356917,True,0.647431,39.259323,True,0.676224,28.939255,False,0.172295,24.361002,False,0.230940
6,03/03/2018 9:22pm,Chen Chi,I'm fine\nSmiley face:) @ Downtown Seattle,94,46.031905,True,0.703691,30.356917,True,0.647431,39.259323,True,0.676224,28.939255,False,0.172295,24.361002,False,0.230940
7,02/20/2018 12:41pm,Chen Chi,想去的地方终于打了卡，相聚离开都有时候，但我们都走在一条叫做努力的路上.See u when...,94,32.654076,True,0.972727,22.301363,False,0.163636,25.675199,False,0.044552,18.026916,False,0.154545,18.584256,False,0.036364
8,01/12/2018 7:18pm,Chen Chi,Amazing Samsung~,94,27.001952,True,0.619031,35.464650,True,0.791822,30.843268,True,0.603767,22.828685,True,0.690817,16.136341,False,0.087121
9,01/12/2018 7:11pm,Chen Chi,"油腻少女范儿get! @ Chelsea, Manhattan",94,32.074215,True,0.667879,26.928203,False,0.453508,29.005143,False,0.366726,25.780804,False,0.437506,22.064044,True,0.556141


In [369]:
df_mean_scores = df.groupby('NAME')[[
    'pred_sOPN', 'pred_sCON', 'pred_sEXT', 'pred_sAGR', 'pred_sNEU',
]].mean()

df_mean_scores['status_counts'] = df.groupby('NAME')['STATUS_COUNT'].count()

df_mean_scores.sort_values(by=['status_counts'], ascending=False)

Unnamed: 0_level_0,pred_sOPN,pred_sCON,pred_sEXT,pred_sAGR,pred_sNEU,status_counts
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Stanley Cheng,32.135479,27.376874,30.418839,27.827454,17.608064,209
Emmanuel Watkins,32.193403,26.643276,29.272339,25.898633,19.380555,207
Kimberli Cheung Wright,33.037520,28.075634,30.000318,27.884735,17.792256,206
Eddie Ignacio,32.182786,27.134386,28.543318,26.571175,19.488914,205
Bradley Li,32.904767,26.892076,28.862625,27.377663,18.245517,205
William Young,32.280717,26.353868,29.145153,25.903751,20.278178,204
Annelise Yee,32.428381,27.076602,29.918682,27.222200,18.765195,203
Rosanna Cheng,32.650958,27.269315,29.888626,27.070168,18.229877,201
Nick DeJesus,31.993604,25.820168,29.015642,26.164273,19.784358,200
Jonathan Cheung,31.904739,27.432580,29.993243,27.174066,18.840942,119


In [370]:
df_mean_scores = df.groupby('NAME')[[
    'pred_sOPN', 'pred_sCON', 'pred_sEXT', 'pred_sAGR', 'pred_sNEU',
]].mean()

df_mean_scores = df.groupby(['NAME'], as_index=False).agg(
                      {'pred_sOPN':['mean'], 'pred_sCON':['mean'], 'pred_sEXT':['mean'], 'pred_sAGR':['mean'], 'pred_sNEU':['mean']})

df_mean_scores.columns = ['NAME', 'avg_pred_sOPN', 'avg_pred_sCON', 'avg_pred_sEXT', 'avg_pred_sAGR', 'avg_pred_sNEU']

df = df.merge(df_mean_scores, how='right', on='NAME')

In [371]:
df_mean_scores = df.groupby('NAME')[[
    'pred_prob_cOPN', 'pred_prob_cCON', 'pred_prob_cEXT', 'pred_prob_cAGR', 'pred_prob_cNEU'
]].median()

df_mean_scores['status_counts'] = df.groupby('NAME')['STATUS_COUNT'].count()

df_mean_scores.sort_values(by=['status_counts'], ascending=False)

Unnamed: 0_level_0,pred_prob_cOPN,pred_prob_cCON,pred_prob_cEXT,pred_prob_cAGR,pred_prob_cNEU,status_counts
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Stanley Cheng,0.791092,0.462953,0.390909,0.563636,0.318182,209
Emmanuel Watkins,0.695140,0.451479,0.366726,0.472727,0.393477,207
Kimberli Cheung Wright,0.773593,0.448980,0.380220,0.572727,0.324242,206
Eddie Ignacio,0.727041,0.436364,0.366726,0.464433,0.402673,205
Bradley Li,0.770942,0.445455,0.366726,0.540220,0.363636,205
William Young,0.759258,0.418182,0.370526,0.465251,0.413759,204
Annelise Yee,0.781818,0.451479,0.409420,0.519187,0.390909,203
Rosanna Cheng,0.790505,0.445455,0.366726,0.513103,0.390909,201
Nick DeJesus,0.745455,0.414398,0.366726,0.512987,0.425767,200
Jonathan Cheung,0.761515,0.441335,0.390909,0.509231,0.345455,119


In [372]:
# df_mean_scores = df.groupby('NAME')[[
#     'pred_cOPN', 'pred_cCON', 'pred_cEXT', 'pred_cAGR', 'pred_cNEU',
# ]].count()

# df_mean_scores['status_counts'] = df.groupby('NAME')['STATUS_COUNT'].count()

# df_mean_scores.sort_values(by=['status_counts'], ascending=False)

In [373]:
possible_percentiles = list(range(101))
    
def calc_percentile(score, pop_scores):
    score_percentiles = []
    
    for perc in possible_percentiles:
        score_percentiles.append(np.percentile(pop_scores, perc))
        
    calc_perc = 0
    for score_perc, perc in zip(score_percentiles, possible_percentiles):
        if int(score_perc) >= score:
            calc_perc = perc
            break
        elif int(score_perc) > score:
            break
            
    return calc_perc

In [374]:
import scipy.stats as stats

df['sOPN_percentile'] = df['avg_pred_sOPN'].apply(lambda x: stats.percentileofscore(df['avg_pred_sOPN'].sort_values(),x))
df['sCON_percentile'] = df['avg_pred_sCON'].apply(lambda x: stats.percentileofscore(df['avg_pred_sCON'].sort_values(),x))
df['sEXT_percentile'] = df['avg_pred_sEXT'].apply(lambda x: stats.percentileofscore(df['avg_pred_sAGR'].sort_values(),x))
df['sAGR_percentile'] = df['avg_pred_sAGR'].apply(lambda x: stats.percentileofscore(df['avg_pred_sEXT'].sort_values(),x))
df['sNEU_percentile'] = df['avg_pred_sNEU'].apply(lambda x: stats.percentileofscore(df['avg_pred_sNEU'].sort_values(),x))

In [375]:
df_mean_scores = df.groupby('NAME')[[
    'sOPN_percentile', 'sCON_percentile', 'sEXT_percentile', 'sAGR_percentile', 'sNEU_percentile',
]].mean()

df_mean_scores['status_counts'] = df.groupby('NAME')['STATUS_COUNT'].count()

df_mean_scores.sort_values(by=['status_counts'], ascending=False)

Unnamed: 0_level_0,sOPN_percentile,sCON_percentile,sEXT_percentile,sAGR_percentile,sNEU_percentile,status_counts
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Stanley Cheng,37.075949,57.303797,97.481013,4.746835,13.177215,209
Emmanuel Watkins,42.481013,24.987342,93.936709,1.430380,63.936709,207
Kimberli Cheung Wright,84.753165,75.310127,97.316456,4.746835,18.373418,206
Eddie Ignacio,39.873418,44.025316,90.518987,2.569620,67.696203,205
Bradley Li,77.936709,32.075949,91.101266,3.556962,30.291139,205
William Young,47.259494,19.639241,93.455696,1.430380,87.487342,204
Annelise Yee,55.949367,40.417722,97.075949,3.405063,45.151899,203
Rosanna Cheng,69.291139,50.620253,97.012658,3.253165,27.645570,201
Nick DeJesus,28.778481,12.537975,93.215190,1.784810,80.715190,200
Jonathan Cheung,21.139241,60.974684,97.316456,3.392405,49.632911,119
