In [1]:
import pandas as pd
import seaborn as sns
import pandasql as psql
import numpy as np
import matplotlib.pyplot as plt
from dython.nominal import identify_nominal_columns
from scipy import stats
from scipy.stats import chisquare
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, MinMaxScaler
import mca
import prince

In [21]:
activity = pd.read_csv('original_data/activity.csv')
survey = pd.read_csv('original_data/survey_clear.csv')

In [22]:
activity = activity.dropna()

In [23]:
activity_survey = pd.merge(activity, survey, on=['egoid'])
activity_survey.to_csv('merged_data/activity_survey.csv', index=False)

# activity_survey = pd.read_csv('merged_data/activity_survey.csv')

In [24]:
def get_redundant_pairs(df):
    '''Get diagonal and lower triangular pairs of correlation matrix'''
    pairs_to_drop = []
    cols = df.columns
    for i in range(0, df.shape[1]):
        for j in range(0, i+1):
            pairs_to_drop.append((cols[i], cols[j], df.iat[i, j]))
    return pairs_to_drop

def get_top_abs_correlations(df):
    abs_corr = df.corr().abs()
    # au_corr = df.corr().abs().unstack()
    labels_to_drop = get_redundant_pairs(abs_corr)
    return sorted(labels_to_drop, key=lambda tup: tup[2], reverse=True)
    # au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
    # return au_corr[0:n]

In [25]:
c1 = get_top_abs_correlations(activity_survey)
c1

In [None]:
('peakcal', 'peakmins', 0.9607914723948829),
('cardiocal', 'cardiomins', 0.9567735185533744),
('lowrangemins', 'complypercent', 0.9412248931445473),
('fatburncal', 'fatburnmins', 0.8625015334223399),
('fatburncal', 'fairlyactiveminutes', 0.7715594542738567),
('lowrangecal', 'lowrangemins', 0.7548677042920645),
('club7rc_5', 'sedentarytime_scale_1', 0.7517370456401893),
('club7rc_5', 'SelfEff_diet_1_1', 0.7137024249691005),
('lowrangecal', 'complypercent', 0.6992630539562396),
('veryactiveminutes', 'steps', 0.6956130605019214),
('fatburnmins', 'meanrate', 0.6570696974480279),
('fatburncal', 'veryactiveminutes', 0.6524032785080631),
('lightlyactiveminutes', 'sedentaryminutes', 0.6356660009175586),
('veryactiveminutes', 'sdrate', 0.6299855083543254),
('sedentaryminutes', 'complypercent', 0.5962465713081001),
('veryactiveminutes', 'fairlyactiveminutes', 0.5895204991755101),
('fatburnmins', 'fairlyactiveminutes', 0.5893570134160409),
('club7rc_5', 'PSQI2_1', 0.5866469928455073),
('club7rc_5', 'SelfEff_diet_25_1', 0.5785041190899385),
('cardiomins', 'sdrate', 0.5452896691952817),
('steps', 'sdrate', 0.5447584588939157),
('fairlyactiveminutes', 'steps', 0.544264026067217),
('fatburncal', 'steps', 0.526351699286462),
('cardiocal', 'sdrate', 0.524690281916914),
('lowrangemins', 'sedentaryminutes', 0.5168834110893948),
('lowrangecal', 'sedentaryminutes', 0.5147434316635867),
('club7rc_5', 'physicalactivity5_1', 0.4970974096756166),
('lightlyactiveminutes', 'steps', 0.49213548001179924),
('club7rc_5', 'SelfEff_diet_26_1', 0.48919859480321704),
('fatburncal', 'meanrate', 0.48755164554996777),
('fatburncal', 'sdrate', 0.4833166996964153),
('lowrangecal', 'lightlyactiveminutes', 0.47880337436157105),
('club7rc_5', 'PSQI4_1', 0.466190532614564),
('cardiocal', 'veryactiveminutes', 0.4616353630381654),
('club7rc_5', 'PSQIGlobal_1', 0.46090807503070896),
('club7rc_5', 'SelfEff_diet_6_1', 0.4608923337198263),
('club7rc_5', 'selfreg_scale_1', 0.45844484476158903),
('floors', 'steps', 0.4581370720055981),
('club7rc_5', 'SelfEff_exercise_15_1', 0.4571867638156217),
('SelfEff_exercise_4_5', 'SelfEff_exercise_8_2', 0.45480189212367256),
('fairlyactiveminutes', 'sdrate', 0.45393586003552844),
('club7rc_5', 'HealthSatisfactionScale_1', 0.4370084816842345),
('club7rc_5', 'STAITraitTotal_1', 0.4369706217187404),
('club7rc_5', 'SelfEff_exercise_2_1', 0.4366539152846289),
('fatburnmins', 'steps', 0.4336378170574076),
('cardiomins', 'veryactiveminutes', 0.432529237283147),
('lightlyactiveminutes', 'complypercent', 0.42783671864101896),
('club7rc_5', 'SelfEff_exercise_9_1', 0.41635834982134223),
('club7rc_5', 'physicalactivity4_1', 0.4052891583291527),
('fatburncal', 'lightlyactiveminutes', 0.4038976459402531),
('lowrangemins', 'meanrate', 0.40130422373304436),


In [None]:
for column in activity_survey:
    print(activity_survey)

In [20]:

activity_survey = activity_survey.select_dtypes(exclude=['object'])
normalized_activity_survey = (activity_survey-activity_survey.mean())/activity_survey.std()


  normalized_activity_survey = (activity_survey-activity_survey.mean())/activity_survey.std()


In [18]:

cov_matrix = normalized_activity_survey.cov()
cov_matrix

Unnamed: 0,AdditionalHealthIssue_1,Agreeableness_1,BDI10_1_2,BDI10_1_7,BDI10_2_2,BDI10_2_7,BDI10_3_2,BDI10_3_7,BDI10_4_2,BDI10_4_7,...,usebeer_2,usecaffine_2,usedrugs_2,usedrugs_prescr_2,usetobacco_2,usewine_2,veryactiveminutes,visiting_2,volunteer_2,yourelig_1
AdditionalHealthIssue_1,1.000000,0.049947,-0.095409,0.005773,0.090726,-0.036681,-0.024725,0.089516,0.053766,-0.038991,...,-0.064490,-0.093305,0.013440,0.029433,-0.175404,-0.025859,-0.011261,0.055833,-0.082523,-0.068575
Agreeableness_1,0.049947,1.000000,0.002907,0.099719,0.015123,-0.004071,0.036426,0.115998,0.033696,0.010956,...,-0.000726,-0.020012,-0.054940,0.005067,0.078002,0.091892,0.001906,-0.069970,-0.049568,-0.074009
BDI10_1_2,-0.095409,0.002907,1.000000,0.055326,-0.813085,-0.096064,-0.184629,-0.141826,-0.311409,-0.158334,...,-0.116664,0.041593,-0.020048,0.037495,0.013601,-0.133286,0.067106,-0.020853,-0.021254,-0.146204
BDI10_1_7,0.005773,0.099719,0.055326,1.000000,0.017235,-0.383581,-0.039997,-0.113513,-0.071592,-0.020427,...,-0.084113,-0.053946,0.094038,-0.046847,0.021714,0.018899,0.051533,-0.015187,0.090637,0.003966
BDI10_2_2,0.090726,0.015123,-0.813085,0.017235,1.000000,0.042637,0.120839,0.073031,-0.002838,0.113634,...,0.109906,-0.053963,0.007430,-0.037250,-0.035011,0.128444,-0.033150,-0.000650,-0.014600,0.113622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
usewine_2,-0.025859,0.091892,-0.133286,0.018899,0.128444,-0.032393,0.092147,0.055093,0.041893,0.097848,...,0.650723,0.017465,0.112203,-0.031118,0.031236,1.000000,-0.029681,-0.060542,-0.076126,0.030630
veryactiveminutes,-0.011261,0.001906,0.067106,0.051533,-0.033150,-0.048887,-0.026436,-0.044307,-0.026709,0.004130,...,-0.053635,0.021991,0.001661,-0.016081,-0.039446,-0.029681,1.000000,0.042531,0.035360,-0.033332
visiting_2,0.055833,-0.069970,-0.020853,-0.015187,-0.000650,-0.038031,0.007200,0.088585,-0.006411,-0.052341,...,-0.058464,0.046393,0.002865,-0.072533,0.012088,-0.060542,0.042531,1.000000,0.000108,0.110076
volunteer_2,-0.082523,-0.049568,-0.021254,0.090637,-0.014600,-0.002767,0.006548,-0.135606,0.099282,0.114874,...,-0.073059,-0.067991,-0.043698,-0.113110,-0.046321,-0.076126,0.035360,0.000108,1.000000,0.002838


In [19]:
from numpy.linalg import eig

us_egnvalues, us_egnvectors = eig(cov_matrix)

total_egnvalues = sum(us_egnvalues)

var_exp = [(i/total_egnvalues) for i in sorted(us_egnvalues, reverse=True)]

LinAlgError: Array must not contain infs or NaNs

In [None]:
cov_dict = {}
for index, column in enumerate(activity_survey.columns):
    cov_dict[column] = us_egnvalues[index]
    print(us_egnvalues[index], '\t', column)