In [None]:
import json
import math
import os
import shutil
import sys
from os.path import exists

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import sklearn.linear_model as lm

sys.path.insert(0, "../src/")
import util as util

%autosave 5

In [None]:
# Load the dataset, preferring the cached CSV that already went through
# util.getLocations. When the cache is missing, build it from the base CSV
# and write it out so later runs can skip the util.getLocations call.
locationsCachePath = '../data/encoded-SpeedDatingData-WithLocations.csv'
if not exists(locationsCachePath):
    fullDatingDatabase = util.getLocations(pd.read_csv("../data/encoded-SpeedDatingData.csv"), True)
    fullDatingDatabase.to_csv(locationsCachePath, index=False)
else:
    fullDatingDatabase = pd.read_csv(locationsCachePath)

In [None]:
# Column groupings used throughout the notebook (and exported to JSON below).

# Participant / partner identifier columns.
identityList = ["iid", "pid"]

# Activity-interest columns; these occupy one contiguous run inside columnList.
interestColumns = [
    "sports", "tvsports", "exercise", "dining", "museums", "art", "hiking", "gaming", "clubbing",
    "reading", "tv", "theater", "movies", "concerts", "music", "shopping", "yoga",
]

# Every column pulled from the full dataset, in the order it is analysed.
columnList = [
    "iid", "gender", "round", "order", "pid", "match", "samerace", "age", "field_cd",
    "undergra", "mn_sat", "tuition", "race", "imprace", "imprelig", "from", "zipcode", "income",
    "goal", "date", "go_out", "career_c",
    *interestColumns,
    "exphappy", "expnum",
    "attr1_1", "sinc1_1", "intel1_1", "fun1_1", "shar1_1",
    "attr4_1", "sinc4_1", "intel4_1", "fun4_1", "shar4_1",
    "attr2_1", "sinc2_1", "intel2_1", "fun2_1", "shar2_1",
    "attr3_1", "sinc3_1", "intel3_1", "fun3_1",
    "attr5_1", "sinc5_1", "intel5_1", "fun5_1",
    "match_es",
    "attr1_s", "sinc1_s", "intel1_s", "fun1_s", "shar1_s",
    "attr3_s", "sinc3_s", "intel3_s", "fun3_s",
    "lats", "lons", "int_corr",
]

# Categorical columns with more than two levels.
# NOTE(review): "zipcode_o" is not in columnList — presumably a partner-side column created later; confirm.
nonBinaryCategoricalList = ["field_cd", "undergra", "race", "zipcode", "goal", "date", "go_out", "career_c", "zipcode_o"]

# Columns named as needing string -> float conversion.
stringToFloatList = ["tuition", "mn_sat", "income"]

# NOTE(review): "partnerDistance" is not in columnList either — presumably derived elsewhere; confirm.
sharedList = ["match", "samerace", "partnerDistance", "int_corr"]

# Everything in columnList except the identifiers and the "match" outcome.
partnerList = [name for name in columnList if name not in (*identityList, "match")]

# Column-name fragments; the same fragments are matched by the column summary loop further down.
pointDistributionList = ["pf_o", "1_1", "4_1", "2_1", "1_s"]

In [None]:
# Bundle the column groupings into one dictionary and persist it as JSON in two
# places: next to the raw data and inside a freshly recreated processedData
# directory. identityList is not part of the exported dictionary.
columnDataDictionary = {"columnList": columnList,
                        "nonBinaryCategoricalList": nonBinaryCategoricalList,
                        "stringToFloatList": stringToFloatList,
                        "pointDistributionList": pointDistributionList,
                        "sharedList":sharedList,
                        "partnerList": partnerList,
                        "interestColumns":interestColumns
                       }

with open('../data/columnDataDictionary.json', 'w') as fp:
    json.dump(columnDataDictionary, fp)

# Recreate processedData from scratch. Pure-Python replacement for
# `!rm -r` / `!mkdir`: works on any OS and does not complain when the
# directory does not exist yet (e.g. on a fresh checkout).
processedDataDir = '../data/processedData'
if os.path.isdir(processedDataDir):
    shutil.rmtree(processedDataDir)
os.makedirs(processedDataDir)

with open(os.path.join(processedDataDir, 'columnDataDictionary.json'), 'w') as fp:
    json.dump(columnDataDictionary, fp)

In [None]:
# Keep only the analysed columns plus 'wave' (later cells split the data on it),
# then hand the subset to util.switchNumbersAndCategoriesFromRawData.
selectedColumns = [*columnList, 'wave']
blindDateData = util.switchNumbersAndCategoriesFromRawData(fullDatingDatabase[selectedColumns])

In [None]:
# Partition the rows into waves 6 through 9 (inclusive) and all other waves.
# NOTE(review): presumably waves 6-9 used a different rating scheme — confirm against the data key.
waveNumber = blindDateData['wave'].astype(int)
isWave6to9 = (waveNumber >= 6) & (waveNumber <= 9)
blindDateDataWave69 = blindDateData[isWave6to9]
blindDateDataWaveNot69 = blindDateData[~isWave6to9]

In [None]:
def _showHistogram(values, title=None):
    """Draw a histogram of `values` in its own figure, optionally with a title."""
    fig, ax = plt.subplots()
    ax.hist(values)
    if title is not None:
        ax.set_title(title)
    plt.show()
    plt.close(fig)


# Per-column overview: dtype, share of missing values, and either the list of
# distinct values (small domains) or mean +/- std with a histogram.
total = blindDateData.shape[0]
# Columns ending in one of these suffixes (or containing "pf_o") are summarised
# separately for waves 6-9 and for the remaining waves.
waveSplitSuffixes = ("1_1", "4_1", "2_1", "1_s")
for col in columnList:
    colData = blindDateData[col]

    print(f'{col} {colData.dtype}')
    nanCount = sum(util.isNan(x) for x in colData)
    print(f'{100 * nanCount/total}% of data is nan. {nanCount} out of {total}')
    # Series.unique() collapses all missing values into a single entry.
    # set(colData) kept every NaN as its own element (NaN != NaN), which
    # inflated the distinct-value count and pushed sparse low-cardinality
    # columns past the 25-value threshold below.
    dataList = colData.unique().tolist()

    if len(dataList)<=25:
        print(dataList)
    elif str(col) == 'met':
        print(colData.value_counts())
    elif colData.dtype == "O":
        print(f"data varries across {len(dataList)} values")
    elif "pf_o" in str(col) or col.endswith(waveSplitSuffixes):
        waveGroups = ((blindDateDataWave69[col], 'Waves 6 - 9'),
                      (blindDateDataWaveNot69[col], 'Excluding Waves 6 - 9'))
        for waveData, waveTitle in waveGroups:
            print(f"{waveData.mean()} +/- {waveData.std()}")
            _showHistogram(waveData, waveTitle)
    else:
        print(f"{colData.mean()} +/- {colData.std()}")
        _showHistogram(colData)
    print("\n")

In [None]:
# Split the rows by the value of the 'gender' column (0 vs 1).
genderCode = blindDateData['gender']
gender0 = blindDateData.loc[genderCode == 0]
gender1 = blindDateData.loc[genderCode == 1]

In [None]:
# Draw up to 40 distinct row positions from gender0 for the correlation check.
# The generator is seeded so a fresh "Restart & Run All" selects the same rows;
# the regression coefficients written to disk further down depend on this sample.
# The size is capped so the draw cannot fail when gender0 has fewer than 40 rows.
SAMPLE_SEED = 0
sampleSize = min(40, gender0.shape[0])
selectedIndecies = np.random.default_rng(SAMPLE_SEED).choice(gender0.shape[0], size=sampleSize, replace=False)

In [None]:
def _scalar(row, column):
    """Return the value in `column` of a one-row DataFrame as a plain float."""
    # .iloc[0] avoids float(<one-element Series>), which pandas has deprecated.
    return float(row[column].iloc[0])


def _meanRelativeError(estimated, recorded):
    """Mean of (estimated - recorded) / recorded over entries whose recorded value is non-zero.

    Returns nan when no entry has a non-zero recorded value.
    """
    estimated = np.asarray(estimated, dtype=float)
    recorded = np.asarray(recorded, dtype=float)
    usable = recorded != 0  # a zero reference would divide by zero
    if not usable.any():
        return float("nan")
    return float(np.mean((estimated[usable] - recorded[usable]) / recorded[usable]))


# For each sampled gender0 row, find the matching gender1 row of the same date,
# recompute the interest correlation with Pearson and Spearman, and compare both
# against the dataset's recorded 'int_corr'.
recordedCorr = []
pearsonCorr = []
spearmanCorr = []
for i in selectedIndecies:
    candidate = gender0.iloc[[i]]
    candidateId = _scalar(candidate, 'iid')
    partnerId = _scalar(candidate, 'pid')
    partner = gender1[(gender1['iid'] == partnerId) & (gender1['pid'] == candidateId)]
    recordedValue = _scalar(candidate, 'int_corr')
    # Skip dates without exactly one partner row, or without a usable recorded correlation.
    if partner.shape[0] != 1 or not np.isfinite(recordedValue):
        continue

    candidateInterests = []
    partnerInterests = []
    for col in interestColumns:
        candidateScore = _scalar(candidate, col)
        partnerScore = _scalar(partner, col)
        # Only interests rated by both people take part in the correlation.
        if not util.isNan(candidateScore) and not util.isNan(partnerScore):
            candidateInterests.append(candidateScore)
            partnerInterests.append(partnerScore)
    if len(candidateInterests) < 2:
        continue

    pearsonValue = stats.pearsonr(np.array(candidateInterests),np.array(partnerInterests))[0]
    spearmanValue = stats.spearmanr(np.array(candidateInterests),np.array(partnerInterests))[0]
    # A constant score vector yields an undefined (nan) correlation; drop such pairs
    # so they cannot poison the error means or the regression fitted later.
    if not (np.isfinite(pearsonValue) and np.isfinite(spearmanValue)):
        continue

    # Store plain floats. Appending the one-element Series made np.array(recordedCorr)
    # two-dimensional, so the error computation silently broadcast to an n x n matrix.
    recordedCorr.append(recordedValue)
    pearsonCorr.append(float(pearsonValue))
    spearmanCorr.append(float(spearmanValue))

pearsonError = _meanRelativeError(pearsonCorr, recordedCorr)
spearmanError = _meanRelativeError(spearmanCorr, recordedCorr)
print(f'Pearson Error {pearsonError}')
print(f'Spearman Error {spearmanError}')

In [None]:
# Turn the three result lists into (n, 1) column arrays, then plot each
# recomputed correlation against the recorded one via util.plotCorrelation.
recordedArray, pearsonArray, spearmanArray = (
    np.reshape(np.array(values), (-1, 1))
    for values in (recordedCorr, pearsonCorr, spearmanCorr)
)

for methodName, methodArray in (('pearson', pearsonArray), ('spearman', spearmanArray)):
    util.plotCorrelation(methodArray, recordedArray, methodName)

In [None]:
# Pearson has more accuracy, so a linear map from the recomputed Pearson value
# to the recorded correlation is fitted and its slope ("m") and intercept ("b")
# are saved as JSON.
model = lm.LinearRegression().fit(pearsonArray, recordedArray)
sharedInterestCoefficientsDictionary = {
    "m": list(model.coef_[0]),
    "b": list(model.intercept_),
}
with open("../data/sharedInterestCoefficientsDictionary.json", "w") as coefficientsFile:
    json.dump(sharedInterestCoefficientsDictionary, coefficientsFile)

In [None]:
# Single call to util.getLocation with the arguments '92069' and 'San Marcos';
# as the cell's last expression, its return value is shown as the cell output.
# NOTE(review): looks like a manual spot check of the location lookup — confirm it is still needed.
util.getLocation('92069','San Marcos')