In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from os.path import exists
from os import remove
import json
from sklearn.model_selection import train_test_split
import sklearn.model_selection as ms
import sklearn.metrics as sm
from sklearn.model_selection import cross_validate
import sklearn.linear_model as lm
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import GradientBoostingClassifier as grad
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
import scipy.stats as stats
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import requests

import sys
sys.path.insert(0, "../src/")
import util as util

%autosave 5

# Grab datingFull.csv

In [None]:
# Column metadata (read later via its "nonBinaryCategoricalList" entry).
with open('../data/processedData/columnDataDictionary.json') as dictionaryFile:
    columnDataDictionary = json.load(dictionaryFile)

# Prefer the dashboard copy of datingFull when it exists (a later cell of this
# notebook writes it); otherwise start from the processed copy.
dashboardCopyPath = '../data/plotlyDashData/datingFull.csv'
datingFullPath = dashboardCopyPath if exists(dashboardCopyPath) else '../data/processedData/datingFull.csv'
datingFull = pd.read_csv(datingFullPath)

# Stringify specified columns

In [None]:
# Rebind datingFull to the output of util.stringifyCategoricalColumns.
# NOTE(review): presumably this casts the categorical columns to str; the
# helper lives in ../src/util.py and is not visible here — confirm.
datingFull = util.stringifyCategoricalColumns(datingFull)

# Ensure datingFull has locations

In [None]:
# Only call util.getLocations when at least one coordinate column is absent.
hasCoordinates = {"lats", "lons"}.issubset(datingFull.columns)
if not hasCoordinates:
    datingFull = util.getLocations(datingFull, True)

# Join to partner

In [None]:
# The id columns are dropped after the join, so their presence signals that
# the join has not been applied to this copy of datingFull yet.
idColumnPresent = any(idCol in datingFull.columns for idCol in ("iid", "pid"))
if idColumnPresent:
    partner = datingFull.copy()
    datingFull = (
        util.joinToPartner(datingFull, partner)
        .drop(columns=["iid","pid","iid_o","pid_o"])
    )

# Fix Ambiguous Scores

In [None]:
# Columns whose name contains "1_s" or "3_s" trigger the ambiguous-score fix.
halfwayChangeColumns = [
    str(col)
    for col in datingFull.columns
    if any(marker in str(col) for marker in ("1_s", "3_s"))
]

if halfwayChangeColumns:
    datingFull = util.fixAmbiguousScores(datingFull)

# Replace NaNs and save

In [None]:
# Fill NaNs in the numeric columns (the "match" column is excluded),
# copy the filled columns back into datingFull, then persist the dashboard copy.
numericDtypes = ['uint8','int64','float64']
datingFullNumerical = util.replaceNansWithTrainingDataValues(
    datingFull.select_dtypes(include=numericDtypes).drop(columns="match")
)
for numericColumn in datingFullNumerical.columns:
    datingFull[numericColumn] = datingFullNumerical[numericColumn]
datingFull.to_csv('../data/plotlyDashData/datingFull.csv', index=False)

# Prepare profile database

In [None]:
# Reuse previously generated fake names when the cache file exists;
# otherwise start with an empty mapping and an empty list of used names.
nameDictionaryPath = '../data/nameDictionary.json'
if exists(nameDictionaryPath):
    with open(nameDictionaryPath) as nameFile:
        nameDictionary = json.load(nameFile)
    uniqueNames = nameDictionary["uniqueNames"]
else:
    nameDictionary = dict()
    uniqueNames = []

# Build (or load from cache) profileData: one row per participant (iid) plus a
# generated display name.
if exists('../data/plotlyDashData/profileData.csv'):
    # Cached result of a previous run of the else-branch below.
    profileData = pd.read_csv('../data/plotlyDashData/profileData.csv')
    profileData = util.stringifyCategoricalColumns(profileData)

else:
    preprofileData = pd.read_csv('../data/encoded-SpeedDatingData-WithLocations.csv')
    profileList = ["iid","gender","age","race","field_cd","order","round","undergra","mn_sat","tuition","imprace","imprelig","from","zipcode","income",
                   "goal","date","go_out","career_c","sports","tvsports","career",
                  "exercise","dining","museums","art","hiking","gaming","clubbing","reading","tv","theater","movies","concerts","music","shopping","yoga",
                  "exphappy","expnum","attr1_1","sinc1_1","intel1_1","fun1_1","shar1_1","attr4_1","sinc4_1","intel4_1","fun4_1","shar4_1",
                  "attr2_1","sinc2_1","intel2_1","fun2_1","shar2_1","attr3_1","sinc3_1","intel3_1","fun3_1","attr5_1","sinc5_1","intel5_1",
                  "fun5_1","match_es","attr1_s","sinc1_s","intel1_s","fun1_s","shar1_s","attr3_s","sinc3_s","intel3_s","fun3_s","lats","lons"]

    preprofileData = preprofileData[profileList]

    preprofileData = util.switchNumbersAndCategoriesFromRawData(preprofileData)

    # Fix ambiguous scores. "order" is saved beforehand and restored afterwards
    # so the values used for row selection below are the pre-fix ones.
    halfwayChangeColumns = [str(col) for col in preprofileData.columns if (("1_s" in str(col)) | ("3_s" in str(col)))]

    if halfwayChangeColumns:
        oldOrders = preprofileData["order"]
        preprofileData = util.fixAmbiguousScores(preprofileData)
        util.halfwayQuestionSanityTest(preprofileData," post-fixAmbiguousScores and pre-saving")
        preprofileData["order"] = oldOrders

    # NaN replacement
    preprofileData = util.replaceNansWithTrainingDataValues(preprofileData)

    # Create one profile row per participant.
    nameApiTimeoutSeconds = 10  # avoid hanging the cell on a stalled request
    uniqueIIDs = list(set(preprofileData["iid"]))
    profileRows = []
    for iid in uniqueIIDs:
        personid = str(iid)

        # Keep a single row for this participant: the first row among those
        # with the highest "order" value. Copy it so that adding the "name"
        # column below does not write into a slice of preprofileData.
        person = preprofileData.loc[preprofileData["iid"]==iid]
        person = person.loc[person["order"]==person["order"].max()]
        person = person.iloc[[0]].copy()

        # Sanity checks (previously implemented by referencing undefined names
        # to force a NameError).
        if not isinstance(person, pd.DataFrame):
            raise TypeError(f"expected a DataFrame for iid {iid}, got {type(person)}")
        if person.shape[1] == 1:
            raise ValueError(f"profile row for iid {iid} collapsed to a single column")

        if personid not in nameDictionary:
            # NOTE(review): assumes gender is still numeric here with 0 meaning
            # female — confirm against util.switchNumbersAndCategoriesFromRawData.
            api_url = "https://api.namefake.com/"
            if person["gender"].any() == 0:
                api_url += "female"
            else:
                api_url += "male"

            # Re-query until the generated name has not been used before.
            name = requests.get(api_url, timeout=nameApiTimeoutSeconds).json()["name"]
            while name in uniqueNames:
                name = requests.get(api_url, timeout=nameApiTimeoutSeconds).json()["name"]

            uniqueNames.append(name)
            nameDictionary[personid] = name

            # Persist right after every new name so an interrupted run does not
            # have to re-query the API for names already fetched.
            nameDictionary["uniqueNames"] = uniqueNames
            with open('../data/nameDictionary.json', 'w') as fp:
                json.dump(nameDictionary, fp)
        else:
            name = nameDictionary[personid]

        person["name"] = name
        profileRows.append(person)

    # Concatenate once instead of growing the frame inside the loop.
    profileData = pd.concat(profileRows)

    # The saved copy omits "order"; the in-memory profileData keeps it.
    profileData.drop("order",axis=1).to_csv('../data/plotlyDashData/profileData.csv',index=False)
    
# Load the dummy-encoded profile table from cache, or derive it from profileData.
partnerDummiesPath = '../data/plotlyDashData/partnerProfileWithDummies.csv'
if not exists(partnerDummiesPath):
    partnerProfileWithDummies = util.addDummies(profileData.copy())
    # no native americans, thus excluded from original dummification method
    partnerProfileWithDummies["race_5.0"] = 0
    partnerProfileWithDummies = util.replaceNansWithTrainingDataValues(partnerProfileWithDummies)
    partnerProfileWithDummies.to_csv(partnerDummiesPath, index=False)
else:
    partnerProfileWithDummies = util.stringifyCategoricalColumns(
        pd.read_csv(partnerDummiesPath)
    )


# Missing column analysis

In [None]:
# Cast every column listed under "nonBinaryCategoricalList" to str, element by element.
colList = [col for col in partnerProfileWithDummies.columns]
nonBinaryCategoricals = columnDataDictionary["nonBinaryCategoricalList"]
for categoricalCol in (c for c in colList if c in nonBinaryCategoricals):
    partnerProfileWithDummies[categoricalCol] = partnerProfileWithDummies[categoricalCol].apply(str)

In [None]:
# Report which columns required by datingTrain.csv cannot be produced from the
# profile table, split into own-profile columns and partner ("_o") columns.

# Every profile column is available twice: as-is, and with the "_o" suffix
# for the partner's copy.
columnsCollected = [str(col) for col in partnerProfileWithDummies.columns] + [str(col)+"_o" for col in partnerProfileWithDummies.columns]
unneccesaryColumns = ["samerace","match"]
necessaryColumns = pd.read_csv('../data/plotlyDashData/datingTrain.csv').drop(unneccesaryColumns,axis=1).columns

collectedLookup = set(columnsCollected)
missingColumns = [necCol for necCol in necessaryColumns if necCol not in collectedLookup]
missingLookup = set(missingColumns)

# Partner columns are identified by the "_o" SUFFIX. A substring test would
# misclassify own-profile columns such as "go_out", which contains "_o".
print("from profile data")
for col in datingFull.columns:
    if col in missingLookup and not str(col).endswith("_o"):
        print(col)

print("\nfrom partner data")
for col in datingFull.columns:
    if col in missingLookup and str(col).endswith("_o"):
        print(col)

# Create Description Dictionary

In [None]:
# Build the column-description dictionary interactively when the dashboard copy
# does not exist yet, then load and print it.
if not exists("../data/plotlyDashData/descriptionDictionary.json"):
    # Start from previously entered descriptions, if any.
    descriptionDictionary = dict()
    if exists("../data/descriptionDictionary.json"):
        with open("../data/descriptionDictionary.json") as d:
            descriptionDictionary = json.load(d)

    fullColumns = list(datingFull.columns)
    profileColumns = list(profileData.columns)
    columnsNeedingDescriptions = fullColumns + profileColumns
    for col in columnsNeedingDescriptions:
        strcol = str(col)
        if strcol not in descriptionDictionary:
            # Show which column is being described before prompting
            # (previously printed an undefined name `s`, raising NameError).
            print(strcol)
            desc = input()
            descriptionDictionary[strcol] = desc

    # Write both the dashboard copy and the copy reused as a seed on later runs.
    with open("../data/plotlyDashData/descriptionDictionary.json","w") as fp:
        json.dump(descriptionDictionary,fp)
    with open("../data/descriptionDictionary.json","w") as fp:
        json.dump(descriptionDictionary,fp)

with open("../data/plotlyDashData/descriptionDictionary.json") as d:
    descriptionDictionary = json.load(d)

for k in descriptionDictionary.keys():
    print(f"{k}: {descriptionDictionary[k]}")