In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from os.path import exists
from os import remove
import json
from sklearn.model_selection import train_test_split
import sklearn.model_selection as ms
import sklearn.metrics as sm
from sklearn.model_selection import cross_validate
import sklearn.linear_model as lm
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.ensemble import GradientBoostingClassifier as grad
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import VotingClassifier
from sklearn import metrics
import scipy.stats as stats
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import requests

import sys
sys.path.insert(0, "../src/")
import util as util

%autosave 5

# Grab datingFull.csv

In [None]:
# Read the column data dictionary, then load datingFull from the dashboard
# folder when a copy already exists there, otherwise from the processed folder.
with open('../data/processedData/columnDataDictionary.json') as dictionaryFile:
    columnDataDictionary = json.load(dictionaryFile)

datingFullPath = '../data/processedData/datingFull.csv'
if exists('../data/plotlyDashData/datingFull.csv'):
    datingFullPath = '../data/plotlyDashData/datingFull.csv'
datingFull = pd.read_csv(datingFullPath)

# Stringify specified columns

In [None]:
# Convert zipcode, plus every column named in the data dictionary's
# "nonBinaryCategoricalList", to strings element by element.
datingFull['zipcode'] = datingFull['zipcode'].apply(str)
categoricalNames = columnDataDictionary["nonBinaryCategoricalList"]
for columnName in datingFull.columns:
    if columnName not in categoricalNames:
        continue
    datingFull[columnName] = datingFull[columnName].apply(str)

# Ensure datingFull has locations

In [None]:
# Only call util.getLocations when at least one of the two coordinate columns
# is missing from datingFull.
hasCoordinates = "lats" in datingFull.columns and "lons" in datingFull.columns
if not hasCoordinates:
    datingFull = util.getLocations(datingFull, True)

# Fix Ambiguous Scores

In [None]:
# Collect the names of columns containing "1_s" or "3_s"; util.fixAmbiguousScores
# is only applied when at least one such column is present.
halfwayChangeColumns = []
for scoreCol in datingFull.columns:
    scoreColName = str(scoreCol)
    if "1_s" in scoreColName or "3_s" in scoreColName:
        halfwayChangeColumns.append(scoreColName)

if halfwayChangeColumns:
    datingFull = util.fixAmbiguousScores(datingFull)

# Replace NaNs and save

In [None]:
# Take the uint8/int64/float64 columns except the "match" column, pass them
# through util.replaceNansWithTrainingDataValues, write the returned columns
# back into datingFull, and save the frame for the dashboard.
numericalDtypes = ['uint8','int64','float64']
datingFullNumerical = datingFull.select_dtypes(include=numericalDtypes)
datingFullNumerical = datingFullNumerical.drop("match",axis=1)
datingFullNumerical = util.replaceNansWithTrainingDataValues(datingFullNumerical)
for numericalCol in datingFullNumerical.columns:
    datingFull[numericalCol] = datingFullNumerical[numericalCol]
datingFull.to_csv('../data/plotlyDashData/datingFull.csv',index=False)

# Prepare profile database

In [None]:
def _fetchFakeName(apiUrl, timeoutSeconds=30):
    """Return the "name" field of the JSON document served at apiUrl.

    A timeout is passed so a stalled request cannot hang the notebook, and an
    HTTP error status raises instead of surfacing later as a JSON/key error.
    """
    response = requests.get(apiUrl, timeout=timeoutSeconds)
    response.raise_for_status()
    return response.json()["name"]


# Load the two profile tables from the dashboard folder when both exist;
# otherwise rebuild them (one generated name per person) and save them there.
if exists('../data/plotlyDashData/profileData.csv') and exists('../data/plotlyDashData/partnerProfileWithDummies.csv'):
    profileData = pd.read_csv('../data/plotlyDashData/profileData.csv')
    partnerProfileWithDummies = pd.read_csv('../data/plotlyDashData/partnerProfileWithDummies.csv')
else:
    preprofileData = pd.read_csv('../data/encoded-SpeedDatingData-WithLocations.csv')
    profileList = ["iid","gender","age","field_cd","order","undergra","mn_sat","tuition","race","imprace","imprelig","from","zipcode","income",
                   "goal","date","go_out","career_c","sports","tvsports","career",
                  "exercise","dining","museums","art","hiking","gaming","clubbing","reading","tv","theater","movies","concerts","music","shopping","yoga",
                  "exphappy","expnum","attr1_1","sinc1_1","intel1_1","fun1_1","shar1_1","attr4_1","sinc4_1","intel4_1","fun4_1","shar4_1",
                  "attr2_1","sinc2_1","intel2_1","fun2_1","shar2_1","attr3_1","sinc3_1","intel3_1","fun3_1","attr5_1","sinc5_1","intel5_1",
                  "fun5_1","match_es","attr1_s","sinc1_s","intel1_s","fun1_s","shar1_s","attr3_s","sinc3_s","intel3_s","fun3_s","lats","lons"]

    # Index with the list itself (the literal string "profileList" is not a
    # column), and copy so the assignments below do not write into a slice.
    preprofileData = preprofileData[profileList].copy()

    # Copy over data from datingFull for every profile column it also has.
    for profilecol in profileList:
        if profilecol in datingFull.columns:
            preprofileData[profilecol] = datingFull[profilecol]

    # Create one profile per iid from the row(s) with that person's highest
    # "order" value, and attach a name that no earlier profile has received.
    uniqueIIDs = list(set(preprofileData["iid"]))
    uniqueNames = []
    profileRows = []
    for iid in uniqueIIDs:
        person = preprofileData[preprofileData["iid"] == iid]
        person = person[person["order"] == person["order"].max()]
        if person.empty:
            # No row equals the max (e.g. every "order" is NaN): nothing to profile.
            continue
        person = person.drop("order",axis = 1)

        # Compare a scalar, not the whole Series: a Series has no single truth
        # value. Gender code 0 selects the female endpoint, anything else male.
        api_url = "https://api.namefake.com/"
        if person["gender"].iloc[0] == 0:
            api_url += "female"
        else:
            api_url += "male"

        # Keep requesting until the service returns a name not used so far.
        name = _fetchFakeName(api_url)
        while name in uniqueNames:
            name = _fetchFakeName(api_url)
        uniqueNames.append(name)
        profileRows.append(person.assign(name=name))

    # Concatenate once at the end instead of growing the frame inside the loop.
    profileData = pd.concat(profileRows,axis=0)
    profileData.to_csv('../data/plotlyDashData/profileData.csv',index=False)

    # NOTE(review): profileCategoricalDataColumns was never defined in the
    # visible notebook, so this cell failed on a fresh kernel. It is derived
    # here from the data dictionary's "nonBinaryCategoricalList" restricted to
    # the profile columns -- confirm this is the intended column set.
    profileCategoricalDataColumns = [col for col in profileData.columns
                                     if col in columnDataDictionary["nonBinaryCategoricalList"]]

    # Append indicator columns for each categorical column with at most
    # maxDummyLevels levels. Each dummy frame is added exactly once; the
    # previous inner loop concatenated it once per dummy column, duplicating
    # every indicator column.
    maxDummyLevels = 25
    dummyFrames = [profileData]
    for col in profileCategoricalDataColumns:
        dummyData = pd.get_dummies(profileData[col],prefix=col,drop_first=False)
        if len(dummyData.columns) <= maxDummyLevels:
            dummyFrames.append(dummyData)
    partnerProfileWithDummies = pd.concat(dummyFrames,axis=1)
    partnerProfileWithDummies.to_csv('../data/plotlyDashData/partnerProfileWithDummies.csv',index=False)


# Missing column analysis

In [None]:
# Print every datingFull column that is neither a profileData column nor a
# partnerProfileWithDummies column name with "_o" appended.
# The Index is converted to a list first: `Index + list` is element-wise
# addition, not concatenation, so it would not produce the combined name list.
columnsCollected = list(profileData.columns) + [str(col)+"_o" for col in partnerProfileWithDummies.columns]

for datingFullCol in datingFull.columns:
    if datingFullCol not in columnsCollected:
        # Call form of print: valid on both Python 2 and Python 3.
        print(datingFullCol)