In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from os.path import exists
from os import remove
import json
from sklearn.model_selection import train_test_split
import sklearn.model_selection as ms
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import sys
sys.path.insert(0, "../util/")
import util as util

%autosave 5

Autosaving every 5 seconds


# Adding dummies and train-test-split

In [2]:
datingTrainWithDummiesExists = exists('../data/processedData/datingTrainWithDummies.csv')
datingTestWithDummiesExists = exists('../data/processedData/datingTestWithDummies.csv')
datingTrainWithoutDummiesExists = exists('../data/processedData/datingTrainWithoutDummies.csv')
datingTestWithoutDummiesExists = exists('../data/processedData/datingTestWithoutDummies.csv')
datingFullWithDummiesExists = exists('../data/processedData/datingFullWithDummies.csv')
datingFullWithoutDummiesExists = exists('../data/processedData/datingFullWithoutDummies.csv')

relatedDummiesDictionaryExists = exists('../data/processedData/relatedDummiesDictionary.json')

In [3]:
with open('../data/processedData/columnDataDictionary.json') as d:
    columnDataDictionary = json.load(d)
columnList = columnDataDictionary['columnList']
nonBinaryCategoricalList = columnDataDictionary['nonBinaryCategoricalList']
stringToFloatList = columnDataDictionary['stringToFloatList']
pointDistributionList = columnDataDictionary['pointDistributionList']
partnerList = columnDataDictionary['partnerList']

In [4]:
if (datingTrainWithDummiesExists and datingTestWithDummiesExists and datingFullWithDummiesExists and
    datingTrainWithoutDummiesExists and datingTestWithoutDummiesExists and datingFullWithoutDummiesExists and
    relatedDummiesDictionaryExists):
    datingTrainWithDummies = pd.read_csv('../data/processedData/datingTrainWithDummies.csv')
    datingTestWithDummies = pd.read_csv('../data/processedData/datingTestWithDummies.csv')
    datingTrainWithoutDummies = pd.read_csv('../data/processedData/datingTrainWithoutDummies.csv')
    datingTestWithoutDummies = pd.read_csv('../data/processedData/datingTestWithoutDummies.csv')
    datingFullWithDummies = pd.read_csv('../data/processedData/datingFullWithDummies.csv')
    datingFullWithoutDummies = pd.read_csv('../data/processedData/datingFullWithoutDummies.csv')
    
    with open('../data/processedData/relatedDummiesDictionary.json') as d:
        relatedDummiesDictionary = json.load(d)
    for df in [datingTrainWithDummies,datingTestWithDummies,datingFullWithDummies,datingTrainWithoutDummies,datingTestWithoutDummies,datingFullWithoutDummies]:
        df['zipcode'] = df['zipcode'].apply(str)
        if 'zipcode_o' in list(df.columns):
            df['zipcode_o'] = df['zipcode_o'].apply(str)
        for col in nonBinaryCategoricalList:
            if col in list(df.columns):
                df[col] = df[col].apply(str)
        
else:
    !rm -r ../data/processedData
    !mkdir ../data/processedData
    
    datingData = pd.read_csv('../data/encoded-SpeedDatingData.csv')
    blindDateData = datingData[columnList]
    
    for col in stringToFloatList:
        blindDateData[col] = blindDateData[col].str.replace(',', '').astype(float)
    blindDateData['zipcode'] = blindDateData['zipcode'].str.replace(',', '')
    for col in nonBinaryCategoricalList:
        blindDateData[col] = blindDateData[col].apply(str)
    
    blindDateCategoricalData = blindDateData.select_dtypes(include=['O'])
    for col in blindDateCategoricalData.columns:
        blindDateData[col]=blindDateData[col].fillna('nan')
    relatedDummiesDictionary = {}
    for col in blindDateCategoricalData.columns:
        dummyData = pd.get_dummies(blindDateData[col],prefix=col,drop_first=True)
        for dummyCol in dummyData.columns:
            relatedDummiesDictionary[str(dummyCol)] = list(dummyData.columns)
            if col in partnerList:
                partnerList.append(str(dummyCol))
        blindDateData = pd.concat([blindDateData,dummyData],axis=1)
    with open('../data/processedData/relatedDummiesDictionary.json', 'w') as fp:
        json.dump(relatedDummiesDictionary, fp)
        
    partnerList = list(set(partnerList))
    columnDataDictionary = {"columnList": columnList,
                        "nonBinaryCategoricalList": nonBinaryCategoricalList,
                        "stringToFloatList": stringToFloatList,
                        "pointDistributionList": pointDistributionList,
                        "partnerList": partnerList}

    with open('../data/processedData/columnDataDictionary.json', 'w') as fp:
            json.dump(columnDataDictionary, fp)
    
    datingFullWithDummies = blindDateData.copy()
    match = datingFullWithDummies['match']
    X = datingFullWithDummies.drop(['match'], axis=1)
    
    datingTrainWithDummies, datingTestWithDummies, matchTrain, matchTest = train_test_split(X, match, test_size=0.2)
    
    datingTrainWithDummies['match'] = matchTrain
    datingTestWithDummies['match'] = matchTest
    
    datingTrainWithDummies.to_csv('../data/processedData/datingTrainWithDummies.csv',index=False)
    datingTestWithDummies.to_csv('../data/processedData/datingTestWithDummies.csv',index=False)
    datingFullWithDummies.to_csv('../data/processedData/datingFullWithDummies.csv',index=False)
    
    dummyColumns = list(relatedDummiesDictionary.keys())
    datingTrainWithoutDummies = datingTrainWithDummies.drop(dummyColumns, axis=1)
    datingTestWithoutDummies = datingTestWithDummies.drop(dummyColumns, axis=1)
    datingFullWithoutDummies = datingFullWithDummies.drop(dummyColumns, axis=1)
    
    datingTrainWithoutDummies.to_csv('../data/processedData/datingTrainWithoutDummies.csv',index=False)
    datingTestWithoutDummies.to_csv('../data/processedData/datingTestWithoutDummies.csv',index=False)
    datingFullWithoutDummies.to_csv('../data/processedData/datingFullWithoutDummies.csv',index=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Data Cleaning

### Join partner

In [5]:
for partnerCol in partnerList:
    if (("_o" not in str(partnerCol)) and (partnerCol+"_o" not in datingTrainWithoutDummies.columns) and (partnerCol+"_o" not in datingTrainWithDummies.columns)):
        partnerWithDummies = datingFullWithDummies.copy()
        partnerWithoutDummies = datingFullWithoutDummies.copy()
        datingTrainWithDummies = util.joinToPartner(datingTrainWithDummies,partnerWithDummies)
        datingTrainWithoutDummies = util.joinToPartner(datingTrainWithoutDummies,partnerWithoutDummies)
        datingTrainWithDummies.to_csv('../data/processedData/datingTrainWithDummies.csv',index=False)
        datingTrainWithoutDummies.to_csv('../data/processedData/datingTrainWithoutDummies.csv',index=False)
        break

### Get distance

In [6]:
if (("partnerDistance" not in datingTrainWithoutDummies.columns) or 
    ("partnerDistance" not in datingTrainWithDummies.columns)):
    datingTrainWithoutDummies = util.returnDFWithpartnerDistance(datingTrainWithoutDummies,"train",True)
    locationColumns = ["lats","lons","lats_o","lons_o","partnerDistance"]
    for locationCol in locationColumns:
        datingTrainWithDummies[locationCol] = datingTrainWithoutDummies[locationCol]
    datingTrainWithDummies.to_csv('../data/processedData/datingTrainWithDummies.csv',index=False)
    datingTrainWithoutDummies.to_csv('../data/processedData/datingTrainWithoutDummies.csv',index=False)

### Fix ambiguous scores

In [7]:
halfwayChangeColumnsFromWithDummies = [str(col) for col in datingTrainWithDummies.columns if (("1_s" in str(col)) | ("3_s" in str(col)))]
halfwayChangeColumnsFromWithoutDummies = [str(col) for col in datingTrainWithoutDummies.columns if (("1_s" in str(col)) | ("3_s" in str(col)))]

if(len(halfwayChangeColumnsFromWithDummies) > 0):
    for question in pointDistributionList:
        questionSumString = f'{question}_sum'
        questionCols = [str(col) for col in datingTrainWithDummies if question in str(col)]
        datingTrainWithDummies[questionSumString] = datingTrainWithDummies[questionCols].sum(axis = 1)
        datingTrainWithoutDummies[questionSumString] = datingTrainWithoutDummies[questionCols].sum(axis = 1)
        for questionCol in questionCols:
            questionValues = [np.nan] * datingTrainWithDummies.shape[0]
            for rowindex in range(datingTrainWithDummies.shape[0]):
                row = datingTrainWithDummies.iloc[rowindex]
                if util.isNan(row[questionSumString]) | (row[questionSumString] == 0):
                    questionValues[rowindex] = row[str(questionCol)]
                else:
                    row[str(questionCol)] * 100 / row[questionSumString]
            datingTrainWithDummies[str(questionCol)] = pd.Series(questionValues)
            datingTrainWithoutDummies[str(questionCol)] = pd.Series(questionValues)
            datingTrainWithDummies = datingTrainWithDummies.drop(questionSumString,axis=1)
            datingTrainWithoutDummies = datingTrainWithoutDummies.drop(questionSumString,axis=1)
    for halfwayQuestion in halfwayChangeColumnsFromWithoutDummies:
        targetQuestion = ""
        if ("1_s" in halfwayQuestion):
            if halfwayQuestion == "attr1_s_o":
                targetQuestion = "pf_o_att"
            elif halfwayQuestion == "sinc1_s_o":
                targetQuestion = "pf_o_sin"
            elif halfwayQuestion == "intel1_s_o":
                targetQuestion = "pf_o_int"
            elif halfwayQuestion == "fun1_s_o":
                targetQuestion = "pf_o_fun"
            elif halfwayQuestion == "amb1_s_o":
                targetQuestion = "pf_o_amb"
            elif halfwayQuestion == "shar1_s_o":
                targetQuestion = "pf_o_sha"
            else:
                targetQuestion = halfwayQuestion.replace("1_s","1_1")
        else:
            targetQuestion = halfwayQuestion.replace("3_s","3_1")
        currentMindsetAnswers = []
        for rowindex in range(datingTrainWithDummies.shape[0]):
            row = datingTrainWithDummies.iloc[rowindex]
            if util.isNan(row[halfwayQuestion]) | (row["order"] <= int(row["round"])):
                currentMindsetAnswers.append(row[targetQuestion])
            else:
                currentMindsetAnswers.append(row[halfwayQuestion])
        datingTrainWithDummies[targetQuestion] = pd.Series(currentMindsetAnswers)
        datingTrainWithoutDummies[targetQuestion] = pd.Series(currentMindsetAnswers)
        datingTrainWithDummies = datingTrainWithDummies = datingTrainWithDummies.drop(halfwayQuestion,axis = 1)
        datingTrainWithoutDummies = datingTrainWithoutDummies.drop(halfwayQuestion,axis = 1)
    datingTrainWithDummies.to_csv('../data/processedData/datingTrainWithDummies.csv',index=False)
    datingTrainWithoutDummies.to_csv('../data/processedData/datingTrainWithoutDummies.csv',index=False)

KeyError: 'pf_o_sum'

### Replace Nans

In [None]:
datingTrainNumerical = datingTrainWithDummies.select_dtypes(include=['uint8','int64','float64'])

if (exists('../data/processedData/trainNanReplacementValuesDictionary.json')):
    with open('../data/processedData/trainNanReplacementValuesDictionary.json') as d:
        trainNanReplacementValuesDictionary = json.load(d)
else:
    trainNanReplacementValuesDictionary = {}
    for col in datingTrainNumerical:
        if col in ["int_corr","age","age_o","pf_o_att","pf_o_sin","pf_o_int","pf_o_fun","pf_o_amb","pf_o_sha","attr_o","sinc_o","intel_o","amb_o","fun_o","shar_o",
                  "like_o","prob_o","mn_sat","tuition","income","sports","tvsports","exercise","dining","museums","art","hiking","gaming","reading","tv",
                  "theater","movies","concerts","music","shopping","yoga","exphappy","expnum","attr1_1","sinc1_1","intel1_1","fun1_1","shar1_1","attr4_1","sinc4_1",
                  "intel4_1","fun4_1","shar4_1","attr2_1","sinc2_1","intel2_1","fun2_1","shar2_1","attr3_1","sinc3_1","intel3_1","fun3_1","attr5_1","sinc5_1",
                   "intel5_1","fun5_1","attr","sinc","intel","fun","amb","like","prob","match_es","sports_o","tvsports_o","exercise_o","dining_o","museums_o","art_o",
                  "hiking_o","gaming_o","clubbing_o","reading_o","tv_o","theater_o","movies_o","concerts_o","music_o","shopping_o","yoga_o","exphappy_o","expnum_o",
                   "attr4_1_o","sinc4_1_o","intel4_1_o","fun4_1_o","shar4_1_o","attr2_1_o","sinc2_1_o","intel2_1_o","fun2_1_o","shar2_1_o","attr3_1_o","sinc3_1_o",
                   "intel3_1_o","fun3_1_o","attr5_1_o","sinc5_1_o","intel5_1_o","fun5_1_o","match_es_o","lats","lons","lats_o","lons_o","partnerDistance"]:
            trainNanReplacementValuesDictionary[str(col)] = datingTrainNumerical[col].mean()
        elif col in ["imprace","imprelig","zipcode","goal","date","go_out","career_c","met","imprace_o","imprelig_o","zipcode_o","goal_o","date_o","career_c_o"]:
            trainNanReplacementValuesDictionary[str(col)] = round(np.mean(datingTrainNumerical[col].mode().values))
        else:
            trainNanReplacementValuesDictionary[str(col)] = 0
    with open('../data/processedData/trainNanReplacementValuesDictionary.json', 'w') as fp:
        json.dump(trainNanReplacementValuesDictionary, fp)

In [None]:
datingTrainWithDummies = util.replaceNansWithTrainingDataValues(datingTrainWithDummies)
datingTrainWithoutDummies = util.replaceNansWithTrainingDataValues(datingTrainWithoutDummies)

# Training

## Logistic Regression

## KNN

## Random Forest Classifier

## Boosting

# Individual Testing

## Logistic Regression

## KNN

## Random Forest Classifier

## Boosting

# Ensemble Testing