In [1]:
import json
import os
import shutil
from os import remove
from os.path import exists

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.linear_model as lm
import sklearn.model_selection as ms
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier as grad
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.ensemble import RandomForestRegressor as rf
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier as knn

import sys
sys.path.insert(0, "../util/")
import util as util

%autosave 5

Autosaving every 5 seconds


# Adding dummies and train-test-split

In [2]:
# Probe the processed-data cache in one pass. The four flags tell the
# load-or-rebuild cell whether every cached artefact is available.
(datingTrainExists,
 datingTestExists,
 datingFullExists,
 relatedDummiesDictionaryExists) = map(exists, (
    '../data/processedData/datingTrain.csv',
    '../data/processedData/datingTest.csv',
    '../data/processedData/datingFull.csv',
    '../data/processedData/relatedDummiesDictionary.json',
))

In [3]:
# Prefer the column dictionary saved under processedData (written by the
# preprocessing cell); fall back to the original one shipped with the data.
columnDictionaryPath = '../data/processedData/columnDataDictionary.json'
if not exists(columnDictionaryPath):
    columnDictionaryPath = '../data/columnDataDictionary.json'

with open(columnDictionaryPath) as d:
    columnDataDictionary = json.load(d)

# Unpack the column groups used throughout the notebook.
(columnList,
 nonBinaryCategoricalList,
 stringToFloatList,
 pointDistributionList,
 partnerList) = (
    columnDataDictionary[key]
    for key in ('columnList',
                'nonBinaryCategoricalList',
                'stringToFloatList',
                'pointDistributionList',
                'partnerList')
)

In [4]:
# Load the cached train/test/full splits when all of them exist; otherwise
# rebuild them from the raw CSV (type clean-up, dummy encoding, split) and
# write the cache.
if (datingTrainExists and datingTestExists and datingFullExists and relatedDummiesDictionaryExists):
    datingTrain = pd.read_csv('../data/processedData/datingTrain.csv')
    datingTest = pd.read_csv('../data/processedData/datingTest.csv')
    datingFull = pd.read_csv('../data/processedData/datingFull.csv')

    with open('../data/processedData/relatedDummiesDictionary.json') as d:
        relatedDummiesDictionary = json.load(d)

    # read_csv re-infers dtypes, so the categorical columns are cast back
    # to strings to match what the rebuild path produces.
    for df in [datingTrain, datingTest, datingFull]:
        df['zipcode'] = df['zipcode'].apply(str)
        if 'zipcode_o' in list(df.columns):
            df['zipcode_o'] = df['zipcode_o'].apply(str)
        for col in nonBinaryCategoricalList:
            if col in list(df.columns):
                df[col] = df[col].apply(str)

else:
    # Start from an empty cache directory so derived files from an older
    # split cannot survive the rebuild. Pure Python instead of `!rm`/`!mkdir`
    # keeps the cell portable and quiet when the directory does not exist.
    shutil.rmtree('../data/processedData', ignore_errors=True)
    os.makedirs('../data/processedData', exist_ok=True)

    datingData = pd.read_csv('../data/encoded-SpeedDatingData-WithLocations.csv')

    # .copy() makes this an independent frame; assigning columns on a plain
    # slice of datingData triggers SettingWithCopyWarning.
    blindDateData = datingData[columnList].copy()

    # Numbers stored as text with thousands separators -> float.
    for col in stringToFloatList:
        blindDateData[col] = blindDateData[col].str.replace(',', '').astype(float)

    blindDateData['zipcode'] = blindDateData['zipcode'].apply(str).str.replace(',', '')

    for col in nonBinaryCategoricalList:
        blindDateData[col] = blindDateData[col].apply(str)

    blindDateCategoricalData = blindDateData.select_dtypes(include=['O'])
    categoricalColumns = list(blindDateCategoricalData.columns)
    for col in categoricalColumns:
        blindDateData[col] = blindDateData[col].fillna('nan')

    # Categoricals that expand to more than this many dummy columns are
    # left un-encoded.
    maxDummyColumns = 21

    relatedDummiesDictionary = {}
    dummyFrames = []
    for col in categoricalColumns:
        # Explicit uint8: newer pandas releases default to bool dummies, which
        # the numeric select_dtypes used for modelling would silently drop.
        dummyData = pd.get_dummies(blindDateData[col], prefix=col, drop_first=True, dtype=np.uint8)
        if len(dummyData.columns) <= maxDummyColumns:
            for dummyCol in dummyData.columns:
                # Every dummy maps to the full group of sibling dummies.
                relatedDummiesDictionary[str(dummyCol)] = list(dummyData.columns)
                if col in partnerList:
                    partnerList.append(str(dummyCol))
            dummyFrames.append(dummyData)
    # Single concat instead of growing the frame once per categorical column.
    blindDateData = pd.concat([blindDateData] + dummyFrames, axis=1)

    with open('../data/processedData/relatedDummiesDictionary.json', 'w') as fp:
        json.dump(relatedDummiesDictionary, fp)

    partnerList = list(set(partnerList))
    columnDataDictionary = {"columnList": columnList,
                            "nonBinaryCategoricalList": nonBinaryCategoricalList,
                            "stringToFloatList": stringToFloatList,
                            "pointDistributionList": pointDistributionList,
                            "partnerList": partnerList}

    with open('../data/processedData/columnDataDictionary.json', 'w') as fp:
        json.dump(columnDataDictionary, fp)

    datingFull = blindDateData.copy()
    match = datingFull['match']
    X = datingFull.drop(['match'], axis=1)

    # Fixed seed so a rebuild reproduces the same train/test partition.
    datingTrain, datingTest, matchTrain, matchTest = train_test_split(
        X, match, test_size=0.2, random_state=42)

    # assign() returns new frames, avoiding writes into the split's slices.
    datingTrain = datingTrain.assign(match=matchTrain)
    datingTest = datingTest.assign(match=matchTest)

    datingTrain.to_csv('../data/processedData/datingTrain.csv', index=False)
    datingTest.to_csv('../data/processedData/datingTest.csv', index=False)
    datingFull.to_csv('../data/processedData/datingFull.csv', index=False)

# Defined on both paths so later cells can rely on it after a cached load too.
dummyColumns = list(relatedDummiesDictionary.keys())

## Data Cleaning

### Join partner

In [5]:
# Join the partner's columns onto the training frame at most once: the join is
# needed as soon as one partner column without "_o" in its name has no
# "<name>_o" counterpart in datingTrain yet.
# NOTE(review): presumably util.joinToPartner is what adds the "_o" columns — confirm in util.py.
partnerJoinNeeded = any(
    ("_o" not in str(partnerCol)) and (partnerCol + "_o" not in datingTrain.columns)
    for partnerCol in partnerList
)

if partnerJoinNeeded:
    partner = datingFull.copy()
    datingTrain = util.joinToPartner(datingTrain, partner)
    datingTrain.to_csv('../data/processedData/datingTrain.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partner_o['iid_o'] = partner_o['iid']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  partner_o['pid_o'] = partner_o['pid']


### Get distance

In [6]:
# Compute the partner-distance column only when it is not already present,
# then persist the enriched training frame.
hasPartnerDistance = "partnerDistance" in datingTrain.columns
if not hasPartnerDistance:
    datingTrain = util.returnDFWithpartnerDistance(datingTrain, "train", True)
    datingTrain.to_csv('../data/processedData/datingTrain.csv', index=False)

### Fix met value according to [The Data Science Book of Love](https://www.kaggle.com/code/lucabasa/the-data-science-book-of-love/notebook)

In [7]:
# Collapse `met` / `met_o` to a 0/1 flag: missing values and values below 2
# become 0, values above 1 become 1. The guard skips frames where both columns
# are already within 0/1.
# Series.max() ignores NaN; the built-in max() compares element by element and
# can return NaN (making the guard False) when the column contains NaN.
if datingTrain['met'].max() > 1 or datingTrain['met_o'].max() > 1:
    for metCol in ('met', 'met_o'):
        datingTrain.loc[datingTrain[metCol].isna(), metCol] = 0
        datingTrain.loc[datingTrain[metCol] < 2, metCol] = 0
        datingTrain.loc[datingTrain[metCol] > 1, metCol] = 1

### Fix ambiguous scores

In [None]:
# Columns whose name contains "1_s" or "3_s" still need the ambiguous-score
# fix; when any are present, apply it, run the sanity check and persist.
halfwayChangeColumns = [
    str(col)
    for col in datingTrain.columns
    if any(marker in str(col) for marker in ("1_s", "3_s"))
]

if halfwayChangeColumns:
    datingTrain = util.fixAmbiguousScores(datingTrain)
    util.halfwayQuestionSanityTest(datingTrain, " post-fixAmbiguousScores and pre-saving")
    datingTrain.to_csv('../data/processedData/datingTrain.csv', index=False)

--Return--
None
> [0;32m/Users/garysimmons/NYCDSA/projects/datingSelectionClassifier/util/util.py[0m(288)[0;36mhalfwayQuestionSanityTest[0;34m()[0m
[0;32m    286 [0;31m[0;34m[0m[0m
[0m[0;32m    287 [0;31m    [0;32mif[0m[0;34m([0m[0mlen[0m[0;34m([0m[0mhalfwayChangeColumns[0m[0;34m)[0m [0;34m>[0m [0;36m0[0m [0;32mand[0m [0;34m"order"[0m [0;32mnot[0m [0;32min[0m [0mdf[0m[0;34m.[0m[0mcolumns[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m--> 288 [0;31m        [0mbreakpoint[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    289 [0;31m[0;34m[0m[0m
[0m[0;32m    290 [0;31m[0;32mdef[0m [0mdisplayValueExceptionColumn[0m[0;34m([0m[0mX[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user
--Return--
None
> [0;32m/Users/garysimmons/NYCDSA/projects/datingSelectionClassifier/util/util.py[0m(288)[0;36mhalfwayQuestionSanityTest[0;34m()[0m


### Replace NaNs

In [None]:
# Numeric view of the training frame (target excluded) used to derive the
# per-column NaN replacement values.
datingTrainNumerical = datingTrain.select_dtypes(include=['uint8','int64','float64']).drop('match',axis = 1)

# Columns whose NaNs are replaced by the column mean.
# NOTE(review): "clubbing" is absent here although "clubbing_o" is listed, so
# "clubbing" falls through to the 0 default — confirm this is intended.
meanFilledColumns = (
    "age","age_o","pf_o_att","pf_o_sin","pf_o_int","pf_o_fun","pf_o_amb","pf_o_sha",
    "attr_o","sinc_o","intel_o","mn_sat","tuition","income",
    "sports","tvsports","exercise","dining","museums","art","hiking","gaming","reading","tv",
    "theater","movies","concerts","music","shopping","yoga","exphappy","expnum",
    "attr1_1","sinc1_1","intel1_1","fun1_1","shar1_1",
    "attr4_1","sinc4_1","intel4_1","fun4_1","shar4_1",
    "attr2_1","sinc2_1","intel2_1","fun2_1","shar2_1",
    "attr3_1","sinc3_1","intel3_1","fun3_1",
    "attr5_1","sinc5_1","intel5_1","fun5_1","match_es",
    "sports_o","tvsports_o","exercise_o","dining_o","museums_o","art_o",
    "hiking_o","gaming_o","clubbing_o","reading_o","tv_o","theater_o","movies_o",
    "concerts_o","music_o","shopping_o","yoga_o","exphappy_o","expnum_o",
    "attr4_1_o","sinc4_1_o","intel4_1_o","fun4_1_o","shar4_1_o",
    "attr2_1_o","sinc2_1_o","intel2_1_o","fun2_1_o","shar2_1_o",
    "attr3_1_o","sinc3_1_o","intel3_1_o","fun3_1_o",
    "attr5_1_o","sinc5_1_o","intel5_1_o","fun5_1_o","match_es_o",
    "lats","lons","lats_o","lons_o","partnerDistance",
)

# Columns whose NaNs are replaced by the (rounded mean of the) mode(s).
modeFilledColumns = (
    "imprace","imprelig","zipcode","goal","date","go_out","career_c","met",
    "imprace_o","imprelig_o","zipcode_o","goal_o","date_o","career_c_o",
)

nanDictionaryPath = '../data/processedData/trainNanReplacementValuesDictionary.json'

if exists(nanDictionaryPath):
    # Reuse the values computed on an earlier run.
    with open(nanDictionaryPath) as d:
        trainNanReplacementValuesDictionary = json.load(d)
else:
    trainNanReplacementValuesDictionary = {}
    for col in datingTrainNumerical:
        if col in meanFilledColumns:
            fillValue = datingTrainNumerical[col].mean()
        elif col in modeFilledColumns:
            # mode() may return several values; average them and round.
            fillValue = round(np.mean(datingTrainNumerical[col].mode().values))
        else:
            fillValue = 0
        trainNanReplacementValuesDictionary[str(col)] = fillValue

    with open(nanDictionaryPath, 'w') as fp:
        json.dump(trainNanReplacementValuesDictionary, fp)

In [None]:
# Rebind datingTrain to the result of the project's NaN-replacement helper.
# NOTE(review): presumably the helper reads the replacement values from
# trainNanReplacementValuesDictionary.json — confirm in util.py.
datingTrain = util.replaceNansWithTrainingDataValues(datingTrain)

In [None]:
# Build the feature matrix from the current datingTrain. The earlier
# `datingTrainNumerical` frame was selected before the NaN-replacement step
# rebound datingTrain, so it does not reflect that step; re-select the
# numeric columns here instead of reusing it.
X = datingTrain.select_dtypes(include=['uint8','int64','float64']).drop('match',axis = 1)
match = datingTrain["match"]

# Training

In [None]:
# Size heuristics reused as hyper-parameters (neighbour count, tree depth).
# np.floor returns a float, but scikit-learn expects integers for
# n_neighbors and max_depth, so the values are converted to int.
sqrtn = int(np.floor(np.sqrt(X.shape[0])))
sqrtfeatures = int(np.floor(np.sqrt(X.shape[1])))
log2features = int(np.floor(np.log2(X.shape[1])))

## Logistic Regression

In [None]:
# Fit a default logistic regression on the training features.
logModel = lm.LogisticRegression()
try:
    logModel.fit(X,match)
except ValueError:
    # `ValueException` is not a Python name; referencing it in the except
    # clause would raise NameError instead of reaching the diagnostic below.
    util.displayValueExceptionColumn(X)

## KNN

In [None]:
# Two KNN classifiers: a fixed k=5 and k = floor(sqrt(number of samples)).
knn5 = knn(n_neighbors=5)
# n_neighbors must be an integer; int() also covers a float sqrtn.
knnsqrtn = knn(n_neighbors=int(sqrtn))
try:
    knn5.fit(X,match)
    knnsqrtn.fit(X,match)
except ValueError:
    # `ValueException` is not a Python name; referencing it in the except
    # clause would raise NameError instead of reaching the diagnostic below.
    util.displayValueExceptionColumn(X)


## Boosting

In [None]:
# Two gradient-boosting classifiers that differ only in learning rate
# (0.1 vs 10).
gradientdeci = grad(learning_rate=0.1)
gradientdeka = grad(learning_rate=10)

try:
    gradientdeci.fit(X,match)
    gradientdeka.fit(X,match)
except ValueError:
    # `ValueException` is not a Python name; referencing it in the except
    # clause would raise NameError instead of reaching the diagnostic below.
    util.displayValueExceptionColumn(X)

## Random Forest Classifier

In [None]:
# Random forest classifiers tuned for accuracy and for recall.
# The best hyper-parameters are cached as JSON because the grid search is
# expensive; when the cache exists the search is skipped.
# `rfc` is RandomForestClassifier: the criteria ("gini", "entropy",
# "log_loss") and the accuracy/recall scorers are classification-only, so the
# regressor alias `rf` cannot be used here.
if exists("../data/processedData/forestParams.json"):
    with open('../data/processedData/forestParams.json') as d:
        forestParams = json.load(d)
    accurateForestParams = forestParams["accurateForestParams"]
    recallForestParams = forestParams["recallForestParams"]
else:
    # n_estimators and max_depth must be integers; plain Python ints also keep
    # best_params_ JSON-serialisable.
    n_estimator_list = [int(n) for n in np.linspace(100,1000,11)]
    searchParams = [{
        "criterion":["gini","entropy","log_loss"],
        "n_estimators": n_estimator_list,
        "max_depth":[int(sqrtfeatures),int(log2features),None],
        "max_features":["sqrt","log2",None]
    }]

    accurateForest0 = rfc()
    recallForest0 = rfc()

    accurateForestGrid = ms.GridSearchCV(accurateForest0, searchParams, scoring='accuracy')
    recallForestGrid = ms.GridSearchCV(recallForest0, searchParams, scoring='recall')

    accurateForestGrid.fit(X,match)
    recallForestGrid.fit(X,match)

    accurateForestParams = accurateForestGrid.best_params_
    recallForestParams = recallForestGrid.best_params_
    forestParams = {
        "accurateForestParams": accurateForestParams,
        "recallForestParams": recallForestParams
    }
    with open("../data/processedData/forestParams.json", 'w') as fp:
        json.dump(forestParams, fp)

# Rebuild the final models from the selected parameters. The original passed
# `max_depth` twice (a SyntaxError); the fourth tuned parameter is
# `max_features`.
accurateForest = rfc(n_estimators = accurateForestParams["n_estimators"],
                     criterion = accurateForestParams["criterion"],
                     max_depth = accurateForestParams["max_depth"],
                     max_features = accurateForestParams["max_features"])
recallForest = rfc(n_estimators = recallForestParams["n_estimators"],
                   criterion = recallForestParams["criterion"],
                   max_depth = recallForestParams["max_depth"],
                   max_features = recallForestParams["max_features"])

try:
    accurateForest.fit(X,match)
    recallForest.fit(X,match)
except ValueError:
    # `ValueException` is not a Python name; referencing it in the except
    # clause would raise NameError instead of reaching the diagnostic below.
    util.displayValueExceptionColumn(X)

# Individual Testing

## Logistic Regression

## KNN

## Boosting

## Random Forest Classifier

# Ensemble Testing