# 2.0 Data
This notebook extracts the variables of interest, splits the data into training, validation, and test sets, then saves the resulting sets for downstream analysis and modeling.


In [1]:
# libraries
import os
import sys
import inspect
sys.path.append("../src")

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from shared import directories
from shared import filenames
from shared import variables

### Obtain raw data

In [2]:
# Load the raw file from the project's raw-data directory. The file is read
# as Latin-1, and low_memory=False makes pandas parse it in a single pass.
df = pd.read_csv(
    os.path.join(directories.RAW_DATA_DIR, filenames.RAW_FILENAME),
    encoding="Latin-1",
    low_memory=False,
)

### Extract variables of interest

In [3]:
# Columns removed before any further processing. Several names are misspelled
# (e.g. 'd_sinsere_o', 'd_ambitous_o'); they are kept verbatim because they
# must match the column labels in the raw file exactly.
delete = [
    'has_null',
    'd_d_age',
    'importance_same_religion',
    'd_importance_same_race',
    'd_importance_same_religion',
    'field',
    # d_-prefixed pref_o_* columns
    'd_pref_o_attractive',
    'd_pref_o_sincere',
    'd_pref_o_intelligence',
    'd_pref_o_funny',
    'd_pref_o_ambitious',
    'd_pref_o_shared_interests',
    # d_-prefixed *_o columns
    'd_attractive_o',
    'd_sinsere_o',
    'd_intelligence_o',
    'd_funny_o',
    'd_ambitous_o',
    'd_shared_interests_o',
    # d_-prefixed *_important columns
    'd_attractive_important',
    'd_sincere_important',
    'd_intellicence_important',
    'd_funny_important',
    'd_ambtition_important',
    'd_shared_interests_important',
    # d_-prefixed self-rating columns
    'd_attractive',
    'd_sincere',
    'd_intelligence',
    'd_funny',
    'd_ambition',
    # d_-prefixed *_partner columns
    'd_attractive_partner',
    'd_sincere_partner',
    'd_intelligence_partner',
    'd_funny_partner',
    'd_ambition_partner',
    'd_shared_interests_partner',
    # d_-prefixed interest / activity columns
    'd_sports',
    'd_tvsports',
    'd_exercise',
    'd_dining',
    'd_museums',
    'd_art',
    'd_hiking',
    'd_gaming',
    'd_clubbing',
    'd_reading',
    'd_tv',
    'd_theater',
    'd_movies',
    'd_concerts',
    'd_music',
    'd_shopping',
    'd_yoga',
    'd_interests_correlate',
    # expectation columns and their d_-prefixed counterparts
    'expected_happy_with_sd_people',
    'expected_num_interested_in_me',
    'expected_num_matches',
    'd_expected_happy_with_sd_people',
    'd_expected_num_interested_in_me',
    'd_expected_num_matches',
    # remaining columns
    'like',
    'guess_prob_liked',
    'd_like',
    'd_guess_prob_liked',
    'met',
]
# Raises KeyError if any listed label is missing, so this cell cannot be
# re-run on an already-trimmed frame.
df = df.drop(labels=delete, axis='columns')


### Correct / Normalize Feature Names

In [4]:
# Old (misspelled or inconsistently named) column -> normalized column name.
# `rename` skips keys that are not present among the columns.
column_fixes = {
    'sinsere_o': 'sincere_o',
    'intellicence_important': 'intelligence_important',
    'ambtition_important': 'ambitious_important',
    'ambition_partner': 'ambitious_partner',
    'ambition': 'ambitious',
    'ambitous_o': 'ambitious_o',
}
df = df.rename(columns=column_fixes)

### Recode Race Levels
Recode the race labels to short names for easy plotting (the misspelled column names are corrected in the previous section).

In [5]:
# Verbose race category -> short label. A single mapping is applied to both
# `race` and `race_o` so the two columns are always recoded identically.
race_labels = {
    'asian/pacific islander/asian-american': 'asian',
    'european/caucasian-american': 'caucasian',
    'black/african american': 'black',
    'latino/hispanic american': 'latino',
    'other': 'other',
}
for race_column in ('race', 'race_o'):
    df[race_column] = df[race_column].replace(race_labels)

## Features
### Participants per Wave
Here we capture the number of male and female rows per wave, together with the combined wave size. The percentage of matches per wave would be matches / (2 × wave size), since matches are double counted; that percentage is not computed in the cell below.

In [6]:
# Count the rows with a non-null `match` per (wave, gender) and pivot the
# genders into columns. Keying both counts on `wave` in a single table avoids
# pairing separately built male/female tables by row position, which attaches
# counts to the wrong wave whenever a wave has no rows for one gender.
# NOTE(review): these are row counts; if each row is one date rather than one
# participant, they are not head counts -- confirm against the data dictionary.
wave_counts = (
    df.groupby(['wave', 'gender'])['match']
      .count()
      .unstack('gender', fill_value=0)
      # Guarantee both columns exist, in a fixed order, even if a gender is
      # entirely absent; any other gender label is ignored.
      .reindex(columns=['male', 'female'], fill_value=0)
      .rename(columns={'male': 'wave_males', 'female': 'wave_females'})
      .reset_index()
)
wave_counts.columns.name = None  # drop the leftover 'gender' axis label
wave_counts['wave_size'] = wave_counts['wave_males'] + wave_counts['wave_females']

# Attach the per-wave counts to every row of that wave.
df = pd.merge(df, wave_counts, on='wave')

### Convert select numeric variables to categorical

In [7]:
# Store these numerically coded columns as object dtype.
for column in ('samerace', 'wave'):
    df[column] = df[column].astype(object)

# Replace the `match` values with text labels: 0 -> 'Not Matched', every other
# value -> 'Matched'. Re-running this on the already-labelled column would turn
# every row into 'Matched', because no label equals 0.
is_unmatched = df['match'] == 0
df['match'] = np.where(is_unmatched, 'Not Matched', 'Matched')

### Difference in Perceptions
The differences between self and partner perceptions along the six characteristics being studied. Specifically, we are interested in:
    1. the differences between the subject's self-perception and their partner's assessment of them across the six
    characteristics being studied, and
    2. the differences between the subject's self-perceptions and their perceptions of their partner

In [8]:
# Traits that have a self rating, a `<trait>_partner` column and a `<trait>_o` column.
traits = ('attractive', 'sincere', 'intelligence', 'funny', 'ambitious')

# rel_<trait>_s: (self rating - `<trait>_partner`) as a fraction of the self
# rating. A positive number means a higher score for the subject, from the
# subject's perspective.
for trait in traits:
    self_rating = df[trait]
    df['rel_' + trait + '_s'] = (self_rating - df[trait + '_partner']) / self_rating

# rel_<trait>_o: (`<trait>_o` - self rating) as a fraction of the self rating.
# A positive number is good; 0 means agreement.
for trait in traits:
    self_rating = df[trait]
    df['rel_' + trait + '_o'] = (df[trait + '_o'] - self_rating) / self_rating

### Split into training, validation and test sets. 
First split the data set into training and test sets (80/20). Then split the training set into training and validation sets (80/20), which gives roughly 64% training, 16% validation and 20% test overall.

In [9]:
# The same hold-out fraction and seed are used for both splits.
split_options = dict(test_size=0.2, random_state=5)

# Hold out the test set first, then carve the validation set out of the
# remaining training rows.
train, test = train_test_split(df, **split_options)
train, validation = train_test_split(train, **split_options)

### Save data 
Save training, validation and test sets in an interim directory for exploratory data analysis

In [10]:
def write(df, directory, filename):
    """Save a DataFrame as a CSV file, creating the target directory if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to save; the index is not written to the file.
    directory : str
        Destination directory. It is created, together with any missing
        parent directories, when it does not exist yet.
    filename : str
        Name of the CSV file inside ``directory``.

    Returns
    -------
    bool
        True once the file has been written; False, with nothing written,
        when ``df`` is not a DataFrame or ``filename`` is not a string.
    """
    if not isinstance(df, pd.DataFrame) or not isinstance(filename, str):
        return False
    # makedirs (unlike mkdir) also creates missing parent directories, and
    # exist_ok=True keeps it from failing when the directory already exists.
    os.makedirs(directory, exist_ok=True)
    df.to_csv(os.path.join(directory, filename), index=False)
    return True

In [11]:
# Persist each split under its own filename in the interim data directory.
splits = (
    (train, filenames.TRAIN_FILENAME),
    (validation, filenames.VALIDATION_FILENAME),
    (test, filenames.TEST_FILENAME),
)
for split, split_filename in splits:
    write(split, directories.INTERIM_DATA_DIR, split_filename)

# Summarize the training split (columns, non-null counts, dtypes).
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5361 entries, 4364 to 7459
Data columns (total 72 columns):
wave                          5361 non-null object
gender                        5361 non-null object
age                           5303 non-null float64
age_o                         5286 non-null float64
d_age                         5361 non-null int64
race                          5324 non-null object
race_o                        5305 non-null object
samerace                      5361 non-null object
importance_same_race          5311 non-null float64
pref_o_attractive             5297 non-null float64
pref_o_sincere                5297 non-null float64
pref_o_intelligence           5297 non-null float64
pref_o_funny                  5289 non-null float64
pref_o_ambitious              5281 non-null float64
pref_o_shared_interests       5268 non-null float64
attractive_o                  5210 non-null float64
sincere_o                     5163 non-null float64
intelligence_