# Speed Dating Project
## Data Audit
This notebook audits and cleans the data, then splits it into training, validation, and test sets in preparation for the exploratory data analysis.


In [1]:
# libraries
import os
import sys
import inspect
sys.path.append("../src")

import itertools
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import numpy as np
import pandas as pd
import seaborn as sns

from shared import directories
from shared import filenames
from shared import variables
from visualization import visual

In [24]:
# Load the raw data file into `df`.
# The location is assembled from the project's shared directory/filename settings.
raw_data_path = os.path.join(directories.RAW_DATA_DIR, filenames.RAW_FILENAME)

# The file is decoded as Latin-1; low_memory=False is passed through to pandas
# (per the pandas docs this avoids chunked parsing and mixed-type inference).
df = pd.read_csv(raw_data_path, encoding="Latin-1", low_memory=False)

## Preprocessing
Correct misspelled column names and shorten the race category levels for easier plotting. Additional variables are added to capture the difference between a subject's self-rating on key attributes and their partner's rating of the subject. Further variables capture the difference in each attribute between subject and partner, from the subject's perspective.

In [25]:
# Preprocessing
# Three steps, applied to `df` in order:
#   1. shorten the race category labels,
#   2. correct misspelled column names,
#   3. add rating-difference columns (these rely on the corrected names from step 2).

# Short labels for the race levels; a single mapping is shared by both race
# columns so the two can never drift apart. Values not listed here (e.g.
# 'other') are left unchanged by `replace`.
RACE_LABELS = {
    'asian/pacific islander/asian-american': 'asian',
    'european/caucasian-american': 'caucasian',
    'black/african american': 'black',
    'latino/hispanic american': 'latino',
}
for race_column in ('race', 'race_o'):
    df[race_column] = df[race_column].replace(RACE_LABELS)

# Correct spelling of column names.
# NOTE(review): 'sinsere_o' and the two '*_important' columns also have their
# 'd_'-prefixed counterparts corrected, but 'ambitous_o' does not -- confirm
# whether the raw data contains a 'd_ambitous_o' column that should be renamed.
COLUMN_SPELLING_FIXES = {
    'sinsere_o': 'sincere_o',
    'd_sinsere_o': 'd_sincere_o',
    'intellicence_important': 'intelligence_important',
    'd_intellicence_important': 'd_intelligence_important',
    'ambitous_o': 'ambition_o',
    'ambtition_important': 'ambition_important',
    'd_ambtition_important': 'd_ambition_important',
}
# Reassign instead of using inplace=True; labels absent from the frame are
# ignored by `rename`, so re-running the cell is harmless.
df = df.rename(columns=COLUMN_SPELLING_FIXES)

# Attributes for which difference columns are derived.
RATED_ATTRIBUTES = ('attractive', 'sincere', 'intelligence', 'funny', 'ambition')

# Add variables that capture the attribute difference between the subject's
# self rating and the partner's rating of the subject:
#   self_rating_difference_<attr> = <attr> - <attr>_o
for attribute in RATED_ATTRIBUTES:
    df['self_rating_difference_' + attribute] = df[attribute] - df[attribute + '_o']

# Add variables to capture the attribute differences between subject and
# partner, from the subject's perspective:
#   rating_difference_<attr> = <attr> - <attr>_partner
# Kept as a second loop so the new columns are appended in the same order as
# before (all self_rating_difference_* first, then all rating_difference_*).
for attribute in RATED_ATTRIBUTES:
    df['rating_difference_' + attribute] = df[attribute] - df[attribute + '_partner']


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Data columns (total 133 columns):
has_null                               int64
wave                                   int64
gender                                 object
age                                    float64
age_o                                  float64
d_age                                  int64
d_d_age                                object
race                                   object
race_o                                 object
samerace                               int64
importance_same_race                   float64
importance_same_religion               float64
d_importance_same_race                 object
d_importance_same_religion             object
field                                  object
pref_o_attractive                      float64
pref_o_sincere                         float64
pref_o_intelligence                    float64
pref_o_funny                           float64
pref_o_ambitio