# Speed Dating Project
## Prepare Data
This notebook extracts the variables of interest, splits the data into training, validation and test sets, then saves the data for downstream analysis and modeling.


In [1]:
# libraries
import os
import sys
import inspect
sys.path.append("../src")

import pandas as pd
from sklearn.model_selection import train_test_split

from shared import directories
from shared import filenames
from shared import variables
from visualization import visual

### Obtain raw data

In [2]:
# Resolve the raw file's location first, then read it in a single call.
raw_path = os.path.join(directories.RAW_DATA_DIR, filenames.RAW_FILENAME)
df = pd.read_csv(raw_path, encoding="Latin-1", low_memory=False)

### Inspect Data Structure

In [3]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8378 entries, 0 to 8377
Columns: 123 entries, has_null to match
dtypes: float64(57), int64(7), object(59)
memory usage: 7.9+ MB
None


### Extract variables of interest

In [4]:
# Columns that are not carried forward into the saved data sets.
omit = ['wave', 'field']
df = df.drop(omit, axis='columns')

### Format Data
Recode race labels for easy plotting and correct misspelled column names

In [5]:
# Preprocessing
# Recode race levels: shorten the verbose labels for easier plotting.
# One mapping is shared by both race columns so the two recodings cannot
# drift apart; values not listed here (e.g. 'other') are left unchanged.
race_labels = {
    'asian/pacific islander/asian-american': 'asian',
    'european/caucasian-american': 'caucasian',
    'black/african american': 'black',
    'latino/hispanic american': 'latino',
}
for race_col in ('race', 'race_o'):
    df[race_col] = df[race_col].replace(race_labels)

# Correct spelling
# NOTE(review): 'd_sinsere_o' is corrected but there is no matching entry for
# a 'd_ambitous_o' column -- confirm whether the raw data has one.
column_fixes = {
    'sinsere_o': 'sincere_o',
    'd_sinsere_o': 'd_sincere_o',
    'intellicence_important': 'intelligence_important',
    'd_intellicence_important': 'd_intelligence_important',
    'ambitous_o': 'ambition_o',
    'ambtition_important': 'ambition_important',
    'd_ambtition_important': 'd_ambition_important',
}
# Rebind instead of using inplace=True so the cell reads as a plain
# input -> output transformation.
df = df.rename(columns=column_fixes)

### Split into training, validation and test sets. 
First split the data set into training/test sets 80/20. Then split the training set into training/validation sets 80/20, which gives roughly 64% training, 16% validation and 20% test overall.

In [7]:
# Hold out 20% of the rows as the test set, then carve 20% of the remaining
# rows off as the validation set (roughly 64% / 16% / 20% overall).
# The random_state is fixed so both splits are reproducible.
split_options = {'test_size': 0.2, 'random_state': 5}
train, test = train_test_split(df, **split_options)
train, validation = train_test_split(train, **split_options)

### Save data 
Save training, validation and test sets in an interim directory for exploratory data analysis

In [8]:
def write(df, directory, filename) -> bool:
    """Save a DataFrame as a CSV file, creating the target directory if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to save. Any other type is rejected.
    directory : str
        Directory to write into. Missing directories, including missing
        parent directories, are created.
    filename : str
        Name of the CSV file inside ``directory``. Any other type is rejected.

    Returns
    -------
    bool
        True if the file was written; False if ``df`` is not a DataFrame or
        ``filename`` is not a string (nothing is created in that case).
    """
    # Reject invalid arguments up front, before touching the file system.
    if not isinstance(df, pd.DataFrame):
        return False
    if not isinstance(filename, str):
        return False

    # makedirs also creates missing parents (os.mkdir would raise) and
    # exist_ok=True avoids a check-then-create race.
    os.makedirs(directory, exist_ok=True)
    # index=False already omits the index column, so no index_label is needed.
    df.to_csv(os.path.join(directory, filename), index=False)
    return True

In [10]:
# Save every split and keep each call's result: displaying only the last
# call's return value would hide a failed write of an earlier split.
split_files = [
    (train, filenames.TRAIN_FILENAME),
    (validation, filenames.VALIDATION_FILENAME),
    (test, filenames.TEST_FILENAME),
]
write_results = [
    write(split, directories.INTERIM_DATA_DIR, split_filename)
    for split, split_filename in split_files
]
# True only if all three files were written.
all(write_results)

True