# 2.0 Data
This notebook extracts the variables of interest, splits the data into training, validation and test sets, then saves the data for downstream analysis and modeling.


In [1]:
# libraries
import os
import sys
import inspect
sys.path.append("../src")

import pandas as pd
from sklearn.model_selection import train_test_split

from shared import directories
from shared import filenames
from shared import variables
from visualization import visual

### Obtain raw data

In [2]:
# Build the path to the raw file first, then load it.
raw_path = os.path.join(directories.RAW_DATA_DIR, filenames.RAW_FILENAME)
# low_memory=False makes pandas read the file in one pass instead of in chunks.
df = pd.read_csv(raw_path, encoding="Latin-1", low_memory=False)

### Extract variables of interest

In [3]:
# Columns excluded from the downstream analysis.
omit = ['field']
df = df.drop(labels=omit, axis='columns')

### Correct / Normalize Feature Names

In [4]:
# Map misspelled or inconsistently named columns to their normalized names.
column_fixes = {
    'sinsere_o': 'sincere_o',
    'intellicence_important': 'intelligence_important',
    'ambtition_important': 'ambitious_important',
    'ambition_partner': 'ambitious_partner',
    'ambition': 'ambitious',
    'ambitous_o': 'ambitious_o',
}
df = df.rename(columns=column_fixes)

### Recode Race Levels
Recode race labels for easy plotting and correct misspelled column names

In [5]:
# Short labels shared by the subject ('race') and partner ('race_o') columns.
race_labels = {
    'asian/pacific islander/asian-american': 'asian',
    'european/caucasian-american': 'caucasian',
    'black/african american': 'black',
    'latino/hispanic american': 'latino',
    'other': 'other',
}
for race_col in ('race', 'race_o'):
    df[race_col] = df[race_col].replace(race_labels)

## Features
### Participants per Wave
Here we capture the number of males and females per wave.

In [6]:
# Count participants per wave and gender in a single pass. As before, the
# count is the number of non-null `has_null` entries in each (wave, gender)
# group.
wave_counts = (
    df.groupby(['wave', 'gender'])['has_null']
      .count()
      # One row per wave, one column per gender. Aligning on the wave label
      # (instead of gluing two frames together by row position) keeps each
      # count attached to the correct wave even when a wave lacks one gender;
      # such a wave gets a count of 0 rather than a shifted or missing value.
      .unstack('gender', fill_value=0)
      .reindex(columns=['male', 'female'], fill_value=0)
      .rename(columns={'male': 'wave_males', 'female': 'wave_females'})
      .rename_axis(columns=None)
      .reset_index()
)

# Attach the per-wave counts to every observation of that wave.
df = pd.merge(df, wave_counts, on='wave')

### Difference in Perceptions
The differences between self and partner perceptions along the six characteristics being studied. Specifically, we are interested in
    1. the differences between the subject's self-perception and their partner's assessment of them across the six
    characteristics being studied, and    
    2. the differences between the subject's self-perceptions and their perceptions of their partner

In [7]:
# Characteristics for which a relative difference is computed.
traits = ['attractive', 'sincere', 'intelligence', 'funny', 'ambitious']

# Relative difference between the '<trait>_partner' rating and the subject's
# own '<trait>' rating, expressed as a fraction of the own rating.
for trait in traits:
    own = df[trait]
    df['rd_' + trait] = (df[trait + '_partner'] - own) / own

# Relative difference between the '<trait>_o' rating and the subject's own
# '<trait>' rating. A second loop keeps the original column order.
for trait in traits:
    own = df[trait]
    df['rd_' + trait + '_o'] = (df[trait + '_o'] - own) / own

### Split into training, validation and test sets. 
First split the data set into training/test sets 80/20. Then split the training set into training/validation sets 80/20, giving roughly 64% training, 16% validation and 20% test data overall.

In [8]:
# Fixed seed so both splits are reproducible.
split_seed = 5

# Hold out 20% as the test set, then 20% of the remainder for validation.
train_full, test = train_test_split(df, test_size=0.2, random_state=split_seed)
train, validation = train_test_split(train_full, test_size=0.2,
                                     random_state=split_seed)

### Save data 
Save training, validation and test sets in an interim directory for exploratory data analysis

In [9]:
def write(df, directory, filename):
    """Save a DataFrame as a CSV file, creating the directory if needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to save. The index is not written to the file.
    directory : str
        Directory in which to place the file. It is created, together with
        any missing parent directories, if it does not exist yet.
    filename : str
        Name of the CSV file inside ``directory``.

    Returns
    -------
    bool
        True if the file was written; False if ``df`` is not a DataFrame or
        ``filename`` is not a string (nothing is written in that case).
    """
    if not isinstance(df, pd.DataFrame) or not isinstance(filename, str):
        return False

    # makedirs, unlike mkdir, also creates missing parent directories, and
    # exist_ok=True removes the gap between checking for and creating it.
    os.makedirs(directory, exist_ok=True)
    df.to_csv(os.path.join(directory, filename), index=False)
    return True

In [10]:
# All three splits go to the same interim directory.
interim_dir = directories.INTERIM_DATA_DIR
write(train, interim_dir, filenames.TRAIN_FILENAME)
write(validation, interim_dir, filenames.VALIDATION_FILENAME)
write(test, interim_dir, filenames.TEST_FILENAME)

True