# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import pickle
import seaborn as sns
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

%matplotlib inline



# Reading in DataFrames

In [2]:
X = pickle.load(open('fly_df_X.p', 'rb'))
y = pickle.load(open('fly_df_y.p', 'rb'))

# Column renaming

In [13]:
X = X.rename(columns={'plane_travel_amount': 'q1', 
                      'height_inches': 'q22', 
                      'children_under_18': 'q2',
                      'three_arm_rests': 'q3',
                      'two_arm_rests': 'q4',
                      'shades': 'q5',
                      'move_unsold': 'q6',
                      'speak': 'q7',
                      'get_up': 'q8',
                      'recline_obligation': 'q9',
                      'friends_switch': 'q10',
                      'family_switch': 'q11',
                      'wake_bathroom': 'q12',
                      'wake_walk': 'q13',
                      'baby': 'q14',
                      'unruly_children': 'q15',
                      'electronics': 'q16',
                      'smoked': 'q17',
                      'gender': 'q18',
                      'age': 'q19',
                      'household_income': 'q20',
                      'education': 'q21'})

In [15]:
X

Unnamed: 0,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10,...,q12,q13,q14,q15,q16,q18,q19,q20,q21,q22
2,4.0,0.0,4.0,1.0,1.0,0.0,0.0,4.0,1.0,0.0,...,0.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,0.0,68.0
3,4.0,0.0,1.0,1.0,0.0,0.0,0.0,4.0,0.0,1.0,...,0.0,1.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,71.0
4,3.0,0.0,3.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,...,1.0,1.0,1.0,2.0,1.0,1.0,1.0,3.0,0.0,67.0
5,4.0,1.0,4.0,4.0,1.0,1.0,0.0,3.0,1.0,1.0,...,1.0,2.0,2.0,2.0,0.0,1.0,1.0,2.0,1.0,69.0
6,3.0,1.0,3.0,4.0,0.0,0.0,1.0,5.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,4.0,4.0,74.0
7,4.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,2.0,0.0,1.0,1.0,0.0,0.0,72.0
9,4.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,2.0,...,2.0,2.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,66.0
10,3.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,0.0,...,1.0,2.0,1.0,2.0,0.0,1.0,1.0,3.0,2.0,72.0
15,4.0,0.0,1.0,1.0,0.0,0.0,1.0,5.0,1.0,1.0,...,0.0,1.0,1.0,2.0,0.0,1.0,0.0,4.0,4.0,72.0
17,4.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,...,1.0,2.0,0.0,0.0,0.0,1.0,1.0,1.0,4.0,68.0


# Modeling

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [4]:
def print_scores(model, y_pred):
    print('Accuracy:',accuracy_score(y_test, y_pred))
    print('Precision:',precision_score(y_test, y_pred, average = 'weighted'))
    print('Recall:',recall_score(y_test, y_pred, average = 'weighted'))
    print('F1:', f1_score(y_test, y_pred, average = 'weighted'))

In [5]:
RF = RandomForestClassifier()
# (n_estimators = 1000, bootstrap=False, criterion= 'gini', max_depth=None, max_features=3, min_samples_leaf=5, min_samples_split=5)
rf_model = RF.fit(X_train,y_train)
rf_y_pred = RF.predict(X_test)
rf_score = RF.predict_proba(X_test)[:,1]

In [7]:
y_pred = RF.predict(X_test)
y_pred

array([ 1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,
        1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,
        1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,
        1.,  1.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  0

# Feature Importances

In [16]:
feature_importances = RF.feature_importances_

important_features = pd.DataFrame({'Features': X.columns, 'Importance Score': feature_importances})
important_features.sort('Importance Score', inplace=True, ascending=False)

important_features



Unnamed: 0,Features,Importance Score
20,q22,0.140844
17,q19,0.081792
7,q8,0.074268
18,q20,0.070653
14,q15,0.05977
19,q21,0.059469
12,q13,0.051205
8,q9,0.048492
2,q3,0.046291
3,q4,0.041629


In [17]:
labels = ['Height', 'Get Up', 'Age', 'Income', 'Education', 'Recline Obligation', 'Unruly Children', 'Wake Walk', 'Two Arm Rests',
          'Friends Switch', 'Three Arm Rests', 'Wake Bathroom', 'Baby', 'Speak', 'Move Unsold', 'Plane Travel Amount',
          'Shades', 'Family Switch', 'Gender', 'Children', 'Electronics']

In [18]:
'''Add column of names to go on D3 feature importances visualization'''

important_features['labels'] = labels
important_features

Unnamed: 0,Features,Importance Score,labels
20,q22,0.140844,Height
17,q19,0.081792,Get Up
7,q8,0.074268,Age
18,q20,0.070653,Income
14,q15,0.05977,Education
19,q21,0.059469,Recline Obligation
12,q13,0.051205,Unruly Children
8,q9,0.048492,Wake Walk
2,q3,0.046291,Two Arm Rests
3,q4,0.041629,Friends Switch


In [29]:
important_features = important_features.drop("Features", axis = 1)

In [31]:
'''Write features importances to CSV'''

important_features.to_csv("important_features.csv")