In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import psycopg2 as pg
%matplotlib inline
import seaborn as sns
import dbcreds

In [2]:
import sys
import pickle
from utils import model_utils
import prepare_feature_matrix
import train_model
import test_model
import evaluate_models

In [3]:
for time_split in [{"split_date":"2015-04-30","train_timedelta":"365D","test_timedelta":"365D"}]:
    split_date = pd.to_datetime(time_split['split_date'])
    train_timedelta = pd.Timedelta(time_split['train_timedelta'])
    test_timedelta = pd.Timedelta(time_split['test_timedelta'])

In [4]:
for feature_set in [{"name":"demographics_only"}]:
    feature_set_name = feature_set['name']

    print "Generating Feature Matrix"
    feature_matrix = prepare_feature_matrix.generate_matrix(feature_set_name)

    print "Splitting Train/Test Sets"
    train_data,test_data=model_utils.split_train_test(feature_matrix,split_date,train_timedelta,test_timedelta)

Generating Feature Matrix
Reading data from DB
Cleaning data
Generating LTU/Non-LTU labels
Building model feature matrix
Splitting Train/Test Sets


In [8]:
print feature_matrix.shape
print train_data.shape
print test_data.shape
feature_matrix.head()

(74059, 68)
(8565, 68)
(8824, 68)


Unnamed: 0,application_id,app_start_date,ltu,age,time_since_exit,number_dependents,experience_intended_prof,experience_prev_prof,gender_F,gender_M,...,previous_prof_Unknown,training_area_Agriculture,training_area_ArtsAndHumanities,training_area_Education,training_area_EngineeringAndConstruction,training_area_HealthAndSocialProtection,training_area_STEM,training_area_Service,training_area_SocialScienceTradeAndLaw,training_area_Unkown
0,120687,2016-12-06,True,44,1928.0,2.0,36.0,36.0,0,1,...,0,0,0,0,0,0,0,0,0,1
33,61038,2012-05-17,False,42,170.0,2.0,144.0,10.0,1,0,...,0,0,0,0,1,0,0,0,0,0
55,9206,2007-12-10,False,33,24.0,1.0,30.0,30.0,0,1,...,0,0,0,0,0,0,0,0,0,1
59,8491,2007-11-05,False,32,1224.0,0.0,48.0,48.0,1,0,...,0,0,0,0,0,0,0,0,0,1
70,45682,2011-02-15,False,35,1016.0,0.0,48.0,36.0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [79]:
import sklearn
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

def svm():
    return sklearn.svm.SVC(probability=True,kernel='linear')

def random_forest():
    return sklearn.ensemble.RandomForestClassifier(n_estimators=100,
                       criterion='gini',
                       max_depth=None,
                       random_state=4321) # always set your random seed for reproducibility!

def dec_tree():
    return sklearn.tree.DecisionTreeClassifier()

def knn():
    return sklearn.neighbors.KNeighborsClassifier()

def ann():
    return sklearn.neural_network.MLPClassifier()

def adaboost():
    return sklearn.ensemble.AdaBoostClassifier()

def get_model(model_type):
    switcher = {
        'rf': random_forest,
        'svm': svm,
        'knn': knn,
        'ann': ann,
        'tree': dec_tree,
        'adaboost': adaboost,
    }
    # Get the function from switcher dictionary
    func = switcher.get(model_type)
    # Execute the function
    return func()

#Train Generic Model
def train_model(model_type,train_matrix,model_params):
    model = None

    model = get_model(model_type)

    #Define Model input/output
    y = train_matrix.ltu.astype(float)
    X = train_matrix.drop(['app_start_date','ltu','application_id'], 1)

    model.fit(X=X, y=y)
    return model

In [None]:
model_type = "svm"

start_time = time.time()
model_obj = train_model(model_type,train_data,{})
elapsed_time = time.time() - start_time
print "Elapsed time is:", elapsed_time

In [70]:
def get_feature_importances(model,train_matrix,top_n):
    features=train_matrix.drop(["app_start_date","ltu","application_id"],1).columns.tolist()
    importance=model.feature_importances_
    feature_matrix=pd.DataFrame({"features":features,"importance":importance})
    feature_matrix.sort_values('importance',inplace=True,ascending=False)
    return feature_matrix[:top_n]

In [77]:
def get_feature_importances2(model,train_matrix,top_n):
    features=train_matrix.drop(["app_start_date","ltu","application_id"],1).columns.tolist()
    coefficients=model.coef_
    #feature_matrix=pd.DataFrame({"features":features,"importance":importance})
    #feature_matrix.sort_values('importance',inplace=True,ascending=False)
    #return feature_matrix[:top_n]
    print coefficients

In [78]:
print get_feature_importances2(model_obj,train_data,5)

ValueError: coef_ is only available when using a linear kernel

In [47]:
def test_model(model_type,model,test_matrix):
    pred = model.predict_proba(test_matrix.drop(['app_start_date','ltu','application_id'],1))
    model_results = pd.Series(index=test_matrix.index, data=pred[:,1])
    labels = pd.DataFrame(test_matrix['ltu'])
    all_model_results = pd.concat([labels,pd.DataFrame(data=model_results,columns=['model_1'])], axis=1)
    return all_model_results

model_results = test_model(model_type,model_obj,test_data)


In [48]:
model_results.head()

Unnamed: 0,ltu,model_1
99,True,0.2
143,True,0.2
191,False,0.4
363,False,0.0
434,False,0.2


In [49]:
model_performance = evaluate_models.evaluate_models(model_results,1000)
print model_performance.head()

Calculating performance metrics for model: model_1
        model_id     k     metric     value
model_1  model_1  1000  precision  0.365000
model_1  model_1  1000     recall  0.143137
model_1  model_1  1000   accuracy  0.680417


In [73]:
for model in [{'type':'rf'},{'type':'svm'},{'type':'knn'},{'type':'tree'},{'type':'adaboost'},{'type':'ann'}]:
    print "Training Model"
    model_obj = train_model(model['type'],train_data,{})
    print "Testing Model"
    model_results = test_model(model['type'],model_obj,test_data)
    print "Evaluating Model Performance"
    model_performance = evaluate_models.evaluate_models(model_results,1000)
    print model_performance.head()
    print get_feature_importances(model_obj,train_data,5)


Training Model
Testing Model
Evaluating Model Performance
Calculating performance metrics for model: model_1
        model_id     k     metric     value
model_1  model_1  1000  precision  0.417000
model_1  model_1  1000     recall  0.163529
model_1  model_1  1000   accuracy  0.692203
                   features  importance
1           time_since_exit    0.164138
0                       age    0.118739
4      experience_prev_prof    0.103594
3  experience_intended_prof    0.102723
2         number_dependents    0.045618
Training Model
Testing Model
Evaluating Model Performance
Calculating performance metrics for model: model_1
        model_id     k     metric     value
model_1  model_1  1000  precision  0.337000
model_1  model_1  1000     recall  0.132157
model_1  model_1  1000   accuracy  0.674071


AttributeError: 'SVC' object has no attribute 'feature_importances_'

In [None]:
model_utils.train_model()

In [None]:
test_data.drop(['app_start_date','ltu','application_id'],1).shape