In [167]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

##Seaborn for fancy plots. 
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (8,8)

## 3950 Assignment 1: Part 2

For this assignment we want to use some sort of tree based model to classify the data below. We have a very small training set, so overfitting is a very real concern. 

Some specifics for this assignment:
<ul>
<li>Please use the show_eda to control if EDA stuff is shown. I don't really need to see all the EDA stuff (nor do you after you've done it), so we can make it configurable with a variable to speed up time. Please set this FALSE when you submit, so I can run all and see the outcome without histograms etc...
<li>Please ensure that whatever model you end up with is in a variable named best at the end.
<li>Please use some pipeline in prepping the data. The test data is in an identical format to the training data, so whatever pipeline you've created for your training will work for the testing. 
<li>The accuracy scoring will be an average of accuracy and roc_auc. 
</ul>

### Grading Metrics
<ul>
<li><b>Pipeline Used - 10pts</b> The data loading needs to be in a pipeline. See the test part for illustration. When testing I'll call your pipe with the new data (format is identical to training), so any prep stuff should be in the pipeline. 
<li><b>Tree Based Model Used - 5pts</b> The model used for classification needs to be some variety of tree, beyond that it is up to you. 
<li><b>Accuracy - 5pts</b> The final accuracy acheived. This will be a rough ranking, I'm assuming most people will get a similar level of accuracy, marks will only be deducted if yours is far wosrse, as that's an indication that you probably didn't take any/many steps to improve things. 
<li><b>Clarity and Formatting - 5pts</b> Is it organized and can I read it?
    <ul>
    <li> <b>Note:</b> for this assignment, and in general, please get rid of my comments and replace them with your own. I'm going to read this, so all of these instructions aren't really required. Think of this as a template, get rid of the stuff that isn't needed, and leave only the things you need to explain your code. 
    </ul>
</ul>

For submission, please drop the URL for your repository in the dropbox.

In [168]:
#Please change to your name.
name = "Rodrigo Jr Gomez"

#Please use this to control EDA. 
show_eda = False

In [169]:
#Load data
df = pd.read_csv("training.csv")
df = df.drop(columns={"id"})
df.sample(5)

Unnamed: 0,target,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,var_200
110,1,0.183,0.711,0.093,0.983,0.398,0.335,0.659,0.223,0.118,...,0.424,0.866,0.871,0.3,0.37,0.021,0.669,0.362,0.06,0.578
70,0,0.497,0.92,0.009,0.607,0.413,0.631,0.753,0.999,0.613,...,0.083,0.572,0.702,0.959,0.936,0.697,0.529,0.123,0.756,0.546
51,1,0.944,0.821,0.079,0.581,0.439,0.21,0.544,0.536,0.771,...,0.801,0.876,0.917,0.429,0.672,0.548,0.298,0.348,0.985,0.646
225,0,0.912,0.654,0.367,0.351,0.22,0.59,0.241,0.941,0.07,...,0.521,0.493,0.738,0.954,0.135,0.044,0.154,0.752,0.861,0.764
175,0,0.727,0.462,0.184,0.176,0.915,0.991,0.575,0.324,0.762,...,0.473,0.802,0.308,0.747,0.248,0.23,0.014,0.907,0.182,0.435


### Starting

For this assignment, you have a small training set, so combatting overfitting is key in being accurate!

In [170]:
df.shape

(250, 201)

In [171]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
target,250.0,0.524000,0.500426,0.000,0.00000,1.0000,1.00000,1.000
var_1,250.0,0.515564,0.295878,0.004,0.24400,0.5440,0.75550,0.989
var_2,250.0,0.499984,0.303769,0.005,0.23425,0.4975,0.76125,0.999
var_3,250.0,0.491172,0.288538,0.007,0.26000,0.4655,0.72500,0.996
var_4,250.0,0.533664,0.284235,0.002,0.29175,0.5165,0.78500,0.997
...,...,...,...,...,...,...,...,...
var_196,250.0,0.487852,0.280595,0.003,0.26075,0.4745,0.72775,0.988
var_197,250.0,0.485272,0.286139,0.013,0.23800,0.4795,0.74775,1.000
var_198,250.0,0.499436,0.300880,0.002,0.24025,0.4840,0.77725,1.000
var_199,250.0,0.484928,0.281306,0.003,0.24550,0.5075,0.70500,0.999


#### Do Modelling Stuff

Make a tree model (of some vareity) and make it fit well. Keep in mind the possibility of your tree overfitting, and think of steps you may need to combat that shoudl it occur. 

**DecisionTree**

In [172]:
from sklearn.model_selection import GridSearchCV
y = np.array(df["target"]).reshape(-1,1)
X = np.array(df.drop(columns={"target"}))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) 


pipeline_steps = [
    ('scaler', StandardScaler() ),
    ('DT', DecisionTreeClassifier()) 
    ]


pipe = Pipeline(pipeline_steps)

#Hyper Paremeter
tree_para = {'DT__max_depth':[3, 4, 5, 6, 7, 8, 9, 10, 11], 
             'DT__min_samples_leaf': [3, 4, 5, 6, 7, 8, 9, 10, 11],
             'DT__splitter': ["best"]}

#3 Much better without criterion
#4 After adding splitter accuracy still remained at .53 so i stopped there

clf = GridSearchCV(pipe, param_grid=tree_para, cv=10) 
clf.fit(X_train, y_train)
Tree = clf.best_estimator_
print(Tree.score(X_test, y_test))
print(Tree)

0.5396825396825397
Pipeline(steps=[('scaler', StandardScaler()),
                ('DT',
                 DecisionTreeClassifier(max_depth=7, min_samples_leaf=8))])


**Random Forest**

In [173]:
from sklearn.model_selection import GridSearchCV
y = np.array(df["target"]).reshape(-1,1)
X = np.array(df.drop(columns={"target"}))
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


pipeline_steps = [
    ('scaler', StandardScaler() ),
    ('rt', RandomForestClassifier(random_state=0)) 
    ]


pipe = Pipeline(pipeline_steps)

#Hyper parameter
tree_para = {'rt__max_depth':[8, 9, 10, 11],
             'rt__criterion':["gini"],
             }

#1  max depth 3,4,5,6,7,8 resulted in accuracy 0.6
#2  max depth 8, 9, 10, 11 resulted in accuracy 0.71. with max depth suppose to be 8
#3  added rt_criterion gini still .71
#4  change rt_criterion to entropy and ended up with .63 so keeping gini is better. Didnt try log_loss since i dont understand it
# Note to d best teacher, I tried doing rt__criterion:["gini", "entropy"] and the model liked the entropy and the depth changed to 11 even though the accuracy is .63, any reason to why doing 
# both at the same time resulted on a lower accuracy compared to me doing rt_criterion individually? 


clf = GridSearchCV(pipe, param_grid=tree_para, cv=10)
clf.fit(X_train, y_train)
Forest = clf.best_estimator_
print(Forest.score(X_test, y_test))
print(Forest)


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

0.7142857142857143
Pipeline(steps=[('scaler', StandardScaler()),
                ('rt', RandomForestClassifier(max_depth=8, random_state=0))])


### Finishing

At the conclusion, please name your best model "best". If you look down below in the testing stuff, it should be usable to score as "best". 

You should be able to call it like this and it should work (with whatever data names you have)

In [174]:
if Tree.score(X_test, y_test) > Forest.score(X_test, y_test):
    best=Tree
    print('Decision Tree is best model')
else:
    best=Forest
    print('RandomForestClassifier is best model')

    

RandomForestClassifier is best model


In [175]:
print(best.score(X_test, y_test))
print(best)

0.7142857142857143
Pipeline(steps=[('scaler', StandardScaler()),
                ('rt', RandomForestClassifier(max_depth=8, random_state=0))])


### Testing

Please leave the stuff below as-is in your file. 

This will take your best model and score it with the test data. If you want to test to make sure that yours works, make a copy of the data file and rename it testing.csv, then make sure this runs ok. I will do the same, but the contents of my test file will be different. 

In [176]:
#Load Test Data
test_df = pd.read_csv("testing.csv")
test_df = test_df.drop(columns={"id"})
#Create tests and score
test_y = np.array(test_df["target"]).reshape(-1,1)
test_X = np.array(test_df.drop(columns={"target"}))

preds = best.predict(test_X)

roc_score = roc_auc_score(test_y, preds)
acc_score = accuracy_score(test_y, preds)

print(roc_score)
print(acc_score)
print(name, np.mean([roc_score, acc_score]))


0.6213109820656461
0.6210126582278481
Rodrigo Jr Gomez 0.621161820146747


### What Accuracy Changes Were Used

Please list here what you did to try to increase accuracy and/or limit overfitting:
<li> Avoided Criterion for decision tree since it decreased accuracy
<li> Stopped adding parameter for Decision tree after the splitter since accuracy remained the same
<li> For random Forest, seeing that adding a lot of numbers for max depth resulted in slower load time, i opted to test 2 set of number which resulted in faster loading
<li> Criterion increased the accuracy for random forest and stopped adding parameters after getting .71 on training
