# Capstone Notes: Modeling  
<hr>  

* Preprocessing (Normalize)
* Train Test Split
> Grid Search then Test to find the best Parameters
* Apply Different Models
* Evaluate
> Metrics such as the Confusion Matrix

In [None]:
# Pandas is a software library written for the Python programming language for data manipulation and analysis.
import pandas as pd
# NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays
import numpy as np
# Matplotlib is a plotting library for python and pyplot gives us a MatLab like plotting framework. We will use this in our plotter function to plot data.
import matplotlib.pyplot as plt
#Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import seaborn as sns
# Preprocessing allows us to standardize our data
from sklearn import preprocessing
# Allows us to split our data into training and testing data
from sklearn.model_selection import train_test_split
# Allows us to test parameters of classification algorithms and find the best one
from sklearn.model_selection import GridSearchCV
# Logistic Regression classification algorithm
from sklearn.linear_model import LogisticRegression
# Support Vector Machine classification algorithm
from sklearn.svm import SVC
# Decision Tree classification algorithm
from sklearn.tree import DecisionTreeClassifier
# K Nearest Neighbors classification algorithm
from sklearn.neighbors import KNeighborsClassifier

In [None]:
def plot_confusion_matrix(y,y_predict):
    """Draw the confusion matrix of true vs. predicted labels as an annotated heatmap.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_predict : array-like
        Predicted class labels, aligned with ``y``.

    Returns
    -------
    None
        The heatmap is drawn on the current matplotlib figure as a side effect.

    Notes
    -----
    Exactly two tick labels are set per axis, so a 2x2 matrix is expected.
    NOTE(review): presumably the lower-sorted class means "did not land" and the
    higher one "landed" -- confirm against the encoding of the target column.
    """
    from sklearn.metrics import confusion_matrix

    # One shared label list keeps the wording identical on both axes
    # (the x axis previously read 'land' while the y axis read 'landed').
    tick_labels = ['did not land', 'landed']

    cm = confusion_matrix(y, y_predict)
    ax = plt.subplot()
    # annot=True writes the count into each cell; fmt='d' prints it as a plain
    # integer instead of the default '.2g', which turns counts >= 100 into 1e+02.
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(tick_labels)
    ax.yaxis.set_ticklabels(tick_labels)

In [None]:
# Separate the target column from the feature columns
model_scores = dict()  # model name -> score on the test split; read later to pick the best model
y = df["Target"].to_numpy()  # target values as a NumPy array
x = df.drop(columns="Target")  # every column except the target (still a DataFrame at this point)

In [None]:
# Standardize the features; fit_transform learns the scaling and applies it in one call.
# NOTE(review): the scaler is fit on the full data set, before the train/test split,
# so statistics of the held-out rows leak into the scaling. Consider fitting it on
# the training split only.
x = (
    preprocessing.StandardScaler()
    .fit_transform(x)
)

In [None]:
# Split data set: 80 % for training, 20 % held out for testing.
# A fixed random_state makes the shuffle identical on every run, so the scores
# computed from this split are reproducible after Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [None]:
# Tune logistic regression with a 10-fold cross-validated grid search
parameters = {"C":[0.01,0.1,1], 'penalty':['l2'], 'solver': ['lbfgs']} # lr parameters
logreg = LogisticRegression()

logreg_cv = GridSearchCV(logreg, parameters, cv = 10)
# Fit on the training split only. Fitting on the full x, y would let the model
# see the held-out rows, and the "test accuracy" below would no longer be an
# out-of-sample estimate.
logreg_cv.fit(x_train, y_train)

# Predictions and score on the held-out split; the score is computed once and reused
yhat = logreg_cv.predict(x_test)
model_scores["LogReg"] = logreg_cv.score(x_test, y_test)

print("tuned hyperparameters: (best parameters)", logreg_cv.best_params_)
# best_score_ is the mean cross-validated score of the best parameter setting
print("accuracy: ", logreg_cv.best_score_)
print("test accuracy: ", model_scores["LogReg"])
# The plotting helper returns None, so it is called directly instead of being wrapped in print()
plot_confusion_matrix(y_test, yhat)

In [None]:
# other models; follows the same format after initializing the object and the parameters to optimize
# Each grid has its own name: assigning all three to `parameters` would keep only
# the last one (KNN) and silently discard the SVM and Decision Tree grids.

# SVM
svm_parameters = {'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),  # 'rbf' was listed twice; one copy removed
                  'C': np.logspace(-3, 3, 5),
                  'gamma': np.logspace(-3, 3, 5)}
svm = SVC()

# Decision Tree
tree_parameters = {'criterion': ['gini', 'entropy'],
                   'splitter': ['best', 'random'],
                   'max_depth': [2*n for n in range(1,10)],
                   # 'auto' is no longer accepted by current scikit-learn; for
                   # classifiers it was an alias of 'sqrt', so the searched space is unchanged.
                   'max_features': ['sqrt'],
                   'min_samples_leaf': [1, 2, 4],
                   'min_samples_split': [2, 5, 10]}

tree = DecisionTreeClassifier()

# KNN
knn_parameters = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                  'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                  'p': [1,2]}

KNN = KNeighborsClassifier()

# Backward compatibility: this cell used to leave `parameters` bound to the KNN grid.
parameters = knn_parameters

In [None]:
# Collect the model(s) with the highest out-of-sample accuracy; ties are all kept.
# Comprehensions are used so that no loop variable leaks into the notebook
# namespace. A loop variable named `x` would overwrite the feature matrix `x`
# and break any earlier cell that is re-run afterwards.

# Highest score, computed once; None when no model has been scored yet
top_score = max(model_scores.values(), default=None)

# Positions (in insertion order) of the top-scoring entries in model_scores
best_models_idx = [idx for idx, score in enumerate(model_scores.values()) if score == top_score]
# Names of the top-scoring models
best_models = [name for name, score in model_scores.items() if score == top_score]

best_models