# Fast and Cheap (BreakoutRoom #3)

This team can utilize any model and any features but are limited to **only using ~35% of the training data**

In [1]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, plot_roc_curve
from sklearn import tree

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import recall_score, precision_score, f1_score

In [2]:
import pandas as pd
df = pd.read_csv('Diabetes_Data/diabetes_reduced_train.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
585,1,93,56,11,0,22.5,0.417,22,0
717,10,94,72,18,0,23.1,0.595,56,0
175,8,179,72,42,130,32.7,0.719,36,1
86,13,106,72,54,0,36.6,0.178,45,0
119,4,99,76,15,51,23.2,0.223,21,0


In [3]:
df.Outcome.value_counts(normalize=True)

0    0.632287
1    0.367713
Name: Outcome, dtype: float64

In [4]:
X = df.drop(columns='Outcome')
y = df.Outcome

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20,
                                                   random_state=42,
                                                   stratify=y)

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [7]:
# Do your magic!
lr = LogisticRegression(class_weight = 'Balanced')
lr.fit(X_train_scaled, y_train)

y_hat_test = lr.predict(X_test_scaled)
y_hat_train = lr.predict(X_train_scaled)

In [8]:
# Printing the metrics nicely
metrics = {"Accuracy": accuracy_score,
           "Recall": recall_score,
           "Precision": precision_score,
           "F1-Score": f1_score}

for name, metric in metrics.items():
    print(f"{name}:"); print("="*len(name))
    print(f"TRAIN: {metric(y_train, y_hat_train):.4f}")
    print(f"TEST: {metric(y_test, y_hat_test):.4f}")
    print("*" * 15)

Accuracy:
TRAIN: 0.8202
TEST: 0.7778
***************
Recall:
TRAIN: 0.7077
TEST: 0.6471
***************
Precision:
TRAIN: 0.7797
TEST: 0.7333
***************
F1-Score:
TRAIN: 0.7419
TEST: 0.6875
***************


### Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
clf = RandomForestClassifier(n_estimators=15,
                             criterion='gini',
                             max_features='auto',
                             oob_score=True)

In [34]:
clf.fit(X_train_scaled, y_train)
print(clf.score(X_train_scaled, y_train))
print(clf.score(X_test, y_test))

0.9775280898876404
0.4


In [35]:
Outcome2 = clf.predict_log_proba(holdout_df)
Outcome2

array([[-1.60943791, -0.22314355],
       [-1.09861229, -0.40546511],
       [-1.60943791, -0.22314355],
       [-0.91629073, -0.51082562],
       [-0.40546511, -1.09861229],
       [-1.32175584, -0.31015493],
       [-1.60943791, -0.22314355],
       [-1.09861229, -0.40546511],
       [-0.76214005, -0.62860866],
       [-1.60943791, -0.22314355],
       [-1.09861229, -0.40546511],
       [-1.09861229, -0.40546511],
       [-0.76214005, -0.62860866],
       [-1.60943791, -0.22314355],
       [-1.60943791, -0.22314355],
       [-1.60943791, -0.22314355],
       [-1.60943791, -0.22314355],
       [-1.09861229, -0.40546511],
       [-2.01490302, -0.14310084],
       [-2.01490302, -0.14310084],
       [-1.60943791, -0.22314355],
       [-1.60943791, -0.22314355],
       [-1.60943791, -0.22314355],
       [-1.09861229, -0.40546511],
       [-1.60943791, -0.22314355],
       [-2.01490302, -0.14310084],
       [-0.76214005, -0.62860866],
       [-2.01490302, -0.14310084],
       [-1.09861229,

In [12]:
# Then use your model to predict the outcomes of the holdout_df
holdout_df = pd.read_csv('Diabetes_data/holdout_df.csv')
holdout_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
540,8,100,74,40,215,39.4,0.661,43
307,0,137,68,14,148,24.8,0.143,21
745,12,100,84,33,105,30.0,0.488,46
691,13,158,114,0,0,42.3,0.257,44
564,0,91,80,0,0,32.4,0.601,27


In [13]:
X1 = holdout_df

In [14]:
Outcome1 = lr.predict(X1)

In [15]:
Outcome1

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
      dtype=int64)

In [19]:
OC = pd.DataFrame(Outcome1)

In [21]:
# And store those outcomes in the 'Outcome' column of this submission_df 
submission_df = pd.read_csv('Diabetes_Data/submission_df.csv')

In [22]:
submission_df.head()

Unnamed: 0.1,Unnamed: 0,Outcome
0,540,
1,307,
2,745,
3,691,
4,564,


In [23]:
submission_df.Outcome = Outcome1

In [24]:
submission_df.head()

Unnamed: 0.1,Unnamed: 0,Outcome
0,540,1
1,307,1
2,745,1
3,691,1
4,564,1


In [39]:
submission_df.to_pickle('Group_3')