In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import getpass
from collections import Counter
from sqlalchemy import create_engine

# Preprocessing
# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# ML Models
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier

# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Sampling
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import ClusterCentroids

In [2]:
# Enter password for connection string
password = getpass.getpass()

········


In [3]:
# Create connection string
conn_string = "postgresql://postgres:" + password + "@b-team-final-project.cct7ahzel1ur.us-west-2.rds.amazonaws.com:5432/B_Team_Final_Project"

In [4]:
# Create the connection
conn=create_engine(conn_string)

In [5]:
# Import health_outcomes table and drop "countyfips"
health_outcomes = pd.read_sql(
    'health_outcomes',
    conn
)
health_outcomes = health_outcomes.apply(pd.to_numeric)
health_outcomes = health_outcomes.drop("countyfips", axis=1)
health_outcomes

Unnamed: 0,arthritis,casthma,bphigh,cancer,highchol,kidney,copd,chd,depression,diabetes,obesity,teethlost,stroke
0,23.0,9.7,34.1,6.1,31.3,3.1,7.1,6.3,21.2,11.8,36.3,18.8,3.3
1,25.6,9.5,33.3,6.5,29.3,2.9,8.0,6.4,21.2,10.7,38.8,17.2,3.4
2,23.7,9.7,32.9,6.4,30.4,2.9,6.7,5.8,19.0,11.0,37.0,17.2,3.1
3,20.0,9.2,29.8,6.0,27.8,2.5,5.0,4.8,17.0,8.3,30.7,12.2,2.7
4,25.7,10.5,36.0,6.6,30.0,3.0,8.4,6.8,23.9,10.5,36.5,20.6,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116,34.1,11.0,41.0,7.0,33.6,3.3,11.8,8.3,28.4,13.1,42.1,27.7,4.2
3117,24.6,9.3,36.4,6.4,32.5,2.6,5.9,5.3,21.6,10.6,37.8,14.2,3.1
3118,19.4,9.6,27.0,6.1,27.1,2.5,5.7,5.1,15.8,9.8,29.8,15.3,2.7
3119,22.9,9.4,29.6,6.3,28.2,2.6,5.8,5.2,19.7,9.7,35.7,14.1,2.8


In [6]:
# Import health_risk_behaviors table and drop "countyfips"
risk_df = pd.read_sql(
    'health_risk_behaviors',
    conn
)
risk_df = risk_df.apply(pd.to_numeric)
risk_df = risk_df.drop("countyfips", axis=1)
risk_df


Unnamed: 0,binge,csmoking,lpa,sleep
0,13.6,17.4,38.5,37.7
1,21.5,21.7,30.3,36.8
2,17.1,18.7,32.1,34.2
3,18.2,15.3,20.5,34.4
4,13.5,21.8,39.1,37.3
...,...,...,...,...
3116,14.6,28.9,40.9,44.7
3117,15.6,17.3,32.3,36.9
3118,16.7,15.5,30.4,43.6
3119,17.9,17.4,29.7,32.5


In [7]:
# Import prevention table and drop "countyfips"
prev_df =  pd.read_sql(
    'prevention',
    conn
)
prev_df = prev_df.apply(pd.to_numeric)
prev_df = prev_df.drop("countyfips", axis=1)
prev_df

Unnamed: 0,access,checkup,dental,bpmed,cholscreen,mammouse,cervical,colon_screen,corem,corew
0,24.2,74.1,61.1,57.9,83.8,71.5,83.0,58.7,25.2,23.7
1,18.9,73.4,57.8,56.8,83.0,68.7,82.2,61.2,24.7,20.6
2,20.5,72.6,59.3,56.8,82.1,69.0,82.6,58.4,24.4,22.3
3,13.6,68.8,68.5,48.5,81.2,67.4,84.1,58.4,20.8,25.1
4,20.5,74.5,58.3,57.7,82.8,69.6,82.3,54.6,25.5,27.2
...,...,...,...,...,...,...,...,...,...,...
3116,13.1,79.6,46.9,63.9,87.3,70.6,82.5,63.3,25.9,23.1
3117,18.3,78.9,55.8,63.2,86.8,67.3,87.2,63.1,29.0,21.4
3118,15.0,80.1,67.5,57.7,87.3,76.5,85.5,62.5,17.3,20.2
3119,18.0,74.3,63.4,56.2,83.2,73.4,84.0,59.5,24.6,26.5


In [8]:
# Create a list of high risk cancer counties
cancer_risk = []
for risk in health_outcomes["cancer"]:
    if risk <= 6.96:
        cancer_risk.append("normal_risk")
    if risk > 6.96:
        cancer_risk.append("high_risk")
Counter(cancer_risk)

Counter({'normal_risk': 3089, 'high_risk': 32})

In [9]:
# Append the list as a column to health_outcomes
health_outcomes["cancer_risk"] = cancer_risk
health_outcomes

Unnamed: 0,arthritis,casthma,bphigh,cancer,highchol,kidney,copd,chd,depression,diabetes,obesity,teethlost,stroke,cancer_risk
0,23.0,9.7,34.1,6.1,31.3,3.1,7.1,6.3,21.2,11.8,36.3,18.8,3.3,normal_risk
1,25.6,9.5,33.3,6.5,29.3,2.9,8.0,6.4,21.2,10.7,38.8,17.2,3.4,normal_risk
2,23.7,9.7,32.9,6.4,30.4,2.9,6.7,5.8,19.0,11.0,37.0,17.2,3.1,normal_risk
3,20.0,9.2,29.8,6.0,27.8,2.5,5.0,4.8,17.0,8.3,30.7,12.2,2.7,normal_risk
4,25.7,10.5,36.0,6.6,30.0,3.0,8.4,6.8,23.9,10.5,36.5,20.6,3.5,normal_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116,34.1,11.0,41.0,7.0,33.6,3.3,11.8,8.3,28.4,13.1,42.1,27.7,4.2,high_risk
3117,24.6,9.3,36.4,6.4,32.5,2.6,5.9,5.3,21.6,10.6,37.8,14.2,3.1,normal_risk
3118,19.4,9.6,27.0,6.1,27.1,2.5,5.7,5.1,15.8,9.8,29.8,15.3,2.7,normal_risk
3119,22.9,9.4,29.6,6.3,28.2,2.6,5.8,5.2,19.7,9.7,35.7,14.1,2.8,normal_risk


In [10]:
# Define X and y
X = prev_df.values
y = health_outcomes["cancer_risk"].values.reshape(-1,1)

In [11]:
# Split the data into training and testing sets
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=2)

# RESAMPLING PREVENTATIVE SERVICES VS CANCER

## SMOTEEN


In [12]:
# Instantiate a SMOTEEN class
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train,y_train)
Counter(y_resampled)

Counter({'high_risk': 2313, 'normal_risk': 2225})

In [13]:
# Scale the data
#scaler = StandardScaler()
#X_train_scaled_resampled = scaler.fit_transform(X_resampled)
#X_test_scaled_resampled = scaler.transform(X_train_scaled_resampled)

In [14]:
#shapes of splitted data
print("X_train:",X_resampled.shape)
print("X_test:",X_test.shape)
print("Y_train:",y_resampled.shape)
print("Y_test:",y_test.shape)

X_train: (4538, 10)
X_test: (781, 10)
Y_train: (4538,)
Y_test: (781, 1)


In [15]:
# Train a logistic regression model using resampled data
model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=1000)
model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=1000, random_state=1)

In [16]:
# get predictions
y_pred = model.predict(X_test)
y_pred[:5]

array(['normal_risk', 'normal_risk', 'normal_risk', 'normal_risk',
       'normal_risk'], dtype=object)

In [17]:
# Print the accuracy score
Accuracy = balanced_accuracy_score(y_test,y_pred)
print(" Accuracy of the model is %.2f" %Accuracy)

 Accuracy of the model is 0.75


In [18]:
# Display a confusion matrix
confusion_matrix(y_test, y_pred)

array([[  5,   3],
       [ 91, 682]], dtype=int64)

In [19]:
# print a classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.05      0.62      0.88      0.10      0.74      0.54         8
normal_risk       1.00      0.88      0.62      0.94      0.74      0.57       773

avg / total       0.99      0.88      0.63      0.93      0.74      0.57       781



## SMOTE

In [20]:
# Resample the training data with SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'normal_risk': 2316, 'high_risk': 2316})

In [21]:
#shapes of splitted data
print("X_train:",X_resampled.shape)
print("X_test:",X_test.shape)
print("Y_train:",y_resampled.shape)
print("Y_test:",y_test.shape)

X_train: (4632, 10)
X_test: (781, 10)
Y_train: (4632,)
Y_test: (781, 1)


In [22]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver="lbfgs", random_state=1, max_iter=1000)
model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=1000, random_state=1)

In [23]:
# Print the accuracy score
Accuracy = balanced_accuracy_score(y_test,y_pred)
print(" Accuracy of the model is %.2f" %Accuracy)

 Accuracy of the model is 0.75


In [24]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  5,   3],
       [ 91, 682]], dtype=int64)

In [25]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.05      0.62      0.88      0.10      0.74      0.54         8
normal_risk       1.00      0.88      0.62      0.94      0.74      0.57       773

avg / total       0.99      0.88      0.63      0.93      0.74      0.57       781



## ClusterCentroids

In [26]:
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({'high_risk': 24, 'normal_risk': 24})

In [27]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1, max_iter=1000)
model.fit(X_resampled, y_resampled)

LogisticRegression(max_iter=1000, random_state=1)

In [28]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
balanced_accuracy

0.7258247089262613

In [29]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  5,   3],
       [134, 639]], dtype=int64)

In [30]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.04      0.62      0.83      0.07      0.72      0.51         8
normal_risk       1.00      0.83      0.62      0.90      0.72      0.53       773

avg / total       0.99      0.82      0.63      0.89      0.72      0.53       781



# ENSEMBLE LEARNERS - PREVENTATIVE SERVICES VS CANCER

## BALANCED RANDOM FOREST CLASSIFIER

In [31]:
# Define X and y
X = prev_df.values
y = health_outcomes["cancer_risk"].values

In [32]:
# Split the data into training and testing sets
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=2)

In [33]:
brf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
brf_model.fit(X_train, y_train)
Counter(y_train)

Counter({'normal_risk': 2316, 'high_risk': 24})

In [34]:
# Calculated the balanced accuracy score
y_pred = brf_model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score

0.8618208279430789

In [35]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  7,   1],
       [117, 656]], dtype=int64)

In [36]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.06      0.88      0.85      0.11      0.86      0.74         8
normal_risk       1.00      0.85      0.88      0.92      0.86      0.74       773

avg / total       0.99      0.85      0.87      0.91      0.86      0.74       781



In [37]:
# List the features sorted in descending order by feature importance
importances = brf_model.feature_importances_
sorted(zip(importances, prev_df.columns), reverse=True)

[(0.16133474550812157, 'checkup'),
 (0.15379810449481965, 'access'),
 (0.14109135980253165, 'cholscreen'),
 (0.12766649221494697, 'bpmed'),
 (0.10431499243348975, 'colon_screen'),
 (0.0757459451582132, 'dental'),
 (0.07402469064877108, 'cervical'),
 (0.06548591210729723, 'corem'),
 (0.057395816633757556, 'corew'),
 (0.03914194099805126, 'mammouse')]

## EASY ENSEMBLE ADABOOST CLASSIFIER

In [38]:
# Instantiate and train the classifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec.fit(X_train, y_train)

EasyEnsembleClassifier(n_estimators=100, random_state=1)

In [39]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score

0.8521183699870634

In [40]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[  7,   1],
       [132, 641]], dtype=int64)

In [41]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.05      0.88      0.83      0.10      0.85      0.73         8
normal_risk       1.00      0.83      0.88      0.91      0.85      0.72       773

avg / total       0.99      0.83      0.87      0.90      0.85      0.72       781

