Diana's Code:

In [None]:
import numpy as np
import datetime as dt
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, LinearRegression,LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

sns.set_style("darkgrid")

In [None]:
# Read the county-level CSV (path is relative to the notebook's working directory).
csv_path = 'Diabetes with Population Info by County 2017.csv'
diabetes = pd.read_csv(csv_path)

## 2: Exploratory Data Analysis

In [None]:
# Preview the first five rows of the table as loaded.
diabetes.head()

In [None]:
# (number of rows, number of columns).
diabetes.shape

In [None]:
# Per-column summary statistics.
diabetes.describe()

In [None]:
# Column dtypes as parsed by read_csv; the preprocessing section below coerces
# every column to numeric.
diabetes.dtypes


### Data Preprocessing

In [None]:
diabetes = diabetes.apply(pd.to_numeric, errors = 'coerce')
diabetes.head()

In [None]:
diabetes = diabetes.drop(['County', 'State'], axis = 1)
diabetes.head()

In [None]:
diabetes_nomiss=diabetes.dropna() 

In [None]:
diabetes_nomiss.shape

In [None]:
diabetes_nomiss.dtypes

In [None]:
diabetes_rates = diabetes_nomiss.drop (["CountyFIPS", "race_total population"], axis = 1).div(diabetes_nomiss['race_total population'], axis = "index")
diabetes_rates['CountyFIPS'] = diabetes_nomiss['CountyFIPS']
diabetes_rates['race_total population'] = diabetes_nomiss['race_total population']

In [None]:
diabetes_rates.shape

In [None]:
diabetes_rates.head()

In [None]:
diabetes_rates.describe()

### Visualization and Plotting

In [None]:
diabetes_rates.hist(column='Diabetes_Number')
diabetes_rates.hist(column='Obesity_Number')
diabetes_rates.hist(column='Physical_Inactivity_Number')
plt.show()

In [None]:
boxplot = diabetes_rates.boxplot(column=['Diabetes_Number', 'Obesity_Number', 'Physical_Inactivity_Number'])

In [None]:
diabetes_rates.plot.scatter(x='race_total population_one race_white', y='Diabetes_Number')
diabetes_rates.plot.scatter(x='race_total population_one race_black or african american', y='Diabetes_Number')
diabetes_rates.plot.scatter(x='race_total population_one race_asian', y='Diabetes_Number')
diabetes_rates.plot.scatter(x='hispanic or latino and race_total population_hispanic or latino (of any race)', y='Diabetes_Number')


In [None]:
diabetes_rates.plot.scatter(x='Obesity_Number', y='Diabetes_Number')
diabetes_rates.plot.scatter(x='Physical_Inactivity_Number', y='Diabetes_Number')

In [None]:
diabetes_rates.plot.scatter(x='sex and age_total population_65 years and over', y='Diabetes_Number')

### Covariance and Correlation

In [None]:
# Pairwise covariance between all columns of the per-capita table
# (this includes CountyFIPS and the raw total population).
diabetes_rates.cov(min_periods=1)

In [None]:
# Pairwise Pearson correlation between the same columns.
diabetes_rates.corr(method='pearson', min_periods=1)

In [None]:
diabetes_rates.shape

## 3: Prepare to Fit Model

### 3.1: Data Cleaning

In [None]:
diabetes_rates['diabetes_top_half'] = np.where(diabetes_rates['Diabetes_Number'] > np.nanquantile(diabetes_rates['Diabetes_Number'], q=0.50), 1, 0)
diabetes_rates.head()


In [None]:
diabetes_top_half = diabetes_rates
diabetes_top_half.head()

### 3.2: Partition Data

#### Train/Test/Validation Split

In [None]:
X = diabetes_rates.drop(['Diabetes_Number', 'diabetes_top_half'], axis = 1)
y = diabetes_rates['Diabetes_Number']
X.head()

In [None]:
from sklearn.model_selection import train_test_split 

# set the random seed
np.random.seed(10)

# split the data
# train_test_split returns 4 values: X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.80, test_size=0.20)

In [None]:
Xclass = diabetes_top_half.drop(['Diabetes_Number', 'diabetes_top_half', 'CountyFIPS', 'race_total population'], axis = 1)
yclass= diabetes_top_half['diabetes_top_half']
Xclass.head()

In [None]:
from sklearn.model_selection import train_test_split 

# set the random seed
np.random.seed(10)

# split the data
# train_test_split returns 4 values: X_train, X_test, y_train, y_test

Xclass_train, Xclass_test, yclass_train, yclass_test = train_test_split(Xclass, yclass,
                                                    train_size=0.80, test_size=0.20)

In [None]:
print("Dimensions of Dataframe are", X.shape)

#### Validation Set

In [None]:
# split the data
# Returns 4 values: X_train, X_validate, y_train, y_validate

X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train,
                                                    train_size=0.75, test_size=0.25)

In [None]:
print("Shape of X train is", X_train.shape)
print("Shape of y_train is", y_train.shape)
print("Shape of X_test is", X_test.shape)
print("Shape of y_test is", y_test.shape)

In [None]:
y_train.head()

In [None]:
# split the data
# Returns 4 values: X_train, X_validate, y_train, y_validate

Xclass_train, Xclass_validate, yclass_train, yclass_validate = train_test_split(Xclass_train, yclass_train,
                                                    train_size=0.75, test_size=0.25)

In [None]:
print("Shape of X train is", Xclass_train.shape)
print("Shape of y_train is", yclass_train.shape)
print("Shape of X_test is", Xclass_test.shape)
print("Shape of y_test is", yclass_test.shape)

In [None]:
yclass_train.head()

### 3.3: Feature Selection

In [None]:
print(X_train.columns)

In [None]:
cor_matrix = X_train.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.90)]
X_train.drop(to_drop, axis=1, inplace=True)
X_validate.drop(to_drop, axis=1, inplace=True)
X_test.drop(to_drop, axis=1, inplace=True)

In [None]:
cor_matrix = Xclass_train.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.90)]
Xclass_train.drop(to_drop, axis=1, inplace=True)
Xclass_validate.drop(to_drop, axis=1, inplace=True)
Xclass_test.drop(to_drop, axis=1, inplace=True)

## 4: Train Models

### 4.1: Model Description

A. Choose 5 different machine learning techniques. See the available ones in the scikit-learn documentation.
1. Linear Regression
2. Ridge Regression
3. Lasso Regression
4. Logistic Regression
5. TBD

B. Detail the basic logic and assumptions underlying each model, its pros/cons, and why it is a plausible choice for this problem.



### 4.2 Train Models

#### 1. Ordinary Least Squares Regression

In [None]:
# create a model
lin_reg = LinearRegression(normalize=True)

# fit the model
lin_model = lin_reg.fit(X_train, y_train)

In [None]:
print(lin_model.coef_)
print(lin_model.intercept_)

In [None]:
# predict the number of riders
lin_pred = lin_model.predict(X_validate)

# plot the residuals on a scatter plot
plt.scatter(y_validate, lin_pred)
plt.title('Linear Model (OLS) Predicted v. Actual')
plt.xlabel('actual value')
plt.ylabel('predicted value')
plt.show()

In [None]:
def rmse(pred, actual):
    """Return the root-mean-squared error between predictions and actual values.

    ``pred`` and ``actual`` are combined with ``-``, so they must support
    element-wise arithmetic with each other (e.g. NumPy arrays or pandas Series).
    """
    residuals = pred - actual
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

In [None]:
# Validation RMSE of the OLS model.
rmse(lin_pred, y_validate)

#### 2. Ridge Regression

In [None]:
# make and fit a Ridge regression model
ridge_reg = Ridge() 
ridge_model = ridge_reg.fit(X_train, y_train)
ridge_reg_data = pd.DataFrame([ridge_model.coef_, X.columns]).T
ridge_reg_data.columns = ['Coefficient', 'Feature']

In [None]:
# use the model to make predictions
ridge_pred = ridge_model.predict(X_validate)

# plot the predictions
plt.scatter(y_validate, ridge_pred)
plt.title('Ridge Model')
plt.xlabel('actual values')
plt.ylabel('predicted values')
plt.show()

In [None]:
# calculate the rmse for the Ridge model
rmse(ridge_pred, y_validate)

#### 3. Lasso Regression

In [None]:
# create and fit the model
lasso_reg = Lasso(max_iter=10000) #initialize the model, add a hyperparameter 

lasso_model = lasso_reg.fit(X_train, y_train) #fit the model
lasso_reg_data = pd.DataFrame([lasso_model.coef_, X.columns]).T #save it in this dataframe
lasso_reg_data.columns = ['Coefficient', 'Feature']

In [None]:
# use the model to make predictions
lasso_pred = lasso_model.predict(X_validate)

# plot the predictions
plt.scatter(y_validate, lasso_pred)
plt.title('LASSO Model')
plt.xlabel('actual values')
plt.ylabel('predicted values')
plt.show()

In [None]:
# calculate the rmse for the LASSO model
rmse(lasso_pred, y_validate)

#### 4. Logistic Regression

*Earlier draft of the classification split, kept as reference text only. It references `diabetes_topquart`, which is not defined anywhere above; the split used by the model below is the `Xclass_*` one built in section 3.2.*

```python
X = diabetes_topquart.drop(['Diabetes_Number', 'diabetes_topquart', 'CountyFIPS', 'race_total population'], axis = 1)
y = diabetes_topquart['diabetes_topquart']
X.head()

from sklearn.model_selection import train_test_split 

# set the random seed
np.random.seed(10)

# split the data
# train_test_split returns 4 values: X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.80, test_size=0.20)

print("Dimensions of Dataframe are", X.shape)

y_train.head(50)

# split the data
# Returns 4 values: X_train, X_validate, y_train, y_validate

X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train,
                                                    train_size=0.75, test_size=0.25)

print("Shape of X train is", X_train.shape)
print("Shape of y_train is", y_train.shape)
print("Shape of X_test is", X_test.shape)
print("Shape of y_test is", y_test.shape)

y_train.head(50)
```

In [None]:
# Logistic regression classifier with scikit-learn's default settings.
log_reg = LogisticRegression()

# Fit on the classification training split (fit() returns the estimator itself).
log_model = log_reg.fit(Xclass_train, yclass_train)

# Predicted class labels for the classification validation split.
y_pred = log_model.predict(Xclass_validate)

In [None]:
# Check that both classes (0 and 1) appear among the predictions, with their counts.
np.unique(y_pred, return_counts=True)

In [None]:
# Confusion matrix on the validation split. normalize="true" divides each row
# by the number of true samples in that class, so every row sums to 1.
cf_matrix = confusion_matrix(yclass_validate, y_pred, normalize="true")

# Label the rows (true class) and columns (predicted class) directly; class 0
# is the bottom half and class 1 the top half of the diabetes-rate distribution.
class_labels = ["Bottom 50%", "Top 50%"]
df_cm = pd.DataFrame(cf_matrix, index=class_labels, columns=class_labels)

plt.figure(figsize=(10, 7))
sns.set(font_scale=1.4)  # for label size
sns.heatmap(df_cm,
            annot=True,
            annot_kws={"size": 16},
            fmt='g')

plt.title("Confusion Matrix - Logistic Regression")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

#### Hyperparameter Tuning

In [None]:
# Search grid for OLS. The values must be real booleans: the strings 'True' and
# 'False' are both truthy (and newer scikit-learn rejects them outright), so a
# string grid never actually switches the intercept off.
# `normalize` was removed from scikit-learn's linear models in 1.2 and is no
# longer searched; to tune feature scaling, wrap the estimator in a Pipeline
# with a StandardScaler step instead.
param_grid = {'fit_intercept': [True, False]}

# 3-fold cross-validated search, scored with the estimator's default R^2.
# The `iid` argument was removed in scikit-learn 0.24.
lin_grid_reg = GridSearchCV(lin_reg, param_grid, cv=3)
lin_grid_reg.fit(X_train, y_train)

# best_estimator_ is refit on all of X_train with the winning parameters.
best_index = lin_grid_reg.best_index_
best_lin_pred = lin_grid_reg.best_estimator_.predict(X_validate)

print(lin_grid_reg.best_params_)
print('Best CV R^2:', lin_grid_reg.best_score_)
print('Validation R^2:', lin_grid_reg.score(X_validate, y_validate))
print('Validation RMSE', rmse(best_lin_pred, y_validate))

In [None]:
# Search grid for Ridge: penalty strength 0.1 ... 0.9, intercept on/off, and
# four solvers. Booleans replace the strings 'True'/'False', which are both
# truthy (and rejected by newer scikit-learn); `normalize` was removed from
# scikit-learn's linear models in 1.2 and is no longer searched.
param_grid = {'alpha': np.arange(.1, 1, .1),
              'fit_intercept': [True, False],
              'solver': ['auto', 'svd', 'cholesky', 'lsqr']}

# 3-fold cross-validated search, scored with the estimator's default R^2.
# The `iid` argument was removed in scikit-learn 0.24.
ridge_grid_reg = GridSearchCV(ridge_reg, param_grid, cv=3)
ridge_grid_reg.fit(X_train, y_train)

# best_estimator_ is refit on all of X_train with the winning parameters.
best_index = ridge_grid_reg.best_index_
best_ridge_pred = ridge_grid_reg.best_estimator_.predict(X_validate)

print(ridge_grid_reg.best_params_)
print('Best CV R^2:', ridge_grid_reg.best_score_)
print('Validation R^2:', ridge_grid_reg.score(X_validate, y_validate))
print('Validation RMSE', rmse(best_ridge_pred, y_validate))

In [None]:
# Search grid for Lasso: penalty strength 0.1 ... 0.9, intercept on/off, and
# the coordinate-update order. Booleans replace the strings 'True'/'False',
# which are both truthy (and rejected by newer scikit-learn); `normalize` was
# removed from scikit-learn's linear models in 1.2 and is no longer searched.
param_grid = {'alpha': np.arange(.1, 1, .1),
              'fit_intercept': [True, False],
              'selection': ['cyclic', 'random']}

# 3-fold cross-validated search, scored with the estimator's default R^2.
# The `iid` argument was removed in scikit-learn 0.24.
lasso_grid_reg = GridSearchCV(lasso_reg, param_grid, cv=3)
lasso_grid_reg.fit(X_train, y_train)

# best_estimator_ is refit on all of X_train with the winning parameters.
best_index = lasso_grid_reg.best_index_
best_lasso_pred = lasso_grid_reg.best_estimator_.predict(X_validate)

print(lasso_grid_reg.best_params_)
print('Best CV R^2:', lasso_grid_reg.best_score_)
print('Validation R^2:', lasso_grid_reg.score(X_validate, y_validate))
print('Validation RMSE', rmse(best_lasso_pred, y_validate))