In [23]:
import pandas as pd
import numpy as np
import imblearn
from imblearn.under_sampling import NearMiss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

# Creating The Data Set

In [5]:
df = pd.read_csv('diabetes.csv')
undersample = NearMiss(version=1)
X = df.loc[:, df.columns != 'Diabetes_binary']
y = df.loc[:, df.columns == 'Diabetes_binary']
X, y = undersample.fit_resample(X, y)

print(X.info())
print(y.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   HighBP                70692 non-null  float64
 1   HighChol              70692 non-null  float64
 2   CholCheck             70692 non-null  float64
 3   BMI                   70692 non-null  float64
 4   Smoker                70692 non-null  float64
 5   Stroke                70692 non-null  float64
 6   HeartDiseaseorAttack  70692 non-null  float64
 7   PhysActivity          70692 non-null  float64
 8   Fruits                70692 non-null  float64
 9   Veggies               70692 non-null  float64
 10  HvyAlcoholConsump     70692 non-null  float64
 11  AnyHealthcare         70692 non-null  float64
 12  NoDocbcCost           70692 non-null  float64
 13  GenHlth               70692 non-null  float64
 14  MentHlth              70692 non-null  float64
 15  PhysHlth           

# Splitting The Dataset

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

df_undersampled_train = pd.DataFrame(X_train_scaled, columns = X.columns)
df_undersampled_train['Diabetes_binary'] = y_train
df_undersampled_train.head()

df_undersampled_test = pd.DataFrame(X_test_scaled, columns = X.columns)
df_undersampled_test['Diabetes_binary'] = y_test
df_undersampled_test.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,-1.212894,0.876922,0.074482,-1.061978,1.158253,-0.225623,-0.384172,0.514775,-1.494245,0.418265,...,-0.237966,1.255723,-0.332699,-0.446002,-0.485309,-1.036097,0.367769,-1.162526,-0.619867,
1,-1.212894,-1.140353,0.074482,0.377975,1.158253,-0.225623,-0.384172,0.514775,0.669234,0.418265,...,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,0.965161,-2.279439,0.864409,0.843982,
2,0.824475,0.876922,0.074482,1.017954,-0.863369,-0.225623,-0.384172,0.514775,0.669234,0.418265,...,-0.237966,0.307887,-0.332699,-0.220989,-0.485309,0.965161,-0.073432,0.864409,0.843982,
3,-1.212894,0.876922,0.074482,0.377975,-0.863369,-0.225623,-0.384172,0.514775,0.669234,0.418265,...,-0.237966,-0.639948,-0.332699,-0.446002,-0.485309,-1.036097,-0.514634,0.864409,0.356032,
4,0.824475,-1.140353,0.074482,2.777896,-0.863369,-0.225623,-0.384172,0.514775,0.669234,0.418265,...,-0.237966,1.255723,4.162593,0.679064,2.060543,0.965161,-1.397036,0.864409,-1.595766,0.0


# Looking At The Results That LassoCV Yields

In [25]:
lasso = LassoCV(cv=5, random_state=0).fit(X_train_scaled, y_train)

coef = lasso.coef_
col = X.columns
for index in range(len(coef)):
    if coef[index] > 0.015:
        print(f'{col[index]}: {np.round(coef[index], 3)}')

  y = column_or_1d(y, warn=True)


HighBP: 0.031
BMI: 0.065
Smoker: 0.016
HeartDiseaseorAttack: 0.029
HvyAlcoholConsump: 0.017
GenHlth: 0.11
MentHlth: 0.016
PhysHlth: 0.024
DiffWalk: 0.03


# Looking At The Results That Correlation Yields

In [18]:
corr = df_undersampled_train.corr()
corr

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
HighBP,1.0,0.290281,0.01919,0.252404,0.132512,0.119859,0.192153,-0.185341,-0.103985,-0.123597,...,0.077367,0.322079,0.124688,0.188203,0.225602,-0.011667,0.275838,-0.227568,-0.28253,-0.012546
HighChol,0.290281,1.0,0.012837,0.12913,0.12524,0.081647,0.160192,-0.123335,-0.089436,-0.084961,...,0.065678,0.223396,0.117957,0.139995,0.147227,-0.00769,0.163192,-0.13482,-0.163382,-0.01103
CholCheck,0.01919,0.012837,1.0,-0.004101,-0.00519,-0.000986,-0.003985,0.019316,0.015673,0.010999,...,-0.059807,-0.020087,-0.027875,-0.014818,-0.009001,-0.010735,0.022321,0.006402,0.031126,-0.002274
BMI,0.252404,0.12913,-0.004101,1.0,0.063508,0.057133,0.098065,-0.255187,-0.158303,-0.123689,...,0.129655,0.345232,0.201497,0.255436,0.316306,-0.04402,-0.112717,-0.201685,-0.242094,-0.004097
Smoker,0.132512,0.12524,-0.00519,0.063508,1.0,0.072942,0.144389,-0.102905,-0.102695,-0.060063,...,0.042161,0.181521,0.108281,0.140699,0.140045,0.115278,0.145365,-0.17196,-0.152527,-0.006953
Stroke,0.119859,0.081647,-0.000986,0.057133,0.072942,1.0,0.233298,-0.126957,-0.046131,-0.084637,...,0.077127,0.223366,0.141691,0.210303,0.236295,-0.019964,0.083517,-0.120345,-0.194099,-0.002408
HeartDiseaseorAttack,0.192153,0.160192,-0.003985,0.098065,0.144389,0.233298,1.0,-0.147135,-0.063526,-0.085198,...,0.083611,0.311562,0.140186,0.246269,0.271997,0.073961,0.173327,-0.153377,-0.216515,-0.004789
PhysActivity,-0.185341,-0.123335,0.019316,-0.255187,-0.102905,-0.126957,-0.147135,1.0,0.165934,0.204442,...,-0.120715,-0.372282,-0.224465,-0.333044,-0.363598,0.081863,-0.080338,0.273416,0.307727,0.002891
Fruits,-0.103985,-0.089436,0.015673,-0.158303,-0.102695,-0.046131,-0.063526,0.165934,1.0,0.245612,...,-0.07079,-0.189348,-0.10377,-0.113491,-0.116434,-0.09609,0.042817,0.145128,0.140828,0.006887
Veggies,-0.123597,-0.084961,0.010999,-0.123689,-0.060063,-0.084637,-0.085198,0.204442,0.245612,1.0,...,-0.088002,-0.214869,-0.119102,-0.151267,-0.165544,-0.027881,-0.029046,0.214979,0.238881,-0.001889


# These Are The Correlation Results

In [19]:
corr_target = abs(corr["Diabetes_binary"])
relevant_features = corr_target[corr_target>0.006]
relevant_features

HighBP             0.012546
HighChol           0.011030
Smoker             0.006953
Fruits             0.006887
AnyHealthcare      0.007229
MentHlth           0.011795
DiffWalk           0.012193
Sex                0.012702
Income             0.008735
Diabetes_binary    1.000000
Name: Diabetes_binary, dtype: float64

# Creating A Variable Containing The Training and Testing Splits Of The Correlation Variables

* The Features that were selected for correlation:
    * Sex
    * HighBP
    * DiffWalk
    * MentHlth
    * HighChol
    * AnyHealthCare
    * Smoker
    * Fruits
    * Income


* Below is going to be the creation and presentation of the dataframe to see its details

In [20]:
X_selected_train = df_undersampled_train.loc[:, ['Sex', 'HighBP', 'DiffWalk', 'MentHlth', 'HighChol', 'AnyHealthcare', 'Smoker',
                       'Fruits','Income']]
print(X_selected_train.info())

X_selected_test = df_undersampled_test.loc[:, ['Sex', 'HighBP', 'DiffWalk', 'MentHlth', 'HighChol', 'AnyHealthcare', 'Smoker',
                       'Fruits','Income']]
print(X_selected_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49484 entries, 0 to 49483
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sex            49484 non-null  float64
 1   HighBP         49484 non-null  float64
 2   DiffWalk       49484 non-null  float64
 3   MentHlth       49484 non-null  float64
 4   HighChol       49484 non-null  float64
 5   AnyHealthcare  49484 non-null  float64
 6   Smoker         49484 non-null  float64
 7   Fruits         49484 non-null  float64
 8   Income         49484 non-null  float64
dtypes: float64(9)
memory usage: 3.4 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21208 entries, 0 to 21207
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sex            21208 non-null  float64
 1   HighBP         21208 non-null  float64
 2   DiffWalk       21208 non-null  float64
 3   MentHlth       21208 non-null 

# Creating A Variable Containing The Training and Testing Splits Of The Lasso Variables

* The Featurs that were selected for Lasso:
    * HighBP
    * BMI
    * Smoker
    * HeartDiseaseorAttack
    * HvyAlcoholConsump
    * GenHlth
    * MentHlth
    * PhysHlth
    * DiffWalk


* Below is the creating of the datadrame containing the features and presentation of details concerning them

In [29]:
X_selected_train = df_undersampled_train.loc[:, ['HighBP', 'BMI', 'Smoker','HeartDiseaseorAttack', 'HvyAlcoholConsump',
                                                 'GenHlth','MentHlth', 'PhysHlth', 'DiffWalk']]
print(X_selected_train.info())

X_selected_test = df_undersampled_test.loc[:, ['HighBP', 'BMI', 'Smoker','HeartDiseaseorAttack', 'HvyAlcoholConsump',
                                               'GenHlth','MentHlth', 'PhysHlth', 'DiffWalk']]
print(X_selected_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49484 entries, 0 to 49483
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   HighBP                49484 non-null  float64
 1   BMI                   49484 non-null  float64
 2   Smoker                49484 non-null  float64
 3   HeartDiseaseorAttack  49484 non-null  float64
 4   HvyAlcoholConsump     49484 non-null  float64
 5   GenHlth               49484 non-null  float64
 6   MentHlth              49484 non-null  float64
 7   PhysHlth              49484 non-null  float64
 8   DiffWalk              49484 non-null  float64
dtypes: float64(9)
memory usage: 3.4 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21208 entries, 0 to 21207
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   HighBP                21208 non-null  float64
 1   BMI                 

# Both Methods Are Going To Be Tested On Models From Step 1 And The Method With The Best Results Will Be Choosen

* In the future tests you will see that Lasso performs the best having a significantly greater accuracy than the correlation set