In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
sns.set()


from sklearn.model_selection import train_test_split

import warnings
warnings.simplefilter("ignore")

In [2]:
df = pd.read_csv(r"framingham.csv")
df.head()

Unnamed: 0,Sex,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,Male,39,4.0,NonCurrent,0.0,No,No,No,No,195.0,106.0,70.0,26.97,80.0,77.0,0
1,Female,46,2.0,NonCurrent,0.0,No,No,No,No,250.0,121.0,81.0,28.73,95.0,76.0,0
2,Male,48,1.0,Current,20.0,No,No,No,No,245.0,127.5,80.0,25.34,75.0,70.0,0
3,Female,61,3.0,Current,30.0,No,No,Yes,No,225.0,150.0,95.0,28.58,65.0,103.0,1
4,Female,46,3.0,Current,23.0,No,No,No,No,285.0,130.0,84.0,23.1,85.0,85.0,0


In [3]:
feat_object=[i for i in df.columns if df[i].dtypes=="object"]
print(feat_object)

['Sex', 'currentSmoker', 'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes']


In [4]:
for i in df.columns:
    if i in feat_object:
        df[i] =df[i].map({"Male":1,"Female":0,"Yes":1,"No":0,"Current":1,"NonCurrent":0})
print(df.head())

   Sex  age  education  currentSmoker  cigsPerDay  BPMeds  prevalentStroke  \
0    1   39        4.0              0         0.0     0.0                0   
1    0   46        2.0              0         0.0     0.0                0   
2    1   48        1.0              1        20.0     0.0                0   
3    0   61        3.0              1        30.0     0.0                0   
4    0   46        3.0              1        23.0     0.0                0   

   prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  heartRate  glucose  \
0             0         0    195.0  106.0   70.0  26.97       80.0     77.0   
1             0         0    250.0  121.0   81.0  28.73       95.0     76.0   
2             0         0    245.0  127.5   80.0  25.34       75.0     70.0   
3             1         0    225.0  150.0   95.0  28.58       65.0    103.0   
4             0         0    285.0  130.0   84.0  23.10       85.0     85.0   

   TenYearCHD  
0           0  
1           0  
2       

In [5]:
for i in df.columns:
    l= df[i].isna().sum()
    if l>0:
        print(i,"column has",df[i].isna().sum(),"null values")

education column has 105 null values
cigsPerDay column has 29 null values
BPMeds column has 53 null values
totChol column has 50 null values
BMI column has 19 null values
heartRate column has 1 null values
glucose column has 388 null values


In [6]:
print(df.dtypes)

Sex                  int64
age                  int64
education          float64
currentSmoker        int64
cigsPerDay         float64
BPMeds             float64
prevalentStroke      int64
prevalentHyp         int64
diabetes             int64
totChol            float64
sysBP              float64
diaBP              float64
BMI                float64
heartRate          float64
glucose            float64
TenYearCHD           int64
dtype: object


In [7]:
for i in df.columns:
    l= df[i].isna().sum()
    if l>0:
        means=df[i].mean()
        print(i,"column has mean of ",means)

education column has mean of  1.9789499153157513
cigsPerDay column has mean of  9.003088619624615
BPMeds column has mean of  0.02962962962962963
totChol column has mean of  236.72158548233045
BMI column has mean of  25.80200758473571
heartRate column has mean of  75.87892376681614
glucose column has mean of  81.96675324675324


In [8]:
for i in df.columns:
    df[i]=df[i].replace(np.NAN,value=df[i].mean())
    print(df[i].isna().sum())

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


In [9]:
target ='TenYearCHD'
y=df[target]

feat=[i for i in df.columns if 'TenYearCHD' not in i]
X=df[feat]

In [10]:
print(df['TenYearCHD'].value_counts(normalize=True))

0    0.848042
1    0.151958
Name: TenYearCHD, dtype: float64


In [11]:
# !pip install scikit-optimize
# from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN
# from imblearn.combine import SMOTEENN
# X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)
# X_train.shape,y_train.shape

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   train_size=0.70,
                                                    random_state=0)




In [12]:
from imblearn.over_sampling import SMOTE,RandomOverSampler,ADASYN

oversample = SMOTE(random_state=1)
X_train_s,y_train_s=oversample.fit_resample(X_train,y_train)

# sm = ADASYN(random_state=42)
# X_train_ada,y_train_ada=sm.fit_resample(X_train,y_train)

# smo=SMOTEENN(random_state=42)
# X_trainse,y_trainse=smo.fit_resample(X_train,y_train)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE,RandomOverSampler

# Part 1

In [14]:
pipeline =make_pipeline(LogisticRegression(solver='saga',max_iter=1000))

In [15]:
param_grid={
    'logisticregression__penalty':['elasticnet'],
    'logisticregression__l1_ratio':[0.2,0.5,0.8],
    'logisticregression__C':[0.1,1,10],
    
    
}

In [16]:
grid_search = GridSearchCV(pipeline,param_grid,cv=5,scoring="accuracy")

In [17]:
model=grid_search.fit(X_train,y_train)

In [18]:
best_params =model.best_params_
best_params

{'logisticregression__C': 1,
 'logisticregression__l1_ratio': 0.2,
 'logisticregression__penalty': 'elasticnet'}

In [19]:
best_estimator=model.best_estimator_
accuracy=best_estimator.score(X_test,y_test)
print('Best Parameters:',best_params)
print('Test_accuracy:',accuracy)

Best Parameters: {'logisticregression__C': 1, 'logisticregression__l1_ratio': 0.2, 'logisticregression__penalty': 'elasticnet'}
Test_accuracy: 0.8443396226415094


In [20]:
from sklearn.metrics import classification_report,accuracy_score

In [21]:
y_pred=model.predict(X_test)

In [22]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      0.84      0.92      1270
           1       0.01      0.50      0.01         2

    accuracy                           0.84      1272
   macro avg       0.50      0.67      0.46      1272
weighted avg       1.00      0.84      0.91      1272



# Part 2

In [23]:
from sklearn.preprocessing import StandardScaler

In [24]:
pipeline ={
    "enet":make_pipeline(StandardScaler(),LogisticRegression(penalty='elasticnet',solver='saga',max_iter=1000))
}

In [25]:
param={
    'logisticregression__l1_ratio':[0.2,0.5,0.8],
    'logisticregression__C':[0.1,1,10],    
    
}

In [26]:
hyper={
    
    'enet':param
}

In [27]:
models={}
for i in pipeline.keys():
    models[i]=GridSearchCV(pipeline[i],hyper[i],cv=5,scoring="accuracy",verbose=-1)

In [28]:
for i in models.keys():
    models[i].fit(X_train,y_train)
    print(i,"is trained and tuned")

enet is trained and tuned


In [29]:
y_pred={}
for i in models.keys():
    y_pred[i]=models[i].predict(X_test)
    print(accuracy_score(y_pred[i],y_test))
    print(classification_report(y_pred[i],y_test))
    print(i,y_pred)

0.8490566037735849
              precision    recall  f1-score   support

           0       0.99      0.85      0.92      1254
           1       0.06      0.67      0.11        18

    accuracy                           0.85      1272
   macro avg       0.53      0.76      0.51      1272
weighted avg       0.98      0.85      0.91      1272

enet {'enet': array([0, 0, 0, ..., 0, 0, 0], dtype=int64)}


# Part 3

In [30]:
pipeline ={
    "enet":make_pipeline(StandardScaler(),LogisticRegression(penalty='elasticnet',max_iter=1000)),
    "l1":make_pipeline(StandardScaler(),LogisticRegression(penalty='l1',max_iter=1000)),
    "l2":make_pipeline(StandardScaler(),LogisticRegression(penalty='l2',max_iter=1000)),
    
}

In [31]:
param_enet={
    'logisticregression__l1_ratio':[0.2,0.5,0.8],
    'logisticregression__C':[0.1,1,10], 
    'logisticregression__solver':['liblinear','saga']
    
}

param_l1={
  'logisticregression__C':[0.1,1,10], 
  'logisticregression__solver':['liblinear','saga']
    
}

param_l2={
  'logisticregression__C':[0.1,1,10],    
  'logisticregression__solver':['liblinear','saga']  
}

In [32]:
hyper={
    'enet':param_enet,
    'l1':param_l1,
    'l2':param_l2
}

In [33]:
models={}
for i in pipeline.keys():
    models[i]=GridSearchCV(pipeline[i],hyper[i],cv=5,scoring="accuracy",verbose=-1)

In [34]:
for i in models.keys():
    models[i].fit(X_train,y_train)
    print(i,"is trained and tuned")

enet is trained and tuned
l1 is trained and tuned
l2 is trained and tuned


In [35]:
for i in models.keys():
    print("\n..........................",i,"model.........................................")
    print(models[i].best_params_)


.......................... enet model.........................................
{'logisticregression__C': 0.1, 'logisticregression__l1_ratio': 0.2, 'logisticregression__solver': 'saga'}

.......................... l1 model.........................................
{'logisticregression__C': 1, 'logisticregression__solver': 'liblinear'}

.......................... l2 model.........................................
{'logisticregression__C': 10, 'logisticregression__solver': 'liblinear'}


In [36]:
best_estimator=model.best_estimator_
accuracy=best_estimator.score(X_test,y_test)
print('Best Parameters:',best_params)
print('Test_accuracy:',accuracy)

best_param={}
accuracy={}
for i in models.keys():
    best_param=models[i].best_estimator_
    accuracy=best_estimator.score(X_test,y_test)
    print('Best Parameters:',best_param)
    print('Test_accuracy:',accuracy)
    

Best Parameters: {'logisticregression__C': 1, 'logisticregression__l1_ratio': 0.2, 'logisticregression__penalty': 'elasticnet'}
Test_accuracy: 0.8443396226415094
Best Parameters: Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=0.1, l1_ratio=0.2, max_iter=1000,
                                    penalty='elasticnet', solver='saga'))])
Test_accuracy: 0.8443396226415094
Best Parameters: Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=1, max_iter=1000, penalty='l1',
                                    solver='liblinear'))])
Test_accuracy: 0.8443396226415094
Best Parameters: Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(C=10, max_iter=1000, solver='liblinear'))])
Test_accuracy: 0.8443396226415094


In [37]:
y_pred={}
for i in models.keys():
    y_pred[i]=models[i].predict(X_test)
    print("\n......................",i,"model........................................")
    print("Accuracy:",accuracy_score(y_pred[i],y_test))
    print(classification_report(y_pred[i],y_test))


...................... enet model........................................
Accuracy: 0.8490566037735849
              precision    recall  f1-score   support

           0       0.99      0.85      0.92      1254
           1       0.06      0.67      0.11        18

    accuracy                           0.85      1272
   macro avg       0.53      0.76      0.51      1272
weighted avg       0.98      0.85      0.91      1272


...................... l1 model........................................
Accuracy: 0.8498427672955975
              precision    recall  f1-score   support

           0       0.99      0.85      0.92      1253
           1       0.07      0.68      0.12        19

    accuracy                           0.85      1272
   macro avg       0.53      0.77      0.52      1272
weighted avg       0.98      0.85      0.91      1272


...................... l2 model........................................
Accuracy: 0.8490566037735849
              precision    recall  f1-

# Part 4

In [38]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report,accuracy_score
from imblearn.over_sampling import SMOTE

In [39]:
# pip install imblearn

In [40]:
pipeline ={
    "enet":make_pipeline(StandardScaler(),SimpleImputer(missing_values=np.nan,strategy='median'),LogisticRegression(penalty='elasticnet',max_iter=1000)),
    "l1":make_pipeline(StandardScaler(),SimpleImputer(),LogisticRegression(penalty='l1',max_iter=1000)),
    "l2":make_pipeline(StandardScaler(),SimpleImputer(),LogisticRegression(penalty='l2',max_iter=1000)),
    
}

In [41]:
param_enet={
    'logisticregression__l1_ratio':[0.2,0.5,0.8],
    'logisticregression__C':[0.1,1,10], 
    'logisticregression__solver':['saga']
    
}

param_l1={
  'logisticregression__C':[0.1,1,10], 
  'logisticregression__solver':['liblinear','saga']
    
}

param_l2={
  'logisticregression__C':[0.1,1,10],    
  'logisticregression__solver':['liblinear','saga']  
}

In [42]:
hyper={
    'enet':param_enet,
    'l1':param_l1,
    'l2':param_l2
}

In [43]:
models={}
for i in pipeline.keys():
    models[i]=BayesSearchCV(pipeline[i],hyper[i],cv=5,scoring="accuracy",verbose=-1)

In [44]:
for i in models.keys():
    models[i].fit(X_train_s,y_train_s)
    print(i,"is trained and tuned")

enet is trained and tuned
l1 is trained and tuned
l2 is trained and tuned


In [45]:
for i in models.keys():
    print("\n..........................",i,"model.........................................")
    print(models[i].best_params_)


.......................... enet model.........................................
OrderedDict([('logisticregression__C', 1.0), ('logisticregression__l1_ratio', 0.5), ('logisticregression__solver', 'saga')])

.......................... l1 model.........................................
OrderedDict([('logisticregression__C', 1.0), ('logisticregression__solver', 'saga')])

.......................... l2 model.........................................
OrderedDict([('logisticregression__C', 10.0), ('logisticregression__solver', 'saga')])


In [46]:
from sklearn.metrics import confusion_matrix
y_pred={}
for i in models.keys():
    y_pred[i]=models[i].predict(X_test)
    print(accuracy_score(y_pred[i],y_test))
    print(classification_report(y_pred[i],y_test))
    print(confusion_matrix(y_pred[i],y_test))
#     print(i,y_pred)

0.6525157232704403
              precision    recall  f1-score   support

           0       0.67      0.90      0.76       798
           1       0.58      0.24      0.34       474

    accuracy                           0.65      1272
   macro avg       0.62      0.57      0.55      1272
weighted avg       0.63      0.65      0.61      1272

[[715  83]
 [359 115]]
0.6525157232704403
              precision    recall  f1-score   support

           0       0.67      0.90      0.76       798
           1       0.58      0.24      0.34       474

    accuracy                           0.65      1272
   macro avg       0.62      0.57      0.55      1272
weighted avg       0.63      0.65      0.61      1272

[[715  83]
 [359 115]]
0.6509433962264151
              precision    recall  f1-score   support

           0       0.67      0.89      0.76       800
           1       0.57      0.24      0.34       472

    accuracy                           0.65      1272
   macro avg       0.62  

# PART 5

In [48]:
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('scaler',StandardScaler()),
    ('imputer',SimpleImputer()),
    ('classifier',LogisticRegression())
    
])

from sklearn.ensemble import RandomForestClassifier

In [49]:
param_grid ={
    
    'classifier':[LogisticRegression(penalty='l1'),
                  LogisticRegression(penalty='l2'),
                  LogisticRegression(penalty='elasticnet'),
                 
                 ],
    'classifier__C':[1,10,100],
    'classifier__l1_ratio':[0.2,0.5,0.8],
    'classifier__solver':['saga','liblinear'],
 
    
}

In [50]:
grid_search=GridSearchCV(pipe,param_grid,cv=5,verbose=-1,scoring='accuracy')

In [51]:
grid_search.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('imputer', SimpleImputer()),
                                       ('classifier', LogisticRegression())]),
             param_grid={'classifier': [LogisticRegression(C=1, l1_ratio=0.2,
                                                           penalty='l1',
                                                           solver='saga'),
                                        LogisticRegression(),
                                        LogisticRegression(penalty='elasticnet')],
                         'classifier__C': [1, 10, 100],
                         'classifier__l1_ratio': [0.2, 0.5, 0.8],
                         'classifier__solver': ['saga', 'liblinear']},
             scoring='accuracy', verbose=-1)

In [52]:
best_model = grid_search.best_estimator_
print(best_model)
best_params=grid_search.best_params_
print(best_params)

Pipeline(steps=[('scaler', StandardScaler()), ('imputer', SimpleImputer()),
                ('classifier',
                 LogisticRegression(C=1, l1_ratio=0.2, penalty='l1',
                                    solver='saga'))])
{'classifier': LogisticRegression(C=1, l1_ratio=0.2, penalty='l1', solver='saga'), 'classifier__C': 1, 'classifier__l1_ratio': 0.2, 'classifier__solver': 'saga'}


In [53]:
y_pred=best_model.predict(X_test)

In [54]:
accuracy=accuracy_score(y_test,y_pred)
print(f"Best Model:{best_model.named_steps['classifier']}")
print(f"Best Hyperparameters:{best_params}")
print(f"Accuracy:{accuracy}")

Best Model:LogisticRegression(C=1, l1_ratio=0.2, penalty='l1', solver='saga')
Best Hyperparameters:{'classifier': LogisticRegression(C=1, l1_ratio=0.2, penalty='l1', solver='saga'), 'classifier__C': 1, 'classifier__l1_ratio': 0.2, 'classifier__solver': 'saga'}
Accuracy:0.8498427672955975


# Part 6

In [55]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

pipeline=Pipeline([
    ('scaler',StandardScaler()),
    ('classifier',RandomForestClassifier(),SVC())
])

In [56]:
param_grid={
    'classifier':[RandomForestClassifier(),SVC()],
#     'classifier__n_estimators':[50,100,200],
#     'classifier__C':[1,10,100]
}

In [57]:
grid_search=GridSearchCV(pipe,param_grid,cv=5)
grid_search.fit(X_train,y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('imputer', SimpleImputer()),
                                       ('classifier', LogisticRegression())]),
             param_grid={'classifier': [RandomForestClassifier(), SVC()]})

In [58]:
y_pred=grid_search.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [59]:
best_model = grid_search.best_estimator_
print(best_model)
best_params=grid_search.best_params_
print(best_params)

Pipeline(steps=[('scaler', StandardScaler()), ('imputer', SimpleImputer()),
                ('classifier', RandomForestClassifier())])
{'classifier': RandomForestClassifier()}


In [60]:
accuracy_score(y_pred,y_test)

0.8482704402515723