In [None]:
## Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
## Load the raw dataset and preview its first rows
df = pd.read_csv(filepath_or_buffer='heart.csv')
df.head(n=5)

In [None]:
# (number of rows, number of columns) of the raw dataset
df.shape

#### Data cleaning and preprocessing

In [None]:
## Count the missing values in each column
missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Inspect the dtype of every column
df.dtypes

In [None]:

# Plot the category counts of each categorical feature side by side,
# one subplot per column, so all distributions are visible in a single run.
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ST_Slope']
fig, axes = plt.subplots(1, len(categorical_columns), figsize=(16, 4))
for ax, column in zip(axes, categorical_columns):
    sns.countplot(x=column, data=df, ax=ax)
    ax.set_title(f'{column} distribution')
fig.tight_layout()
plt.show()

#### Feature engineering and standardization

In [None]:
## One-hot encode the categorical columns, then cast every column to float
categorical_features = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
updatedDf = pd.get_dummies(df, columns=categorical_features).astype(float)

In [None]:
## Persist the encoded dataset to disk.
## index=False: the frame still has the default row index from read_csv, which
## carries no information; writing it would add an unnamed leading column that
## comes back as 'Unnamed: 0' when the file is read again.
updatedDf.to_csv('heart_cleaned_dataset.csv', index=False)

In [None]:
## Reordering the columns: move the target column to the end

# Select the target by name rather than by position, so that re-running this
# cell is a no-op (a positional pick would move a different column each run).
# NOTE(review): assumes the label column is named 'HeartDisease', i.e. the
# column sitting at position 6 right after one-hot encoding — confirm.
target_column = 'HeartDisease'

# All other columns in their current order, followed by the target
new_column_order = [col for col in updatedDf.columns if col != target_column] + [target_column]

# Reorder the DataFrame
updatedDf = updatedDf[new_column_order]

In [None]:
## Dependent and independent feature split
# The reordering step places the target in the last column, so slice relative
# to the end instead of hard-coding the number of feature columns.
X = updatedDf.iloc[:, :-1]
Y = updatedDf.iloc[:, -1]

In [None]:
# Pairwise correlation between all columns of the encoded frame
updatedDf.corr()

In [None]:
## Hold out 30% of the rows as a test set (fixed seed for repeatable splits)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=10,
)

In [None]:
## Feature scaling
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics on the training split only, then apply them to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
## Fit a logistic regression classifier on the scaled training data
from sklearn.linear_model import LogisticRegression
regression = LogisticRegression().fit(X_train, Y_train)
regression

In [None]:
# Predicted class labels for the held-out test set
y_pred = regression.predict(X_test)

In [None]:
# Performance metrics on the test split: accuracy, confusion matrix, per-class report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_test, y_pred))

In [None]:
# Plot the fitted logistic curve on the test split.
# The model is trained on a multi-column feature matrix, so the raw features
# cannot be used as a single x-axis (scatter needs x and y of equal size).
# The decision function collapses each row into one linear score, and
# predict_proba gives the matching probability for the class in classes_[1].
test_scores = regression.decision_function(X_test)
test_probabilities = regression.predict_proba(X_test)[:, 1]
score_order = np.argsort(test_scores)  # sort so the curve is drawn left to right

plt.scatter(test_scores, Y_test, alpha=0.5, label='Actual class')
plt.plot(test_scores[score_order], test_probabilities[score_order],
         color='red', label='Predicted probability')
plt.xlabel('Decision function (linear score)')
plt.ylabel('Probability')
plt.title('Logistic Regression Curve')
plt.legend()
plt.show()

#### Hyperparameter tuning and cross-validation

In [None]:
# Base estimator plus the candidate values for each hyperparameter to search
model = LogisticRegression()
penalty = ['l1', 'l2', 'elasticnet']
solver = [
    'lbfgs', 'liblinear', 'newton-cg',
    'newton-cholesky', 'saga', 'sag',
]
c_values = [100, 10, 1.0, 0.1, 0.01]

In [None]:
# Build the search space as a list of per-solver grids so that only
# penalty/solver pairs LogisticRegression accepts are tried. A single
# cross-product grid spends more than half of its fits on combinations
# that fail (e.g. 'lbfgs' with 'l1', or 'elasticnet' without an l1_ratio).
solver_penalties = {
    'lbfgs': ['l2'],
    'liblinear': ['l1', 'l2'],
    'newton-cg': ['l2'],
    'newton-cholesky': ['l2'],
    'sag': ['l2'],
    'saga': ['l1', 'l2', 'elasticnet'],
}
params = []
for solver_name in solver:
    # Unknown solvers fall back to the full penalty list (original behaviour)
    supported = [p for p in penalty if p in solver_penalties.get(solver_name, penalty)]
    plain = [p for p in supported if p != 'elasticnet']
    if plain:
        params.append({'solver': [solver_name], 'penalty': plain, 'C': c_values})
    if 'elasticnet' in supported:
        # elasticnet additionally requires an l1_ratio to be fit at all
        params.append({'solver': [solver_name], 'penalty': ['elasticnet'],
                       'C': c_values, 'l1_ratio': [0.25, 0.5, 0.75]})

In [None]:
## StratifiedKFold: keeps the class ratio of the target in every fold.
## (StratifiedGroupKFold is meant for grouped samples and takes a `groups`
## array at split time, which this notebook never provides.)
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold()

In [None]:
## Exhaustive grid search over the parameter grid, scored by accuracy
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=cv,
    scoring='accuracy',
)

In [None]:
# NOTE(review): the grid-search fit is left disabled; re-enable it once the
# cross-validation splitter and parameter grid are confirmed to run.
# grid.fit(X_train,Y_train)
# Display the training labels
Y_train