<a href="https://colab.research.google.com/github/Donalizasaji/Projects/blob/main/ML_Project_Heart_Diease_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas-profiling

In [None]:
!pip install sweetviz

In [None]:
import pandas as pd
import numpy as np
#EDA
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport
import seaborn as sns
import sweetviz as sv
#Scaling
from sklearn.preprocessing import StandardScaler
#Warning
import warnings
warnings.filterwarnings('ignore')
#Statistical test
from scipy.stats import chi2_contingency
#Model building and models used
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense
#Model metrics
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report

In [None]:
# Read the heart-disease CSV into a DataFrame and show it as the cell output.
df = pd.read_csv(filepath_or_buffer='/content/Heart_Disease_Prediction.csv')
df

Information about dataset

In [None]:
# (rows, columns) of the loaded frame.
len(df.index), len(df.columns)

In [None]:
# Print the distinct values of every column, one column at a time.
for column_name, column_values in df.items():
    print('Unique values in ', column_name, ':')
    print(column_values.unique())

In [None]:
# Number of rows that exactly repeat an earlier row (keep='first' is the default).
df.duplicated(keep='first').sum()

In [None]:
# Column dtypes and non-null counts; buf=None (the default) writes to stdout.
df.info(buf=None)

Missing values

In [None]:
# Missing-value count per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Column conversions

Categorical to Numerical

In [None]:
# Encode the target labels as integers: 'Presence' -> 1, 'Absence' -> 0.
mapping = {'Presence': 1, 'Absence': 0}
# .astype(int) pins the dtype explicitly instead of relying on replace() to
# downcast the object column, and fails loudly if an unmapped label remains.
# The cell stays safe to re-run: replacing on already-encoded values is a no-op.
df['Heart Disease'] = df['Heart Disease'].replace(mapping).astype(int)

In [None]:
# Inspect the 'Heart Disease' column.
df.loc[:, 'Heart Disease']

Outlier Detection

In [None]:
# Horizontal box plot of Age; points drawn beyond the whiskers are outlier candidates.
plt.boxplot(x=df['Age'], vert=False)
plt.gca().set(title='Boxplot for Age', xlabel='Age')
plt.show()

In [None]:
# Horizontal box plot of Thallium.
plt.boxplot(x=df['Thallium'], vert=False)
plt.gca().set(title='Boxplot for Thallium', xlabel='Thallium')
plt.show()

In [None]:
# Horizontal box plot of Max HR.
plt.boxplot(df['Max HR'], vert=False)
# Title without the stray trailing space, consistent with the other box plots.
plt.title('Boxplot for Max HR')
plt.xlabel('Max HR')
plt.show()

In [None]:
# Horizontal box plot of Cholesterol.
plt.boxplot(x=df['Cholesterol'], vert=False)
plt.gca().set(title='Boxplot for Cholesterol', xlabel='Cholesterol')
plt.show()

Removing outliers

In [None]:
# Drop Cholesterol outliers using the 1.5 * IQR fences.
# NOTE: this cell rebinds `df`, so re-running it recomputes the fences on the
# already-filtered rows and may remove more data; re-run from the load cell instead.
Q1 = df['Cholesterol'].quantile(0.25)
Q3 = df['Cholesterol'].quantile(0.75)
IQR = Q3 - Q1
lb = Q1 - 1.5 * IQR  # lower fence
ub = Q3 + 1.5 * IQR  # upper fence
# between() is inclusive on both ends, matching `>= lb` and `<= ub`.
# .copy() makes the result an independent frame, so later column assignments
# do not operate on a slice of the original.
df = df[df['Cholesterol'].between(lb, ub)].copy()
# A bare last expression gives the rich table display instead of print()'s plain text.
df

In [None]:
# (rows, columns) of the current frame.
len(df.index), len(df.columns)

In [None]:
# Build the profiling report for the current frame and embed it in the notebook.
profile = ProfileReport(df=df)
profile.to_notebook_iframe()

In [None]:
# One histogram per column, drawn on a single 12 x 12 inch figure.
df.hist(
    figsize=(12, 12),
)
plt.show()

In [None]:
# Run sweetviz over the frame and write the result to an HTML file.
report = sv.analyze(source=df)
report.show_html(filepath='sweetviz_report.html')

In [None]:
# Cholesterol spread for each value of Sex.
sns.boxplot(data=df, x='Sex', y='Cholesterol').set(
    xlabel='Sex',
    ylabel='Cholesterol Level',
    title='Cholesterol Level by Sex',
)
plt.show()

In [None]:
# Swarm plot of Age split by the Heart Disease label.
plt.figure(figsize=(10, 6))
sns.swarmplot(data=df, x='Heart Disease', y='Age', palette='Set1').set(
    xlabel='Heart Disease',
    ylabel='Age',
    title='Age Distribution by Heart Disease',
)
plt.show()

In [None]:
# Histogram of Age using 20 bins.
plt.hist(x=df['Age'], bins=20, color='purple')
plt.gca().set(xlabel='Age', ylabel='Frequency', title='Age Distribution')
plt.show()

In [None]:
# Share of patients with Sex == 0 and Sex == 1.
# NOTE(review): the 0 = female / 1 = male coding is taken from the labels used
# below — confirm against the dataset description.
countFemale = int((df.Sex == 0).sum())
countMale = int((df.Sex == 1).sum())

total_count = len(df)
percentage_female = (countFemale / total_count) * 100
percentage_male = (countMale / total_count) * 100

# Print the values computed above instead of recomputing them inline, so the
# printed numbers and any later use of these variables cannot drift apart.
print("Percentage of Female Patients:{:.2f}%".format(percentage_female))
print("Percentage of Male Patients:{:.2f}%".format(percentage_male))

In [None]:
# Bar chart of the female/male percentages held in percentage_female / percentage_male.
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.barplot(
    x=['Female', 'Male'],
    y=[percentage_female, percentage_male],
    palette='pastel',
).set(
    xlabel='Gender',
    ylabel='Percentage',
    title='Percentage of Female and Male Patients',
)
plt.show()

In [None]:
# Pie chart of how many rows fall in each Heart Disease class.
# value_counts() orders by frequency, so sort by class value and derive each
# label from the index; a fixed label list would be swapped whenever class 1
# outnumbers class 0.
class_names = {0: 'No Heart Disease', 1: 'Heart Disease'}
counts = df['Heart Disease'].value_counts().sort_index()
l = [class_names[value] for value in counts.index]  # raises KeyError on an unexpected class value
sizes = counts.values
plt.figure(figsize=(6, 6))
colors = ['#ff9999', '#66b3ff']
plt.pie(sizes, labels=l, autopct='%2.2f%%', colors=colors)
plt.title('Percentage of People with Heart Disease')
plt.show()

In [None]:
# Cholesterol per chest pain type, with ci="sd" error bars.
# NOTE(review): recent seaborn releases deprecate `ci` in favour of
# `errorbar="sd"` — confirm against the installed version before changing it.
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='Chest pain type', y='Cholesterol', ci="sd", palette='pastel').set(
    xlabel='Chest Pain Type',
    ylabel='Average Cholesterol Level',
    title='Average Cholesterol Level by Chest Pain Type',
)
plt.show()

In [None]:
# Annotated correlation heatmap over all columns, colour scale fixed to [-1, 1].
plt.figure(figsize=(10, 7))
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f")
plt.title('Correlation Heatmap')
# End with plt.show() like the other plotting cells so the Axes repr is not echoed.
plt.show()

In [None]:
# Correlation heatmap showing only the cells below the diagonal.
corr = df.corr()
# np.triu keeps the upper triangle including the diagonal; True entries are hidden by heatmap.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(15, 12))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw on the axes created above explicitly instead of relying on the current-axes
# state, and use heatmap's documented `linewidths` parameter.
sns.heatmap(corr, cmap=cmap, mask=mask, linewidths=0.5, square=True, center=0, ax=ax)
ax.set_title('Correlation Heatmap (lower triangle)')
# End with plt.show() like the other plotting cells so the Axes repr is not echoed.
plt.show()

In [None]:
# Preview the first five rows.
df.head(n=5)

Chi-Square Test

In [None]:
# Separate the features from the 'Heart Disease' target, then hold out 20% of the rows.
X = df.drop(columns='Heart Disease')
y = df['Heart Disease']
# A fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)