## **Data Preprocessing**

In [None]:
from google.colab import files

# Open Colab's file-upload widget (e.g. for the kaggle.json that the next cell copies).
# The result is bound to a name on purpose: a bare `files.upload()` as the last
# expression makes the notebook echo the returned mapping of file names to raw
# file contents, which would print the uploaded credentials into the cell output.
uploaded = files.upload()
print("Uploaded:", list(uploaded))

In [None]:
# Install the Kaggle API token where the Kaggle CLI looks for it.
# Expects kaggle.json in the current working directory
# (presumably the file uploaded by the preceding cell - confirm).
# -p: do not fail if the directory already exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!pip install kaggle

In [None]:
# Download the Kaggle dataset `redwankarimsony/heart-disease-data` into
# /content/heart-disease (-p) and extract the archive in place (--unzip).
!kaggle datasets download -d redwankarimsony/heart-disease-data -p /content/heart-disease --unzip

In [None]:
import pandas as pd

# Read the CSV extracted by the Kaggle download cell into the working dataframe.
csv_path = "/content/heart-disease/heart_disease_uci.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first few rows of the loaded dataframe.
df.head()

In [None]:
# List the column labels of the loaded dataset.
df.columns

In [None]:
# Count the missing (null) values in each column.
df.isnull().sum()

In [None]:
# Select every numeric column, then replace each missing value with the mean of
# its own column.
# NOTE(review): the means are computed over the whole dataframe, i.e. before the
# train/test split performed further down, and `numeric_cols` also covers the
# target column `num` if it is numeric - confirm this is acceptable.
numeric_cols = df.select_dtypes(include='number').columns
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(column_means)

In [None]:
# Display the per-column mean of the numeric columns (after the imputation above).
df[numeric_cols].mean()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Draw one histogram per numeric column on a single 15x10 figure.
numeric_frame = df[numeric_cols]
numeric_frame.hist(figsize=(15, 10))
plt.tight_layout()  # keep neighbouring subplots from overlapping
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
corr_matrix = df[numeric_cols].corr()
ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
ax.set_title('Numeric Feature Correlations')
plt.show()

In [None]:
# Collect the object-dtype (text) columns that will be one-hot encoded later.
# The target column 'num' is left out in case it was read as text.
object_columns = df.select_dtypes(include='object').columns
cat_cols = [col for col in object_columns if col != 'num']
print(cat_cols)



## **MODEL TRAINING**



In [None]:
# Features: every column of the dataframe except the target `num`.
# NOTE(review): any row-identifier or provenance column present in the CSV is
# still part of X at this point - confirm whether such columns should be dropped.
X = df.drop(columns='num')
# Binary target: 1 when `num` is greater than zero, otherwise 0.
y = df['num'].gt(0).astype(int)

In [None]:
# One-hot encode the categorical columns of X.
# Only columns still present in X are encoded, so re-running this cell is a
# no-op instead of a KeyError (after the first run the raw categorical columns
# have already been replaced by their dummy columns).
cols_to_encode = [col for col in cat_cols if col in X.columns]
if cols_to_encode:
    X = pd.get_dummies(X, columns=cols_to_encode)
print('Final Feature Columns : ',X.columns)

## **Normalization, Modeling and Model Evaluation**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize the features.
# The scaler learns its mean/variance from the training split only; the test
# split is transformed with those same statistics. Calling fit_transform on the
# test split would refit the scaler on test data, so train and test would be
# scaled inconsistently and `sc` would end up holding test-set statistics.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier (default settings) on the scaled training split.
lr_model = LogisticRegression()
lr_model.fit(X=X_train_scaled, y=y_train)

In [None]:
# MODEL EVALUATION
from sklearn.metrics import accuracy_score, classification_report

# Predict on the scaled test split, then report the overall accuracy and the
# per-class precision / recall / F1 table.
y_pred_lr = lr_model.predict(X_test_scaled)
print("Logistic Regression Accuracy", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

### Logistic Regression test accuracy: 84.78% (as recorded in an earlier run; re-run the cells above to confirm)

## **Random Forest and Feature importance**

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
# scikit-learn puts the true labels on the rows and the predicted labels on the
# columns, so the axes are labelled accordingly to make the figure readable on
# its own.
cm = confusion_matrix(y_test, y_pred_lr)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix (Logistic Regression)")
plt.show()

In [None]:
#RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Train a 100-tree random forest (fixed seed) on the scaled training split and
# report its accuracy on the scaled test split.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model = forest.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

Feature Importance:

In [None]:
# Pair each importance score from the random forest with its feature name and
# plot the ten largest as a horizontal bar chart.
feat_imp = pd.Series(rf_model.feature_importances_, index=X.columns)
top_features = feat_imp.nlargest(10)
ax = top_features.plot.barh()
ax.set_title("Random Forest Importance")
plt.show()

In [None]:
import joblib
# Persist the trained random forest so it can be reloaded for inference.
joblib.dump(rf_model, 'heart_rf_model.pkl')
# Persist the scaler exactly as the scaling cell left it; inference reuses this fit.
joblib.dump(sc, 'heart_scaler.pkl')

In [None]:
# Write the first row of the encoded feature matrix to a CSV so users can see
# the exact column layout the model was trained on.
sample = X.head(1)
sample.to_csv('Heart_user_template.csv', index=False)
print("User Template saved as 'Heart_user_template.csv'")

## **Prediction Using User Data**

In [None]:
from google.colab import files

# Open Colab's file-upload widget for the data to be scored.
# The result is bound to a name on purpose: a bare `files.upload()` as the last
# expression makes the notebook echo the returned mapping of file names to raw
# file contents, dumping the whole uploaded file into the cell output.
uploaded = files.upload()
print("Uploaded:", list(uploaded))

In [None]:
import joblib
import pandas as pd

# Load the records to score; the file must have been uploaded under this exact name.
user_df = pd.read_csv('heart_dataset.csv')

# Column groups, taken from the training dataframe `df`
numeric_cols = df.select_dtypes(include='number').columns.tolist()
cat_cols = df.select_dtypes(include='object').columns.tolist()
bool_cols = df.select_dtypes(include='bool').columns.tolist()

# Keep only the training columns that the uploaded file actually contains,
# so the steps below never index a column that is missing from user_df
numeric_cols = [col for col in numeric_cols if col in user_df.columns]
cat_cols = [col for col in cat_cols if col in user_df.columns]
bool_cols = [col for col in bool_cols if col in user_df.columns]

# Fill missing numeric values with the training-data column means. Using the
# uploaded file's own means would depend on who else is in the upload and yields
# NaN when a column is entirely missing (e.g. a single-row upload).
if numeric_cols:
    user_df[numeric_cols] = user_df[numeric_cols].fillna(df[numeric_cols].mean())

# Fill missing categorical values with a placeholder; the resulting
# '<column>_Unknown' dummy columns are dropped by the alignment step below
if cat_cols:
    user_df[cat_cols] = user_df[cat_cols].fillna('Unknown')

# Cast boolean columns to 0/1 integers
if bool_cols:
    user_df[bool_cols] = user_df[bool_cols].astype(int)

# One-hot encode the categorical columns
user_df_encoded = pd.get_dummies(user_df, columns=cat_cols)

# Align with the training feature matrix: same columns, same order; dummy
# columns absent from the upload are added as 0 and unknown columns are dropped
user_df_encoded = user_df_encoded.reindex(columns=X.columns, fill_value=0)

# Scale with the scaler saved during training.
# joblib.load unpickles the file - only load artifacts produced by this notebook.
scaler = joblib.load('heart_scaler.pkl')
user_df_scaled = scaler.transform(user_df_encoded)

# Predict with the saved random forest and attach the result to the user's rows
model = joblib.load('heart_rf_model.pkl')
preds = model.predict(user_df_scaled)
user_df['heart_disease_Prediction'] = preds

# Show results
print(user_df)