<a href="https://colab.research.google.com/github/DebashreeMondal/Disease-Prediction/blob/main/Bootcamp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Day 1:

In [None]:
# Open Colab's file-upload dialog. The return value is not assigned, so the
# notebook simply displays it as this cell's output.
# NOTE(review): presumably this is where kaggle.json is uploaded for the
# following cell — confirm.
from google.colab import files
files.upload()

In [None]:
# Copy the Kaggle API token into ~/.kaggle.
# Expects kaggle.json to be in the current working directory.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Install the Kaggle client package.
# NOTE(review): no later cell in this section calls kaggle; the dataset is read
# from a file under /content instead — confirm this install is still needed.
!pip install kaggle

In [None]:
import pandas as pd
# Load the dataset into `df`, the frame every later cell works on.
# NOTE(review): the path is hard-coded to one specific uploaded file name under
# /content; the cell fails if the file is stored under any other name.
df = pd.read_csv('/content/heart_attack_c117 (2) - heart_attack_c117 (2).csv')

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# List the column names.
print(df.columns)

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Mean-impute every numeric column in place; non-numeric columns are untouched.
# NOTE(review): the means come from the full dataset, before any train/test
# split, and every numeric column is included — confirm that is intended.
numeric_cols = df.select_dtypes(include='number').columns
numeric_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(numeric_means)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Draw one histogram per numeric column on a 15x10 figure.
numeric_frame = df[numeric_cols]
numeric_frame.hist(figsize=(15, 10))
plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
corr_matrix = df[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Numeric Feature Correlations')
plt.show()

Day 2:

In [None]:
# Collect the 'object'-dtype columns that will be one-hot encoded later.
# 'thal' is left out because it is used as the prediction target rather
# than as a feature.
cat_cols = [
    col
    for col in df.select_dtypes(include='object').columns
    if col != 'thal'
]

In [None]:
# Features: every column except 'thal'.
X = df.drop(columns='thal')
# Binary target: 1 where 'thal' is 1, 2 or 3, otherwise 0.
# NOTE(review): the membership test uses integers, so it matches nothing if
# 'thal' holds strings — confirm the column's dtype and value set.
thal_in_positive_set = df['thal'].isin([1, 2, 3])
y = thal_in_positive_set.astype(int)

In [None]:
# One-hot encode the categorical feature columns listed in cat_cols, then show
# the resulting feature column names.
X = pd.get_dummies(X, columns=cat_cols)
print("Final feature columns:", X.columns)

Day 3:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out 20% of the rows for testing; random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y — confirm that is acceptable
# for this target's class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Standardize the features: learn the scaling statistics from the training
# split only, then apply that same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train logistic regression
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression with scikit-learn's default settings on the
# standardized training split.
lr_model = LogisticRegression()
lr_model.fit(X_train_scaled, y_train)

In [None]:
#Model evaluation
from sklearn.metrics import accuracy_score , classification_report

In [None]:
# Score the logistic regression on the held-out split: overall accuracy first,
# then the per-class precision/recall/F1 report.
y_pred_lr = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
lr_report = classification_report(y_test, y_pred_lr)
print(lr_report)

Day 4:

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Evaluate logistic regression
# This repeats the Day 3 evaluation cell: it recomputes the test-set
# predictions and prints the same accuracy and classification report.
y_pred_lr = lr_model.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

In [None]:
# Confusion matrix for the logistic regression predictions.
# scikit-learn's confusion_matrix puts the true labels on the rows and the
# predicted labels on the columns, so the axes are labelled explicitly to
# make the plot unambiguous.
cm = confusion_matrix(y_test, y_pred_lr)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.title('Confusion Matrix (Logistic Regression)')
plt.show()

In [None]:
# Random Forest for comparison
from sklearn.ensemble import RandomForestClassifier

# 100 trees with a fixed seed, trained on the same standardized features as
# the logistic regression so the two accuracies are directly comparable.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)
y_pred_rf = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

In [None]:
# Plot the ten features with the highest random forest importance scores,
# labelled with the encoded feature column names.
feat_imp = pd.Series(data=rf_model.feature_importances_, index=X.columns)
top_features = feat_imp.nlargest(10)
top_features.plot.barh()
plt.title('Random Forest Feature Importance')
plt.show()

In [None]:
import joblib
# Persist the trained random forest and the fitted scaler to the working
# directory so they can be reloaded for inference.
joblib.dump(rf_model,'heart_rf_model.pkl')
joblib.dump(scaler,'heart_scaler.pkl')

In [None]:
# NOTE(review): the scaler was already written to this same file by the
# previous cell; this call only overwrites it and looks redundant.
joblib.dump(scaler, 'heart_scaler.pkl')

In [None]:
# Write the first row of the encoded feature matrix as a CSV template showing
# the feature columns, without the index column.
sample = X.iloc[:1]
sample.to_csv('Heart_user_template.csv', index=False)
print("User Template saved as 'Heart_user_template.csv' ")

Day 5:

In [None]:
# Open Colab's file-upload dialog again.
# NOTE(review): the result is not assigned and the next cell reads a fixed
# path — presumably the file to score is uploaded here; confirm.
from google.colab import files
files.upload()

In [None]:
import joblib
import pandas as pd

# Batch inference: load a CSV, prepare it the same way as the training data,
# and append the saved random forest's predictions as a new column.
# NOTE(review): this is the same path the training cell loads — confirm that
# scoring the training file is what is intended here.
user_df = pd.read_csv('/content/heart_attack_c117 (2) - heart_attack_c117 (2).csv')

# Column groups by dtype in the training frame, restricted to the columns
# that the user frame actually contains so the steps below never reference
# a missing column.
numeric_cols = [
    col
    for col in df.select_dtypes(include='number').columns
    if col in user_df.columns
]
cat_cols = [
    col
    for col in df.select_dtypes(include='object').columns
    if col in user_df.columns
]
bool_cols = [
    col
    for col in df.select_dtypes(include='bool').columns
    if col in user_df.columns
]

# Numeric gaps take the training frame's column means.
training_means = df[numeric_cols].mean()
user_df[numeric_cols] = user_df[numeric_cols].fillna(training_means)

# Categorical gaps become the literal 'Unknown'.
for col in cat_cols:
    user_df[col] = user_df[col].fillna('Unknown')

# Boolean columns become 0/1 integers.
for col in bool_cols:
    user_df[col] = user_df[col].astype(int)

# One-hot encode, then force the exact training feature layout: columns
# missing from the user data are added as 0 and columns the model never saw
# are dropped.
user_df_encoded = pd.get_dummies(user_df, columns=cat_cols).reindex(
    columns=X.columns, fill_value=0
)

# Apply the persisted scaler, then the persisted model.
scaler = joblib.load('heart_scaler.pkl')
user_scaled = scaler.transform(user_df_encoded)

model = joblib.load('heart_rf_model.pkl')
preds = model.predict(user_scaled)
user_df['Heart_Disease_Prediction'] = preds

print(user_df)