In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import joblib

import io

from google.colab import files

# File name the rest of the notebook expects the dataset to have.
DATA_FILE = 'patient_adherence_dataset.csv'

# files.upload() opens the browser upload dialog and returns a dict that maps
# each uploaded file's original name to its content as bytes.
uploaded = files.upload()

if DATA_FILE in uploaded:
    # Parse the bytes that were just uploaded instead of re-reading the path
    # from disk: when a file with this name already exists in the runtime,
    # Colab may store the new upload under a different name, and reading the
    # fixed path would then silently load the older copy.
    df = pd.read_csv(io.BytesIO(uploaded[DATA_FILE]))
else:
    # Nothing with the expected name was uploaded (dialog cancelled or the
    # file was named differently); fall back to a copy already on disk.
    df = pd.read_csv(DATA_FILE)

Saving patient_adherence_dataset.csv to patient_adherence_dataset.csv


In [None]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
df.head()

Unnamed: 0,Age,Gender,Medication_Type,Dosage_mg,Previous_Adherence,Education_Level,Income,Social_Support_Level,Condition_Severity,Comorbidities_Count,Healthcare_Access,Mental_Health_Status,Insurance_Coverage,Adherence
0,57,Male,TypeA,136,1,High School,634934,Medium,Severe,3,Poor,Good,1,1
1,47,Male,TypeA,134,1,High School,297954,High,Moderate,3,Good,Good,1,0
2,59,Male,TypeC,89,1,High School,789337,High,Moderate,1,Good,Good,1,1
3,72,Male,TypeB,240,0,Postgraduate,267352,Low,Mild,2,Poor,Moderate,1,0
4,46,Male,TypeA,241,1,Postgraduate,718447,Medium,Mild,1,Poor,Good,1,0


In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Age                   5000 non-null   int64 
 1   Gender                5000 non-null   object
 2   Medication_Type       5000 non-null   object
 3   Dosage_mg             5000 non-null   int64 
 4   Previous_Adherence    5000 non-null   int64 
 5   Education_Level       5000 non-null   object
 6   Income                5000 non-null   int64 
 7   Social_Support_Level  5000 non-null   object
 8   Condition_Severity    5000 non-null   object
 9   Comorbidities_Count   5000 non-null   int64 
 10  Healthcare_Access     5000 non-null   object
 11  Mental_Health_Status  5000 non-null   object
 12  Insurance_Coverage    5000 non-null   int64 
 13  Adherence             5000 non-null   int64 
dtypes: int64(7), object(7)
memory usage: 547.0+ KB


In [None]:
# Separate the predictors from the target column.
X = df.drop(columns=['Adherence'])
y = df['Adherence']

# Hold out 20% of the rows for the final evaluation; stratify on the target so
# both splits keep the same class proportions, and fix the seed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Partition the feature columns into numeric and non-numeric so that every
# column is routed to exactly one transformer. Selecting by specific dtypes
# (int64/float64/object only) would leave any other dtype out of both lists,
# and ColumnTransformer drops unlisted columns by default.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(exclude='number').columns

preprocessor = ColumnTransformer(
    transformers=[
        # Standardise numeric features (zero mean, unit variance).
        ('num', StandardScaler(), numerical_cols),
        # One-hot encode the remaining features. handle_unknown='ignore'
        # encodes a category that was not seen during fit as all zeros
        # instead of raising, which can otherwise happen inside a CV fold
        # or on the held-out test set.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ]
)

In [None]:
from xgboost import XGBClassifier

# Hyper-parameter grids, one per candidate. Every key carries the
# "classifier__" prefix so it addresses the pipeline step named "classifier".
logreg_grid = dict(classifier__C=[0.01, 0.1, 1, 10])
forest_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[5, 10, None],
)
svm_grid = dict(
    classifier__C=[0.1, 1, 10],
    classifier__kernel=["poly"],
)
xgb_grid = dict(
    classifier__n_estimators=[100, 200],
    classifier__max_depth=[5, 10],
    classifier__learning_rate=[0.01, 0.1],
)

# Candidate models: display name -> (unfitted estimator, grid to search).
models = {
    "Logistic Regression": (
        LogisticRegression(max_iter=2000, class_weight="balanced"),
        logreg_grid,
    ),
    "Random Forest": (
        RandomForestClassifier(random_state=42, class_weight="balanced"),
        forest_grid,
    ),
    "SVM": (
        SVC(class_weight="balanced"),
        svm_grid,
    ),
    "XGBoost": (
        XGBClassifier(random_state=42, eval_metric="logloss"),
        xgb_grid,
    ),
}

In [None]:
# Model name -> (best pipeline refit on the full training split,
#                mean cross-validated accuracy of that configuration).
results = {}

for model_name, (model, params) in models.items():
    # Wrap preprocessing and the estimator in one pipeline so the scaler and
    # encoder are fitted on the training part of each CV fold only.
    pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                               ("classifier", model)])

    # 5-fold grid search over this model's grid, using all available cores.
    grid = GridSearchCV(pipeline, param_grid=params, cv=5, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)

    results[model_name] = (grid.best_estimator_, grid.best_score_)
    print(f"{model_name} best accuracy: {grid.best_score_:.4f}")

# Pick the winner by cross-validation score only; the test split is not
# consulted until after the choice has been made.
best_model_name = max(results, key=lambda name: results[name][1])
best_model, best_cv_acc = results[best_model_name]

print("Best Model from CV:", best_model_name)
print("Cross-validation Accuracy:", round(best_cv_acc, 4))

# Single evaluation of the selected pipeline on the held-out test split.
y_pred = best_model.predict(X_test)
test_acc = accuracy_score(y_test, y_pred)

print("\n Test Set Evaluation for", best_model_name)
print("Test Accuracy:", round(test_acc, 4))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Logistic Regression best accuracy: 0.6350
Random Forest best accuracy: 0.6255
SVM best accuracy: 0.6132
XGBoost best accuracy: 0.6220
Best Model from CV: Logistic Regression
Cross-validation Accuracy: 0.635

 Test Set Evaluation for Logistic Regression
Test Accuracy: 0.64

Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.59      0.64       543
           1       0.59      0.70      0.64       457

    accuracy                           0.64      1000
   macro avg       0.64      0.64      0.64      1000
weighted avg       0.65      0.64      0.64      1000



In [None]:
import joblib
# Serialise the selected pipeline (preprocessing + classifier) to disk so it
# can be reloaded later with joblib.load.
# NOTE(review): pickled scikit-learn objects are generally tied to the library
# versions used to create them — record those versions alongside the file.
joblib.dump(best_model, "best_model.pkl")

['best_model.pkl']

In [None]:
from google.colab import files

# Send the saved model file from the Colab runtime to the local machine via
# the browser.
files.download("best_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>