<font size = 7 color = steelblue> Machine Learning Project</font>

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the three source tables; the paths are relative to the working directory.
hospital = pd.read_csv(filepath_or_buffer="Hospitalisation details.csv")
Medical = pd.read_csv(filepath_or_buffer="Medical Examinations.csv")
names = pd.read_excel(io='Names.xlsx')

# Preview the hospitalisation records.
hospital.head()

In [None]:
# Preview the first rows of the table read from "Medical Examinations.csv".
Medical.head()

In [None]:
# Preview the first rows of the table read from 'Names.xlsx'.
names.head()

In [None]:
# Outer-join the two tables on the shared customer key, so customers that
# appear in only one table are still present (with NaN in the other columns).
data = hospital.merge(Medical, on='Customer ID', how='outer')
data.tail()

In [None]:
# List the column names of the merged table.
data.columns

In [None]:
# Treat '?' entries as missing values, then discard every row that has any gap.
data = data.replace('?', np.nan).dropna()

In [None]:
# Map the textual "no surgery" marker to 0 so the whole column can be cast to integers.
surgery_counts = data['NumberOfMajorSurgeries'].replace("No major surgery", 0)
data['NumberOfMajorSurgeries'] = surgery_counts.astype(int)

In [None]:
# Show which columns are still stored with the generic object dtype.
data.dtypes[data.dtypes == 'object']

In [None]:
# Cast the year column to integers, then re-check which columns remain object-typed.
data['year'] = data['year'].astype(int)
column_types = data.dtypes
column_types[column_types == 'object']

### Correlation between columns.

In [None]:
# Pairwise correlation of the numeric columns only (non-numeric columns are skipped).
data_corr = data.corr(numeric_only=True)
data_corr

In [None]:
# Draw the correlation matrix as an annotated heatmap (two decimals per cell).
fig, ax = plt.subplots()
sns.heatmap(data_corr, annot=True, fmt='.2f', ax=ax)
plt.show()

### Delete unnecessary columns

In [None]:
from datetime import date
# Current calendar year; the next cell subtracts the `year` column from it to derive Age.
date.today().year

In [None]:
# Age is the current calendar year minus the `year` column; afterwards the
# identifier and the raw date parts are dropped from the modelling frame.
this_year = date.today().year
data['Age'] = this_year - data['year']

dropped_columns = ['Customer ID', 'month', 'date', 'year']
data1 = data.drop(columns=dropped_columns)
data1.head()

### Convert Categorical data into numerical 

In [None]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split as split, StratifiedKFold
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Ordinal-encode every object-typed column. The category list is the column's
# values in order of first appearance, so code 0 is the first value seen.
column_types = data1.dtypes
object_columns = column_types[column_types == 'object'].index
for column in object_columns:
    appearance_order = data1[column].unique().tolist()
    encoder = OrdinalEncoder(categories=[appearance_order], dtype=int)
    data1[column] = encoder.fit_transform(data1[[column]])

data1.head()

### Linear Regression Model

In [None]:
# Regression target is `charges`; every other column is a feature.
y = data1['charges']
X = data1.drop(columns=['charges'])
X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=12)

# The ridge model is only created here; it is fitted in its own section.
ridge = Ridge(random_state=12)
lr = LinearRegression()
lr.fit(X_train, y_train)

In [None]:
# Predictions on both partitions, followed by the fitted intercept.
lr_train_pred, lr_test_pred = (lr.predict(part) for part in (X_train, X_test))
print(lr.intercept_)

In [None]:
# `score` on a regressor is R^2; it is reported here as a percentage.
lr_train_r2 = lr.score(X_train, y_train)
lr_test_r2 = lr.score(X_test, y_test)
print(f"Linear Regression for Train set Score is: {lr_train_r2 * 100:.4f}%")
print(f"Linear Regression for Test set Score is : {lr_test_r2 * 100:.4f}%")

In [None]:
# Training-set error metrics for the linear model.
# The builtin round() is used instead of the `.round` method so the line works
# whether the metric comes back as a NumPy scalar or as a plain Python float.
lr_train_mse = mean_squared_error(y_true = y_train,y_pred = lr_train_pred)
print('MSE Lr for Train:' ,round(lr_train_mse, 2))
print(f"R Square Lr for Train: {r2_score(y_train,lr_train_pred)*100:.2f}%")

### Ridge Regression Model

In [None]:
# Fit the ridge model (created alongside the linear model) on the same training split.
ridge.fit(X_train,y_train)

In [None]:
# Predictions on both partitions, followed by the fitted intercept.
ridge_train_pred, ridge_test_pred = (ridge.predict(part) for part in (X_train, X_test))
print(ridge.intercept_)

In [None]:
# R^2 of the ridge model on each partition, reported as a percentage.
ridge_train_r2 = ridge.score(X_train, y_train)
ridge_test_r2 = ridge.score(X_test, y_test)
print(f"Ridge Regression Train Score is: {ridge_train_r2 * 100:.4f}%")
print(f"Ridge Regression Test Score is : {ridge_test_r2 * 100:.4f}%")

In [None]:
# Training-set error metrics for the ridge model.
# The builtin round() is used instead of the `.round` method so the line works
# whether the metric comes back as a NumPy scalar or as a plain Python float.
ridge_train_mse = mean_squared_error(y_true = y_train,y_pred = ridge_train_pred)
print('MSE Ridge for Train:' ,round(ridge_train_mse, 2))
print(f"R Square Ridge for Train: {r2_score(y_train,ridge_train_pred)*100:.2f}%")

### Stratified 5 fold cross validation with Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Classification task: predict `smoker` from every other column.
y = data1['smoker']
X = data1.drop(columns=['smoker'])
X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=12)

# Shuffled, seeded 5-fold stratified CV on the training partition only.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=12)
model = LogisticRegression(max_iter=10000)
cv_scores = cross_val_score(model, X_train, y_train, cv=skfold)

for fold, score in enumerate(cv_scores):
    print(f"Fold {fold + 1} for cross val score is:{score*100:.2f}%")
print()
print(f"Mean Cross Val Score: {cv_scores.mean()*100:.2f}%")

### Standardization techniques and hyperparameter tuning

In [None]:
# Back to the regression task: `charges` is the target again.
y = data1['charges']
X = data1.drop(columns=['charges'])
X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=12)

In [None]:
# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
sc = StandardScaler()
sc.fit(X_train)
x_train_sc = sc.transform(X_train)
x_test_sc = sc.transform(X_test)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# Compare three feature-scaling strategies with a random forest on the same split.
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler(),
}
for scaler_name, scaler in scalers.items():
    # Fit the scaler on the training partition only to avoid leaking test statistics.
    x_train_model = scaler.fit_transform(X_train)
    x_test_model = scaler.transform(X_test)

    # Seeded forest, so differences between scalers are not just run-to-run noise.
    rf = RandomForestRegressor(random_state=12)
    rf.fit(x_train_model, y_train)
    print(f"Train Score for {scaler_name}: {rf.score(x_train_model,y_train)*100:.2f}%")
    print(f"Test Score for {scaler_name} : {rf.score(x_test_model,y_test)*100:.2f}%")

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC

### Random Forest Regression with hyperparameter tuning

In [None]:
# Search space for the random forest regressor.
param_grid = {
    'n_estimators': [10, 100, 50, 200],
    'criterion': ['absolute_error', 'squared_error', 'friedman_mse', 'poisson'],
    'max_depth': [1, 10, 20],
    'min_samples_split': [2, 4, 5, 8, 10],
    'min_samples_leaf': [1, 2, 3, 4, 6],
    'max_features': ['sqrt', 'log2'],
    'ccp_alpha': [0, 0.2, 0.5, 1],
}

# Both the sampler and the forest are seeded so the selected parameters and the
# reported scores are reproducible across Restart & Run All.
random_rf = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=12),
    param_distributions=param_grid,
    cv=5,
    n_jobs=-1,
    random_state=12,
)
random_rf.fit(x_train_sc, y_train)

In [None]:
# Parameter combination selected by the randomized search.
random_rf.best_params_

In [None]:
# Predictions of the refitted best estimator on the scaled train and test partitions.
y_pred_random_rf, y_test_random_rf = (
    random_rf.predict(part) for part in (x_train_sc, x_test_sc)
)

tuned_train_score = random_rf.score(x_train_sc, y_train)
tuned_test_score = random_rf.score(x_test_sc, y_test)
print(f"Train Performance after tune: {tuned_train_score*100:.2f}%")
print(f"Test Performance after tune: {tuned_test_score*100:.2f}%")

### Regressor tuning for KNeighbors, SGD, and Random Forest with randomized search

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor

In [None]:
# Candidate regressors. SGDRegressor is seeded like the random forest so the
# tuning results below do not change from run to run.
models = {
    'RandomForest': RandomForestRegressor(random_state=12),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'SGDRegressor': SGDRegressor(random_state=12),
}

# One search space per regressor; the keys are matched to `models` in the tuning cell.
param_grids = {
    'param_knn': {
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': [20, 50, 40, 30],
    },
    'param_rf': {
        'n_estimators': [10, 100, 50, 200],
        'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
        'max_depth': [1, 10, 20],
        'min_samples_split': [2, 4, 5, 8, 10],
        'min_samples_leaf': [1, 2, 3, 4, 6],
        'max_features': ['sqrt', 'log2'],
        'ccp_alpha': [0, 0.2, 0.5, 1],
    },
    # NOTE(review): with SGDRegressor's default loss and penalty, `l1_ratio` and
    # `epsilon` appear to have no effect - confirm whether `penalty`/`loss`
    # should be added to this space.
    'param_sgd': {
        'tol': [0.001, 0.0001, 0.01],
        'alpha': [0.001, 0.0001, 0.01, 0.1],
        'l1_ratio': [0.15, 0.3, 0.5],
        'max_iter': [30000, 10000, 50000, 20000],
        'epsilon': [0.1, 0.01, 0.3, 0.5],
    },
}


In [None]:
# Pair each regressor with its own search space explicitly, in the order the
# spaces are declared (KNN, random forest, SGD). This replaces a nested loop
# that matched pairs through dict equality and repeated the same body three times.
search_plan = [
    ('KNeighborsRegressor', 'param_knn'),
    ('RandomForest', 'param_rf'),
    ('SGDRegressor', 'param_sgd'),
]

for model, grid_key in search_plan:
    print(f"     For {model} Model")
    print(f"Best param for {model}")
    # Seeded sampler: the same candidate settings are drawn on every run.
    rnd = RandomizedSearchCV(
        estimator=models[model],
        param_distributions=param_grids[grid_key],
        cv=5,
        random_state=12,
    )
    rnd.fit(x_train_sc, y_train)
    display(rnd.best_params_)
    # best_score_ is the mean cross-validated score of the best candidate.
    print(f"Performance Score for {model} Model: {rnd.best_score_*100:.2f}%")

In [None]:
# Switch to the classification task (`smoker` as target) and re-fit the shared
# scaler on the new training partition.
y = data1['smoker']
X = data1.drop(columns=['smoker'])
X_train, X_test, y_train, y_test = split(X, y, test_size=0.2, random_state=12)

x_train_sc = sc.fit_transform(X_train)
x_test_sc = sc.transform(X_test)

### Classifiers Tuning for Random Forest , SVC, Logistic with Random Search

In [None]:
# Candidate classifiers. LogisticRegression is seeded like the other two
# because the solvers searched below ('saga', 'liblinear') are stochastic.
models = {
    'RandomForest': RandomForestClassifier(random_state=12),
    'SVC': SVC(random_state=12),
    'LogisticRegression': LogisticRegression(random_state=12),
}

# One search space per classifier; the keys are matched to `models` in the tuning cell.
param_grids = {
    'param_svc': {
        'kernel': ['linear', 'poly', 'rbf'],
        'tol': [0.001, 0.0001, 0.01],
        'C': [0.1, 1, 0.5],
        'degree': [1, 2, 3, 5, 0],
        'gamma': ['scale', 'auto'],
        'max_iter': [30000, 10000, 50000, 20000],
    },
    'param_rf': {
        'n_estimators': [10, 100, 50, 200],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': [1, 10, 20],
        'min_samples_split': [2, 4, 5, 8, 10],
        'min_samples_leaf': [1, 2, 3, 4, 6],
        'max_features': ['sqrt', 'log2'],
        'ccp_alpha': [0, 0.2, 0.5, 1],
    },
    'param_lg': {
        'tol': [0.001, 0.0001, 0.01],
        'C': [0.1, 1, 0.5],
        'max_iter': [30000, 10000, 50000, 20000],
        'solver': ['saga', 'liblinear'],
    },
}


In [None]:
# Pair each classifier with its own search space explicitly, in the order the
# spaces are declared (SVC, random forest, logistic regression). This replaces a
# nested loop that matched pairs through dict equality and repeated the same
# body three times.
search_plan = [
    ('SVC', 'param_svc'),
    ('RandomForest', 'param_rf'),
    ('LogisticRegression', 'param_lg'),
]

for model, grid_key in search_plan:
    print(f"     For {model} Model")
    print(f"Best param for {model}")
    # Seeded sampler: the same candidate settings are drawn on every run.
    rnd = RandomizedSearchCV(
        estimator=models[model],
        param_distributions=param_grids[grid_key],
        cv=5,
        random_state=12,
    )
    rnd.fit(x_train_sc, y_train)
    display(rnd.best_params_)
    # best_score_ is the mean cross-validated score of the best candidate.
    print(f"Performance Score for {model} Model: {rnd.best_score_*100:.2f}%")

### Sklearn pipelines to streamline the workflow

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer

In [None]:
# Back to the regression task: `charges` is the target again.
X = data1.drop(columns='charges')
y = data1.charges
X_train,X_test,y_train,y_test = split(X,y,test_size=0.2,random_state=12)
x_train_sc = sc.fit_transform(X_train)
# Refresh the scaled test matrix as well; otherwise `x_test_sc` would still hold
# the features of the previous (classification) split.
x_test_sc = sc.transform(X_test)

In [None]:
# Gradient Boosting Regressor pipeline: mean-impute -> scale to [0, 1] -> boost.
gb_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=7))
])

# 10-fold cross-validation of the whole pipeline. With no `scoring` argument the
# estimator's own score is used, which for a regressor is R^2 rather than accuracy.
gb_results = cross_val_score(gb_pipeline, X_train, y_train, cv=10)
print(f"Gradient Boosting Regressor mean R-squared after cross validation:  {gb_results.mean()*100:.2f}%")

### Regularization techniques to address the bias-variance trade-off

In [None]:
from sklearn.linear_model import Lasso, LassoCV, ElasticNet, ElasticNetCV

# Search alpha over 0.1, 0.2, ..., 9.9. Zero is left out of the grid: it means
# "no regularisation" and some scikit-learn releases reject non-positive alphas
# in RidgeCV outright.
ridge_cv_model = RidgeCV(alphas=np.arange(0.1,10,0.1),cv = 5)
ridge_cv_model.fit(X_train,y_train)
# Rounding only strips the floating-point noise of the arange grid (step 0.1).
ridge_alpha = round(float(ridge_cv_model.alpha_), 2)
ridge_alpha

In [None]:
# Let LassoCV choose alpha from its automatically generated path (5-fold CV).
lasso_cv_model = LassoCV(cv = 5)
lasso_cv_model.fit(X_train,y_train)
# Keep the selected alpha at full precision for reuse: rounding it to two
# decimals could collapse a small alpha to 0.0, i.e. switch regularisation off.
lasso_best_alpha = lasso_cv_model.alpha_
# Rounded for display only.
round(lasso_best_alpha, 2)

In [None]:
# Candidate L1/L2 mixing ratios, stepping by 0.1 from 0.1 (the stop value 0.9 is exclusive).
l1 = np.arange(0.1, 0.9, 0.1)

elastic_cv_model = ElasticNetCV(l1_ratio=l1, cv=5)
elastic_cv_model.fit(X_train, y_train)

# Keep the chosen alpha for later reuse and show the chosen mixing ratio.
elastic_alpha = elastic_cv_model.alpha_
elastic_cv_model.l1_ratio_

In [None]:
# Rebuild each regularised model with the hyperparameters selected above and
# report its 5-fold cross-validation scores on the training partition.
models = {
    'Ridge': Ridge(alpha=ridge_alpha),
    'Lasso': Lasso(alpha=lasso_best_alpha),
    'Elastic net': ElasticNet(alpha=elastic_alpha, l1_ratio=elastic_cv_model.l1_ratio_),
}

for model, item in models.items():
    # The instances are fitted as well, so the dict holds trained models afterwards.
    item.fit(X_train, y_train)
    scores = cross_val_score(item, X_train, y_train, cv=5)

    print(f"              Folds for {model}")
    for fold, score in enumerate(scores):
        print(f"Fold {fold + 1} for cross val score for {model} is: {score*100:.2f}%")
    print(f"Average cross-validation score for {model}: {np.mean(scores) * 100:.2f}%\n")
    

### Kfold Cross Validation for Linear Regression Model

In [None]:
from sklearn.model_selection import KFold
# Shuffled, seeded 5-fold CV for plain linear regression. The splitter has its
# own name so the loop counter below no longer overwrites it.
kfold = KFold(n_splits = 5,shuffle = True, random_state = 12)
model = LinearRegression()
cv_scores = cross_val_score(model,X_train,y_train,cv = kfold)
for fold,score in enumerate(cv_scores):
    print(f"Fold {fold + 1} for cross val score is:{score*100:.2f}%")
print()
print(f"Mean Cross Val Score: {cv_scores.mean()*100:.2f}%")

### Kfold Cross Validation for Random Forest Model

In [None]:
# Shuffled, seeded 5-fold CV for a random forest. The splitter has its own name
# so the loop counter below no longer overwrites it, and the forest is seeded
# so the fold scores are reproducible.
kfold = KFold(n_splits = 5,shuffle = True, random_state = 12)
model = RandomForestRegressor(random_state=12)
cv_scores = cross_val_score(model,X_train,y_train,cv = kfold)
for fold,score in enumerate(cv_scores):
    print(f"Fold {fold + 1} for cross val score is:{score*100:.2f}%")
print()
print(f"Mean Cross Val Score: {cv_scores.mean()*100:.2f}%")

### Gradient Boost model, variable importance scores, redundant variables

In [None]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

In [None]:
# Fit a seeded gradient-boosting regressor and read its per-feature importances.
model = GradientBoostingRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

importance_scores = model.feature_importances_
importance_scores

In [None]:
# Feature names recorded by the model when it was fitted.
feature_names = model.feature_names_in_
feature_names

### Feature importance of the top 5 variables

In [None]:
# One-column table of importances indexed by feature name, most important first.
feature_data = pd.DataFrame(
    {'Imp_Features': importance_scores},
    index=feature_names,
).sort_values('Imp_Features', ascending=False)

# Top five features.
feature_data.head()

#### Estimate the cost of hospitalization for Christopher, Ms. Jayna (Date of birth 12/28/1988; height 170 cm; and weight 85 kgs). She lives with her partner and two children in a tier-1 city, and her state’s State ID is R1011. She was found to be nondiabetic (HbA1c = 5.8). She smokes but is otherwise healthy. She has had no transplants or major surgeries. Her father died of lung cancer. Hospitalization costs will be estimated using tier-1 hospitals.


In [None]:
# Patient profile from the case description (height 170 cm, weight 85 kg, born 1988).
# BMI and Age are derived rather than typed in, so Age follows the same
# "current year minus year" rule used for the training data.
height_m, weight_kg, birth_year = 1.70, 85, 1988

# NOTE(review): the categorical codes below (Hospital tier, City tier, State ID,
# Cancer history, smoker, ...) must match the order-of-appearance codes produced
# by the ordinal-encoding step - verify them against the encoder, in particular
# 'Cancer history' given the family history mentioned in the description.
new_data = pd.DataFrame({'children':[2],'Hospital tier':[0],'City tier':[2],'State ID':[6],
                         'BMI':[round(weight_kg / height_m ** 2, 2)],'HBA1C':[5.8],
                         'Heart Issues':[0],'Any Transplants':[0],'Cancer history':[0],'NumberOfMajorSurgeries':[0],
                         'smoker':[1],'Age':[date.today().year - birth_year]})
new_data

In [None]:
# Final random forest with fixed hyperparameters, trained on the usual 80/20 split.
x = data1.drop(columns='charges')
y = data1.charges
x_train,x_test,y_train,y_test = split(x,y,test_size=0.2,random_state=12)
# Seeded so the reported scores and the cost prediction are reproducible.
rf = RandomForestRegressor(n_estimators = 100,
 min_samples_split= 2,
 min_samples_leaf= 2,
 max_features= 'log2',
 max_depth= 10,
 ccp_alpha= 0,
 random_state= 12)
rf.fit(x_train,y_train)

In [None]:
# Predictions and R^2 of the final forest on both partitions.
pred_test, pred_train = rf.predict(x_test), rf.predict(x_train)

final_train_score = rf.score(x_train, y_train)
final_test_score = rf.score(x_test, y_test)
print(f"Train score:{final_train_score*100:.2f}% " )
print(f"Test score:{final_test_score*100:.2f}% " )

In [None]:
# Select the columns in the order the forest saw during fit, so the prediction
# does not depend on the column order used when `new_data` was typed in.
predicted_price = rf.predict(new_data[rf.feature_names_in_])
predicted_price

### Predicted hospitalization cost using the best models

In [None]:
# Four hypothetical patients, already in the encoded feature format.
# The mapping has its own name so the builtin `dict` is not shadowed for the
# rest of the notebook.
patient_profiles = {'children':[0,2,1,4],'Hospital tier':[0,2,1,2],'City tier':[2,1,1,0],'State ID':[3,7,9,0],'BMI':[25,35,34,23],
        'HBA1C':[3.5,6.9,5,4],'Heart Issues':[0,1,1,0],'Any Transplants':[0,0,0,1],'Cancer history':[1,0,0,0],
        'NumberOfMajorSurgeries':[0,1,0,0],'smoker':[0,1,1,1],'Age':[34,56,28,36]}
new_data = pd.DataFrame(patient_profiles)
new_data

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

class HospitalizationCostPredictor:
    """Fit several scaled regression pipelines and predict with the best one.

    The data is split 80/20 (``random_state=42``) at construction time. Every
    candidate regressor is wrapped in a ``StandardScaler`` -> regressor
    pipeline, fitted on the training part and scored on the held-out part.
    """

    def __init__(self, X, y):
        """Store the data, create the hold-out split and build the candidates.

        Parameters
        ----------
        X : feature table handed to ``split``.
        y : regression target aligned with ``X``.
        """
        self.X = X
        self.y = y

        self.X_train, self.X_test, self.y_train, self.y_test = split(X, y, test_size=0.2, random_state=42)

        # Candidate models. The stochastic estimators are seeded so the ranking,
        # and therefore the model picked as "best", is the same on every run.
        self.models = {
            'Linear Regression': self._create_pipeline(LinearRegression()),
            # NOTE(review): `degree` is presumably ignored with a linear kernel - confirm.
            'SVR': self._create_pipeline(SVR(kernel='linear',degree=5)),
            'Lasso Regression': self._create_pipeline(Lasso(alpha=1.0)),
            'Gradient Boosting': self._create_pipeline(GradientBoostingRegressor(n_estimators=100, random_state=42)),
            'Random Forest': self._create_pipeline(RandomForestRegressor(random_state=42)),
            'Decision Tree Regressor': self._create_pipeline(DecisionTreeRegressor(random_state=42)),
            'SGDRegressor': self._create_pipeline(SGDRegressor(random_state=42))
        }

    def _create_pipeline(self, model):
        """Wrap ``model`` in a pipeline that standardises the features first."""
        return Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', model)
        ])

    def evaluate_models(self):
        """Fit every candidate, score it on the hold-out split, and plot the comparison.

        Returns
        -------
        pandas.DataFrame
            One row per model, in the order of ``self.models``, with the
            columns 'Model', 'Mean Squared Error', 'Mean Absolute Error',
            'R-squared' and 'R-square in %'.
        """
        results = []

        for name, model in self.models.items():

            model.fit(self.X_train, self.y_train)
            y_pred = model.predict(self.X_test)

            # Hold-out metrics
            mse = mean_squared_error(self.y_test, y_pred)
            mae = mean_absolute_error(self.y_test, y_pred)
            r2 = r2_score(self.y_test, y_pred)

            results.append({
                'Model': name,
                'Mean Squared Error': mse,
                'Mean Absolute Error': mae,
                'R-squared': r2,
                'R-square in %' : f"{r2*100:.2f}%"
            })

        # Rows stay in model-declaration order; no sorting is applied.
        results_df = pd.DataFrame(results)
        display(results_df)

        fig, ax = plt.subplots(1, 2, figsize=(12, 6))

        sns.barplot(x='Model', y='Mean Squared Error',hue = 'Model',
                    legend=False, data=results_df, palette='Accent', ax=ax[0])
        ax[0].set_title('Model Comparison - Mean Squared Error', size=20)
        ax[0].tick_params(axis='x', rotation=45)
        ax[0].set_xlabel('Models', size=15)
        ax[0].set_ylabel('Mean Squared Errors', size=15)

        sns.barplot(x='Model', y='R-squared',hue = 'Model',
                    legend=False,palette='pastel', data=results_df, ax=ax[1])
        ax[1].set_title('Model Comparison - R-squared', size=20)
        ax[1].tick_params(axis='x', rotation=45)
        ax[1].set_xlabel('Models', size=15)
        ax[1].set_ylabel('R-Squared', size=15)

        # Adjust layout
        plt.tight_layout()
        plt.show()

        return results_df

    def predict_best_model(self, X_new):
        """Predict ``X_new`` with the model that has the highest hold-out R-squared.

        Calls ``evaluate_models`` first, so all candidates are (re)fitted and
        the comparison table and plots are displayed as a side effect.

        Parameters
        ----------
        X_new : feature table with the same columns as the training data.

        Returns
        -------
        tuple
            ``(predictions, best_model_name)``.
        """
        results = self.evaluate_models()

        # Best model = highest R-squared on the hold-out split.
        ranked = results.sort_values(by='R-squared', ascending=False)
        best_model_name = ranked['Model'].iloc[0]
        best_model = self.models[best_model_name]

        # Predict the rows passed by the caller, not a notebook-level global.
        predictions = best_model.predict(X_new)

        return predictions, best_model_name

    def prediction_error_analysis(self, y_true, y_pred):
        """Tabulate per-row absolute and percentage errors.

        Parameters
        ----------
        y_true : observed target values.
        y_pred : predicted values aligned with ``y_true``.

        Returns
        -------
        pandas.DataFrame
            Columns 'True Values', 'Predicted Values', 'Absolute Errors' and
            'Percentage Errors'. Percentage errors are relative to ``y_true``,
            so a true value of zero yields an infinite or NaN entry.
        """
        errors = y_true - y_pred

        error_analysis = pd.DataFrame({
            'True Values': y_true,
            'Predicted Values': y_pred,
            'Absolute Errors': np.abs(errors),
            'Percentage Errors': np.abs(errors / y_true) * 100
        })

        return error_analysis

def main():
    """Evaluate all candidate models on ``data1`` and predict ``new_data`` with the best one.

    Prints the predictions of the winning model, followed by summary
    statistics of its errors on the predictor's hold-out split.
    """
    target = data1.charges
    features = data1.drop(columns='charges')

    # Building the predictor creates the split and the candidate pipelines.
    predictor = HospitalizationCostPredictor(features, target)

    # Fits and ranks every candidate, then predicts with the winner.
    predictions, best_model = predictor.predict_best_model(new_data)
    print(f"\nPredictions using {best_model}:")
    print(predictions)

    # Error profile of the winning pipeline on the hold-out split.
    winning_pipeline = predictor.models[best_model]
    error_analysis = predictor.prediction_error_analysis(
        predictor.y_test,
        winning_pipeline.predict(predictor.X_test),
    )
    print("\nPrediction Error Analysis:")
    print(error_analysis.describe())

# Run the end-to-end comparison when this module is the entry point.
if __name__ == "__main__":
    main()