In [2]:
# Importing Dependencies
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
dataset = pd.read_csv(r"C:\Users\moham\Desktop\ML-project\Data\medicalmalpractice.csv")


#Reading the Train data
train_data = pd.read_csv('./data/raw/train.csv')
#Reading the Test data 
test_data = pd.read_csv('./data/raw/test.csv')

#Based on performed EDA previously , there is an unkown/missing values for Marital Status column and Insurance Column 
#"Unknown" in Insurance column will be treated  as a Separate Category , since it represents 30% of values and it have a very high impact on target label
#"Unkown" in Matrial Status will be replaced with most frequent value , since it does not represent a high percantage of values , and it got less impact on target label


#Function for handling missing values of Marital Status
def handle_missing_values(df):
    
    # Finding most frequent value to replace with
    most_frequent_category= df["Marital Status"].mode()[0]
    
    # Replace missing values/Unkowns with most_frequent_category
    df["Marital Status"] = df["Marital Status"].replace(4, most_frequent_category) # 4 stands for Unkown Marital Status 
    
    

    return df

#Features Encoding

# Target Encoding for Specialty Column , since it gives slightly better results on any model performance,
# and the splitted train set off test set will prevent data leakge.

#Function for Target Encoding
def target_encoder(df):

    #Calculating the mean target value (Amount) for each category in Specialty
    specialty_means = df.groupby("Specialty")['Amount'].mean()

    # Mapping means to the Specialty column
    df['Specialty'] = df["Specialty"].map(specialty_means)
    return df
    

# Label Encoding Function for Gender & Marital Status
def label_encoder(df):
    #For Gender Column
    df.replace({'Gender':{'Male':1,'Female':0}},inplace=True)
    
    #For Marital Status Column
    df.replace({'Marital Status':{'Married':4,'Divorced':3,'Single':2,'Widowed':1}},inplace=True )
    return df


#One hot Encoding Function for Insurance Column
def one_hot_encoder(df):
  encoder = OneHotEncoder(sparse_output=False)
  encoded_data = encoder.fit_transform(df[['Insurance']])
  encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['Insurance']))
  df = pd.concat([df, encoded_df], axis=1)
  # Dropping the original Insurance column
  df.drop('Insurance', axis=1, inplace=True)
  return df

#Features Scaler Function
def scaler(df,features):
    scaler = StandardScaler()
    df[features] = scaler.fit_transform(df[features])
    
    


#Applying missing values handlers for training & testing sets
all_dataset_processed=handle_missing_values(dataset)
train_data_processed=handle_missing_values(train_data)
test_data_processed=handle_missing_values(test_data)
#Applying encoding functions for training & testing sets
funcs = [target_encoder,label_encoder,one_hot_encoder]

for f in funcs:
    train_data_processed= f(train_data_processed)
    test_data_processed=f(test_data_processed)
    all_dataset_processed=f(all_dataset_processed)

#Applying scaler function
features_to_scale = ["Severity","Age","Marital Status","Specialty"]
scaler(train_data_processed,features_to_scale)
scaler(test_data_processed,features_to_scale)
scaler(all_dataset_processed,features_to_scale)



#Checking if 4 value is removed from Martial Status column
unique_values = train_data_processed['Marital Status'].unique()
print(unique_values)

#Checking some samples of preprocessed data
print(train_data_processed.head())












#Creating folder for processed data
#data_path = os.path.join("data","all_dataset")
#os.makedirs(data_path)

# Storing Processed training & testing sets as outputs

#train_data_processed.to_csv(os.path.join(data_path,"train_processed.csv"),index=False)
#test_data_processed.to_csv(os.path.join(data_path,"test_processed.csv"),index=False)
#all_dataset_processed.to_csv(os.path.join(data_path,"all_dataset_processed.csv"),index=False)


[-1.05525792  0.62473766 -2.7352535   2.30473324]
   Amount  Severity       Age  Private Attorney  Marital Status  Specialty  \
0  105148 -0.383544  1.026307                 1       -1.055258   1.110174   
1   58102 -0.864510 -0.842289                 1        0.624738  -0.355779   
2  420188  1.059352 -1.145304                 1       -1.055258   0.783230   
3  143699 -0.383544  1.430328                 1        0.624738  -0.895084   
4  363654 -0.864510 -1.044299                 1        0.624738  -0.355779   

   Gender  Insurance_Medicare/Medicaid  Insurance_No Insurance  \
0       1                          0.0                     0.0   
1       1                          0.0                     0.0   
2       0                          0.0                     1.0   
3       0                          0.0                     0.0   
4       0                          0.0                     0.0   

   Insurance_Private  Insurance_Unknown  Insurance_Workers Compensation  
0         

  df.replace({'Gender':{'Male':1,'Female':0}},inplace=True)


In [32]:
all_dataset_processed["Marital Status"].value_counts()

Marital Status
 0.624046    51582
-1.053163    22802
-2.730373     3832
 2.301256      994
Name: count, dtype: int64

In [3]:
X_train= train_data_processed.drop("Amount", axis=1)  #training data faeutures
y_train = train_data_processed["Amount"] # training target fetuatre
X_test= test_data_processed.drop("Amount", axis=1) #testing data feautres
y_test=test_data_processed["Amount"] #testing target fetuatre

In [9]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression 

model = LinearRegression()
# Train the model on the training data
training_data_predection=model.fit(X_train, y_train)

#prediction on training data 
training_data_predection = model.predict(X_train)

#evaluate the model on training data
mse = mean_absolute_error(y_train, training_data_predection)
r2 = r2_score(y_train, training_data_predection)

print(f"Mean Absolute Error (MAE) for training data: {mse}")
print(f"R2 Score for training data: {r2}")

testing_data_predection =model.predict(X_test)

# evaluate the model on testing data
mse = mean_absolute_error(y_test, testing_data_predection)
r2 = r2_score(y_test, testing_data_predection)

print(f"Mean Absolute Error (MAE) for testing data: {mse}")
print(f"R2 Score for testing data: {r2}")

Mean Absolute Error (MAE) for training data: 112510.82207201603
R2 Score for training data: 0.2981866925414671
Mean Absolute Error (MAE) for testing data: 113861.4662373029
R2 Score for testing data: 0.29679607517797124


with standard scaler


In [40]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# create KNN model
knn = KNeighborsRegressor(n_neighbors=10)

# train the model 
knn.fit(X_train, y_train)

y_train_pred = knn.predict(X_train)  
y_test_pred = knn.predict(X_test)  

r2_train = r2_score(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)

print("Training Data Performance:")
print(f"R² Score: {r2_train:.4f}")
print(f"Mean Absolute Error (MAE): {mae_train:.4f}")

r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print("\nTesting Data Performance:")
print(f"R² Score: {r2_test:.4f}")
print(f"Mean Absolute Error (MAE): {mae_test:.4f}")


Training Data Performance:
R² Score: 0.6825
Mean Absolute Error (MAE): 66290.2170

Testing Data Performance:
R² Score: 0.6157
Mean Absolute Error (MAE): 73749.9137


without using standard scaler 



In [30]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error

knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train, y_train)

y_train_pred = knn.predict(X_train)  
y_test_pred = knn.predict(X_test)    

r2_train = r2_score(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)

print("Training Data Performance:")
print(f"R² Score: {r2_train:.4f}")
print(f"Mean Absolute Error (MAE): {mae_train:.4f}")

r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print("\nTesting Data Performance:")
print(f"R² Score: {r2_test:.4f}")
print(f"Mean Absolute Error (MAE): {mae_test:.4f}")


Training Data Performance:
R² Score: 0.6199
Mean Absolute Error (MAE): 73775.4050

Testing Data Performance:
R² Score: 0.5066
Mean Absolute Error (MAE): 83861.2478


using min max scaler 

In [45]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn = KNeighborsRegressor(n_neighbors=10)

knn.fit(X_train_scaled, y_train)

y_train_pred = knn.predict(X_train_scaled)
y_test_pred = knn.predict(X_test_scaled)

r2_train = r2_score(y_train, y_train_pred)
mae_train = mean_absolute_error(y_train, y_train_pred)

print("Training Data Performance with MinMaxScaler:")
print(f"R² Score: {r2_train:.4f}")
print(f"Mean Absolute Error (MAE): {mae_train:.4f}")

r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

print("\nTesting Data Performance with MinMaxScaler:")
print(f"R² Score: {r2_test:.4f}")
print(f"Mean Absolute Error (MAE): {mae_test:.4f}")


Training Data Performance with MinMaxScaler:
R² Score: 0.6851
Mean Absolute Error (MAE): 66022.8201

Testing Data Performance with MinMaxScaler:
R² Score: 0.6167
Mean Absolute Error (MAE): 73641.9655


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error


rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

print("Training Data Performance with RandomForest:")
print(f"R² Score: {r2_score(y_train, y_train_pred):.4f}")
print(f"MAE: {mean_absolute_error(y_train, y_train_pred):.4f}")

print("\nTesting Data Performance with RandomForest:")
print(f"R² Score: {r2_score(y_test, y_test_pred):.4f}")
print(f"MAE: {mean_absolute_error(y_test, y_test_pred):.4f}")


Training Data Performance with RandomForest:
R² Score: 0.7963
MAE: 46857.3077

Testing Data Performance with RandomForest:
R² Score: 0.5448
MAE: 76922.7037


In [8]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='r2')
print(f"Cross-Validation R² Scores: {scores}")
print(f"Mean CV R²: {scores.mean():.4f}")


Cross-Validation R² Scores: [0.56588184 0.57557669 0.56395382 0.56591628 0.57234951]
Mean CV R²: 0.5687


In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(RandomForestRegressor(), param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best R² Score: {grid_search.best_score_}")


Best Parameters: {'max_depth': 15, 'max_features': 'log2', 'min_samples_split': 10, 'n_estimators': 100}
Best R² Score: 0.6548438474338945


In [34]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# إعداد معلمات البحث لتحسين KNN باستخدام GridSearchCV
knn_param_grid = {
    'n_neighbors': [5, 10, 15, 20, 25],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# إنشاء نموذج KNeighborsRegressor
knn = KNeighborsRegressor()

# تنفيذ GridSearchCV لتحسين KNN
search_knn = GridSearchCV(knn, param_grid=knn_param_grid, cv=3, verbose=1, n_jobs=-1)

# تدريب النموذج KNN
search_knn.fit(X_train, y_train)

# اختيار أفضل نموذج من GridSearchCV
best_knn = search_knn.best_estimator_

# طباعة أفضل المعلمات
print("Best KNN Parameters:", search_knn.best_params_)


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best KNN Parameters: {'metric': 'manhattan', 'n_neighbors': 15, 'weights': 'uniform'}


In [35]:
# Importing the required libraries
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from xgboost import XGBRegressor

# Creating the individual models
knn = KNeighborsRegressor(n_neighbors=15)
rf = RandomForestRegressor(n_estimators=100, max_depth=15, max_features='log2', min_samples_split=10, random_state=42)
xgb = XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, )

# Wrapping XGBRegressor to make it compatible with VotingRegressor
xgb_wrapped = TransformedTargetRegressor(regressor=xgb)

# Creating the ensemble model using VotingRegressor
ensemble_model = VotingRegressor(estimators=[
    ('knn', knn),
    ('rf', rf),
    ('xgb', xgb_wrapped)
])

# Training the ensemble model
ensemble_model.fit(X_train, y_train)

# Evaluation on training data
y_train_pred = ensemble_model.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
print(f"Training Data Performance:")
print(f"R² Score: {train_r2:.4f}")
print(f"MAE: {train_mae:.4f}")

# Evaluation on test data
y_test_pred = ensemble_model.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
print(f"\nTesting Data Performance:")
print(f"R² Score: {test_r2:.4f}")
print(f"MAE: {test_mae:.4f}")


Training Data Performance:
R² Score: 0.6864
MAE: 66690.2891

Testing Data Performance:
R² Score: 0.6077
MAE: 75063.8808
