<a href="https://colab.research.google.com/github/robitussin/CCADMACL_EXERCISES/blob/main/Exercise2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exercise 2: Use Gradient Boost for Regression

Instructions:

- Use the Dataset File to train your model
- Use the Test File to generate your results
- Use the Sample Submission file to produce your results in the same format
- Submit your results to:
https://www.kaggle.com/competitions/playground-series-s4e12/overview



In [None]:
!pip install --user xgboost



In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import root_mean_squared_log_error

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
warnings.filterwarnings("ignore")

## Dataset
The train, test, and sample submission files can be found at this link:
https://www.kaggle.com/competitions/playground-series-s4e12/data

## 1. Load the Data

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the CSV files in this notebook are read from /content, not
# from the mounted Drive folder -- confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the competition CSVs from the Colab working directory:
# train_data carries the target column, test_data is the frame to predict on.
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')

## 2. Perform Data preprocessing

In [None]:
# Print a cardinality report (count and list of distinct values, NaN included)
# for every training column except "Credit Score" and "id".
for column in train_data.columns:
    if column in ("Credit Score", "id"):
        continue
    unique_values = train_data[column].unique()
    print(
        f"Column: {column}",
        f"Number of Unique Values: {len(unique_values)}",
        f"Unique Values: {unique_values}\n",
        sep="\n",
    )

Column: Age
Number of Unique Values: 48
Unique Values: [19. 39. 23. 21. 29. 41. 48. 44. 56. 25. 40. 18. 59. 34. 22. 46. 49. 42.
 43. 64. 52. 37. 58. 50. 35. 61. 31. 54. 45. 30. 33. 28. 62. 53. 47. 27.
 nan 38. 63. 32. 36. 20. 51. 55. 57. 24. 60. 26.]

Column: Gender
Number of Unique Values: 2
Unique Values: ['Female' 'Male']

Column: Annual Income
Number of Unique Values: 86116
Unique Values: [ 10049.  31678.  25602. ...  42467. 123414.  53668.]

Column: Marital Status
Number of Unique Values: 4
Unique Values: ['Married' 'Divorced' 'Single' nan]

Column: Number of Dependents
Number of Unique Values: 6
Unique Values: [ 1.  3.  2.  0.  4. nan]

Column: Education Level
Number of Unique Values: 4
Unique Values: ["Bachelor's" "Master's" 'High School' 'PhD']

Column: Occupation
Number of Unique Values: 4
Unique Values: ['Self-Employed' nan 'Employed' 'Unemployed']

Column: Health Score
Number of Unique Values: 461355
Unique Values: [22.59876067 15.56973099 47.17754929 ... 16.93986558 19.8350

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

# Preprocess test_data in place: impute missing numerics, standardise
# 'Annual Income' and integer-encode the categorical columns.
#
# Every imputer/scaler is fitted on the *training* frame and only applied to
# the test frame, so both splits share the same medians, modes and scaling.
# Fitting them on test_data itself would put the two splits on different scales.
#
# Ordering: this cell reads the raw train_data, so it has to run before the
# cell that preprocesses train_data in place. 'Gender' is text in the raw
# frame and numeric once encoded, which makes it a cheap sentinel for that.
if pd.api.types.is_numeric_dtype(train_data['Gender']):
    raise RuntimeError(
        "train_data has already been preprocessed; reload the CSV files and "
        "run this cell before the training-set preprocessing cell."
    )

# --- Imputation (statistics learned from train_data) ---
imputer_age = SimpleImputer(strategy='median').fit(train_data[['Age']])
test_data['Age'] = imputer_age.transform(test_data[['Age']]).ravel()

# Dedicated imputer for Credit Score instead of re-fitting imputer_age.
imputer_credit = SimpleImputer(strategy='median').fit(train_data[['Credit Score']])
test_data['Credit Score'] = imputer_credit.transform(test_data[['Credit Score']]).ravel()

imputer_health = SimpleImputer(strategy='most_frequent').fit(train_data[['Health Score']])
test_data['Health Score'] = imputer_health.transform(test_data[['Health Score']]).ravel()

imputer_dependents = SimpleImputer(strategy='median').fit(train_data[['Number of Dependents']])
test_data['Number of Dependents'] = imputer_dependents.transform(
    test_data[['Number of Dependents']]
).ravel()

# --- Annual Income: standardise first, then fill gaps with the scaled median ---
# StandardScaler ignores NaN while fitting and passes it through on transform,
# so the imputer afterwards sees the scaled values.
scaler_income = StandardScaler().fit(train_data[['Annual Income']])
imputer_income = SimpleImputer(strategy='median').fit(
    scaler_income.transform(train_data[['Annual Income']])
)
test_data['Annual Income'] = imputer_income.transform(
    scaler_income.transform(test_data[['Annual Income']])
).ravel()

# Assign the result back: calling fillna(inplace=True) on a column selection
# is chained assignment and does not update the frame under Copy-on-Write.
test_data['Previous Claims'] = test_data['Previous Claims'].fillna(0)

# --- Integer encodings for the categorical columns ---
# The codes are arbitrary labels; a np.nan key gives missing values their own code.
marital_mapping = {"Single": 0, "Married": 1, "Divorced": 2, np.nan: 3}
gender_mapping = {"Female": 0, "Male": 1}
location_mapping = {"Urban": 0, "Rural": 1, "Suburban": 3}
policy_mapping = {"Premium": 0, "Comprehensive": 1, "Basic": 3}
education_mapping = {"High School": 1, "Bachelor's": 2, "Master's": 3, "PhD": 4}
property_mapping = {"House": 0, "Apartment": 1, "Condo": 3}
# NOTE(review): the saved output of the mapped training column still shows NaN,
# so at least one label was missing from this mapping. "Average" is assumed to
# be that label -- confirm with train_data['Customer Feedback'].unique().
feedback_mapping = {"Poor": 0, "Fair": 1, "Average": 1, "Good": 2, "Excellent": 3, np.nan: 4}
smoking_mapping = {"Yes": 0, "No": 1}
exercise_mapping = {"Weekly": 0, "Monthly": 1, "Daily": 2, "Rarely": 3}
occupation_mapping = {"Self-Employed": 0, "Unemployed": 1, "Employed": 2, np.nan: 3}

column_mappings = {
    'Marital Status': marital_mapping,
    'Gender': gender_mapping,
    'Location': location_mapping,
    'Policy Type': policy_mapping,
    'Education Level': education_mapping,
    'Property Type': property_mapping,
    'Customer Feedback': feedback_mapping,
    'Smoking Status': smoking_mapping,
    'Exercise Frequency': exercise_mapping,
    'Occupation': occupation_mapping,
}
for column_name, mapping in column_mappings.items():
    test_data[column_name] = test_data[column_name].map(mapping)


In [None]:
# Preprocess train_data in place: impute missing numerics, standardise
# 'Annual Income' and integer-encode the categorical columns.

# --- Imputation ---
imputer_age = SimpleImputer(strategy='median')
train_data['Age'] = imputer_age.fit_transform(train_data[['Age']]).ravel()

# Previously named scaler_credit although it is a median imputer.
imputer_credit = SimpleImputer(strategy='median')
train_data['Credit Score'] = imputer_credit.fit_transform(train_data[['Credit Score']]).ravel()

imputer_health = SimpleImputer(strategy='most_frequent')
train_data['Health Score'] = imputer_health.fit_transform(train_data[['Health Score']]).ravel()

imputer_dependents = SimpleImputer(strategy='median')
train_data['Number of Dependents'] = imputer_dependents.fit_transform(
    train_data[['Number of Dependents']]
).ravel()

# --- Annual Income: standardise first, then fill gaps with the scaled median ---
scaler_income = StandardScaler()
imputer_income = SimpleImputer(strategy='median')
train_data['Annual Income'] = imputer_income.fit_transform(
    scaler_income.fit_transform(train_data[['Annual Income']])
).ravel()

# Assign the result back: calling fillna(inplace=True) on a column selection
# is chained assignment and does not update the frame under Copy-on-Write.
train_data['Previous Claims'] = train_data['Previous Claims'].fillna(0)

# --- Integer encodings for the categorical columns ---
# The codes are arbitrary labels; a np.nan key gives missing values their own code.
marital_mapping = {"Single": 0, "Married": 1, "Divorced": 2, np.nan: 3}
gender_mapping = {"Female": 0, "Male": 1}
location_mapping = {"Urban": 0, "Rural": 1, "Suburban": 3}
policy_mapping = {"Premium": 0, "Comprehensive": 1, "Basic": 3}
education_mapping = {"High School": 1, "Bachelor's": 2, "Master's": 3, "PhD": 4}
property_mapping = {"House": 0, "Apartment": 1, "Condo": 3}
# NOTE(review): the saved output of this column after mapping still shows NaN,
# so at least one label was missing from this mapping. "Average" is assumed to
# be that label -- confirm with train_data['Customer Feedback'].unique().
feedback_mapping = {"Poor": 0, "Fair": 1, "Average": 1, "Good": 2, "Excellent": 3, np.nan: 4}
smoking_mapping = {"Yes": 0, "No": 1}
exercise_mapping = {"Weekly": 0, "Monthly": 1, "Daily": 2, "Rarely": 3}
occupation_mapping = {"Self-Employed": 0, "Unemployed": 1, "Employed": 2, np.nan: 3}

column_mappings = {
    'Marital Status': marital_mapping,
    'Gender': gender_mapping,
    'Location': location_mapping,
    'Policy Type': policy_mapping,
    'Education Level': education_mapping,
    'Property Type': property_mapping,
    'Customer Feedback': feedback_mapping,
    'Smoking Status': smoking_mapping,
    'Exercise Frequency': exercise_mapping,
    'Occupation': occupation_mapping,
}
for column_name, mapping in column_mappings.items():
    train_data[column_name] = train_data[column_name].map(mapping)

In [None]:
# Display the encoded 'Customer Feedback' column; any NaN left here means the
# original label had no entry in feedback_mapping.
train_data['Customer Feedback']

Unnamed: 0,Customer Feedback
0,0.0
1,
2,2.0
3,0.0
4,0.0
...,...
1048570,
1048571,2.0
1048572,2.0
1048573,4.0


In [None]:
# Summarise dtypes and non-null counts of train_data after imputation and
# encoding, to see which columns still contain missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   id                    1048575 non-null  int64  
 1   Age                   1048575 non-null  float64
 2   Gender                1048575 non-null  int64  
 3   Annual Income         1048575 non-null  float64
 4   Marital Status        1048575 non-null  int64  
 5   Number of Dependents  1048575 non-null  float64
 6   Education Level       1048575 non-null  int64  
 7   Occupation            1048575 non-null  int64  
 8   Health Score          1048575 non-null  float64
 9   Location              1048575 non-null  int64  
 10  Policy Type           1048575 non-null  int64  
 11  Previous Claims       1048575 non-null  float64
 12  Vehicle Age           1048570 non-null  float64
 13  Credit Score          1048575 non-null  float64
 14  Insurance Duration    1048574 non-

In [None]:
# Summarise dtypes and non-null counts of test_data after imputation and
# encoding, to see which columns still contain missing values.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    800000 non-null  int64  
 1   Age                   800000 non-null  float64
 2   Gender                800000 non-null  int64  
 3   Annual Income         800000 non-null  float64
 4   Marital Status        800000 non-null  int64  
 5   Number of Dependents  800000 non-null  float64
 6   Education Level       800000 non-null  int64  
 7   Occupation            800000 non-null  int64  
 8   Health Score          800000 non-null  float64
 9   Location              800000 non-null  int64  
 10  Policy Type           800000 non-null  int64  
 11  Previous Claims       800000 non-null  float64
 12  Vehicle Age           799997 non-null  float64
 13  Credit Score          800000 non-null  float64
 14  Insurance Duration    799998 non-null  float64
 15  

## 3. Create a Pipeline

In [None]:
# Remove the row identifier, the policy start date and the customer feedback
# column from the training frame in a single call.
train_data = train_data.drop(columns=['id', 'Policy Start Date', 'Customer Feedback'])

In [None]:
# Remove the same three columns from the test frame so its feature set
# matches the training frame.
test_data = test_data.drop(columns=['id', 'Policy Start Date', 'Customer Feedback'])

In [None]:
# 'Vehicle Age' is not used as a feature; remove it from the training frame.
train_data = train_data.drop(columns=['Vehicle Age'])

In [None]:
# 'Vehicle Age' is not used as a feature; remove it from the test frame.
test_data = test_data.drop(columns=['Vehicle Age'])

In [None]:
# Replace every missing value still present in the training frame with 0.
train_data = train_data.fillna(0)

In [None]:
# Replace every missing value still present in the test frame with 0.
test_data = test_data.fillna(0)

In [None]:
# Re-check the column list and non-null counts of test_data after the column
# drops and the fillna(0) pass.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Age                   800000 non-null  float64
 1   Gender                800000 non-null  int64  
 2   Annual Income         800000 non-null  float64
 3   Marital Status        800000 non-null  int64  
 4   Number of Dependents  800000 non-null  float64
 5   Education Level       800000 non-null  int64  
 6   Occupation            800000 non-null  int64  
 7   Health Score          800000 non-null  float64
 8   Location              800000 non-null  int64  
 9   Policy Type           800000 non-null  int64  
 10  Previous Claims       800000 non-null  float64
 11  Credit Score          800000 non-null  float64
 12  Insurance Duration    800000 non-null  float64
 13  Smoking Status        800000 non-null  int64  
 14  Exercise Frequency    800000 non-null  int64  
 15  

In [None]:
# Re-check the column list and non-null counts of train_data after the column
# drops and the fillna(0) pass.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 17 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   1048575 non-null  float64
 1   Gender                1048575 non-null  int64  
 2   Annual Income         1048575 non-null  float64
 3   Marital Status        1048575 non-null  int64  
 4   Number of Dependents  1048575 non-null  float64
 5   Education Level       1048575 non-null  int64  
 6   Occupation            1048575 non-null  int64  
 7   Health Score          1048575 non-null  float64
 8   Location              1048575 non-null  int64  
 9   Policy Type           1048575 non-null  int64  
 10  Previous Claims       1048575 non-null  float64
 11  Credit Score          1048575 non-null  float64
 12  Insurance Duration    1048575 non-null  float64
 13  Smoking Status        1048575 non-null  int64  
 14  Exercise Frequency    1048575 non-

In [None]:
# Separate the target from the features. The double brackets keep y as a
# one-column DataFrame rather than a Series.
y = train_data[['Premium Amount']]
X = train_data.drop(columns='Premium Amount')

In [None]:
# Hold out a validation split with a fixed seed so results are repeatable.
# test_size=0.25 is scikit-learn's default when neither size is given; it is
# spelled out here only to make the split ratio visible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

## 4. Train the Model

In [None]:
# Wrap both splits in XGBoost's DMatrix container, passing the target as the label.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [None]:
# Booster settings: squared-error regression with histogram tree construction
# on the GPU. XGBoost 2.0 deprecated tree_method="gpu_hist" in favour of
# tree_method="hist" combined with device="cuda".
# Change "device" to "cpu" when no GPU runtime is attached.
params = {
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "device": "cuda",
}

In [None]:
# Fit the baseline booster on the training split for n boosting rounds.
n = 100
model = xgb.train(params, dtrain_reg, num_boost_round=n)

In [None]:
# Predict the target for the held-out split wrapped in dtest_reg.
preds = model.predict(dtest_reg)

## 5. Evaluate the Model

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_log_error

# Score the baseline booster on the held-out split.
# dtest_reg and y_test come from the split made earlier in the notebook, so
# the split is not recomputed here.
preds = model.predict(dtest_reg)

# RMSLE takes the log of the predictions, so negative values are clipped to 0.
preds = np.clip(preds, a_min=0, a_max=None)

# Root mean squared *log* error between held-out targets and predictions.
rmsle = root_mean_squared_log_error(y_test, preds)

print(f"RMSLE of the base model: {rmsle:.3f}")

RMSLE of the base model: 1.148


In [None]:
# Booster settings for the monitored training run: squared-error regression
# with histogram tree construction on the GPU. XGBoost 2.0 deprecated
# tree_method="gpu_hist" in favour of tree_method="hist" with device="cuda".
# Change "device" to "cpu" when no GPU runtime is attached.
params = {
    "objective": "reg:squarederror",
    "tree_method": "hist",
    "device": "cuda",
}
# Number of boosting rounds.
n = 100

In [None]:
# Retrain with an evaluation watchlist so XGBoost reports the metric on both
# the training and the validation split after each boosting round.
evals = [
    (dtrain_reg, "train"),
    (dtest_reg, "validation"),
]

model = xgb.train(params, dtrain_reg, num_boost_round=n, evals=evals)

[0]	train-rmse:858.78342	validation-rmse:857.37656
[1]	train-rmse:854.10207	validation-rmse:852.87392
[2]	train-rmse:852.36263	validation-rmse:851.28905
[3]	train-rmse:851.15067	validation-rmse:850.22171
[4]	train-rmse:850.53138	validation-rmse:849.72942
[5]	train-rmse:849.77757	validation-rmse:849.13195
[6]	train-rmse:849.38082	validation-rmse:848.83559
[7]	train-rmse:849.00631	validation-rmse:848.61534
[8]	train-rmse:848.79955	validation-rmse:848.53376
[9]	train-rmse:848.47648	validation-rmse:848.33751
[10]	train-rmse:848.04713	validation-rmse:847.97669
[11]	train-rmse:847.48044	validation-rmse:847.44633
[12]	train-rmse:847.33013	validation-rmse:847.42196
[13]	train-rmse:847.18442	validation-rmse:847.42166
[14]	train-rmse:847.02192	validation-rmse:847.39068
[15]	train-rmse:846.90500	validation-rmse:847.38045
[16]	train-rmse:846.74499	validation-rmse:847.36410
[17]	train-rmse:846.61913	validation-rmse:847.34539
[18]	train-rmse:846.49625	validation-rmse:847.32200
[19]	train-rmse:846.31

## 6. Generate Submission File

Choose the model that has the best performance to generate a submission file.

In [None]:
# Load the sample submission file; it provides the 'id' column and the
# expected layout of the submission.
sf = pd.read_csv('/content/sample.csv')

In [None]:
# Inspect the columns, dtypes and row count the submission file has to match.
sf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              800000 non-null  int64  
 1   Premium Amount  800000 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 12.2 MB


In [None]:
# Build and save the submission file from the trained booster.

# The ids come from the sample submission because test_data dropped its own
# 'id' column earlier. The name avoids shadowing the built-in id().
# NOTE(review): this assumes sample.csv lists rows in the same order as
# test.csv -- confirm before submitting.
submission_ids = sf['id']

y_pred = model.predict(xgb.DMatrix(test_data))
# Apply the same non-negativity clip used when scoring the validation split;
# the log-based metric is undefined for negative predictions.
y_pred = np.clip(y_pred, a_min=0, a_max=None)

# Fail loudly instead of writing a misaligned or NaN-padded file.
if len(y_pred) != len(submission_ids):
    raise ValueError(
        f"Prediction count ({len(y_pred)}) does not match the number of "
        f"submission ids ({len(submission_ids)})."
    )

# Create a submission DataFrame in the sample file's two-column layout
submission_df = pd.DataFrame({
    'id': submission_ids,
    'Premium Amount': y_pred
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission_file.csv', index=False)
print("Submission file created: submission_file.csv")

Submission file created: submission_file.csv


In [None]:
# Verify the submission frame: column names, dtypes, row count and non-null counts.
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 2 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              800000 non-null  int64  
 1   Premium Amount  800000 non-null  float32
dtypes: float32(1), int64(1)
memory usage: 9.2 MB
