In [3]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Location of the dataset CSV. The default is the path this notebook was
# originally run with; set the CAR_HEALTH_CSV environment variable to point at
# the file on another machine instead of editing the notebook.
data_path = os.environ.get(
    "CAR_HEALTH_CSV",
    "/mnt/c/Users/krish/Documents/Internships/State Farm/2024 Hack Day/statefarm-hackathon/Car_Health_Metrics_Dataset.csv",
)

# Read the whole CSV into memory; the bare `df` on the last line displays it.
df = pd.read_csv(data_path)
df

Unnamed: 0,HeartRate,BloodOxygenLevel,NoiseLevel,Score
0,63.761181,97.479150,70.468829,88.709160
1,78.240789,99.177881,77.597383,87.983947
2,64.434949,97.105226,69.537317,88.077492
3,62.179323,99.565248,62.657563,78.271638
4,80.943299,97.846916,67.761804,81.665422
...,...,...,...,...
34995,80.524325,99.630628,71.609517,84.454565
34996,70.657464,95.821085,64.271135,86.434756
34997,82.954991,97.306875,73.979477,85.331360
34998,81.769338,98.222699,80.388383,82.619580


In [40]:
# Features are the three measurement columns; the target is the 'Score' column.
feature_columns = ["HeartRate", "BloodOxygenLevel", "NoiseLevel"]
X = df[feature_columns]
y = df["Score"]

# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model pipeline: degree-2 polynomial expansion (no bias column), then
# standardisation, then an XGBoost regressor with a fixed seed.
pipeline_steps = [
    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ("scaler", StandardScaler()),
    ("regressor", xgb.XGBRegressor(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

# Search space for the regressor step: three candidate values for each of five
# hyperparameters (3**5 = 243 combinations). The 'regressor__' prefix routes
# each setting to the pipeline step of that name.
param_grid = {
    "regressor__n_estimators": [100, 200, 300],
    "regressor__max_depth": [3, 6, 9],
    "regressor__learning_rate": [0.01, 0.1, 0.2],
    "regressor__subsample": [0.6, 0.8, 1.0],
    "regressor__colsample_bytree": [0.6, 0.8, 1.0],
}

In [43]:
# Exhaustive search over param_grid with 5-fold cross-validation on the
# training split. Candidates are ranked by negative MSE (higher is better) and
# n_jobs=-1 runs the fits in parallel on all available cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

Best Parameters: {'regressor__colsample_bytree': 1.0, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 9, 'regressor__n_estimators': 300, 'regressor__subsample': 0.6}


In [44]:
# GridSearchCV was created without a `refit` argument, so its default
# (refit=True) applies: after the search it refits the best pipeline on the
# whole of X_train. `best_estimator_` is therefore already trained on the full
# training set, and fitting it again here would only repeat that work.
best_model = grid_search.best_estimator_

# Predict on the held-out test split and score the predictions.
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Best MSE: {mse}')
print(f'Best R^2: {r2}')

Best MSE: 0.10437803442392928
Best R^2: 0.9977354050908246


In [47]:
# Two hand-written samples; values are in the same column order the model was
# trained on: HeartRate, BloodOxygenLevel, NoiseLevel.
new_data = np.array([
    [70, 98, 75],
    [65, 97, 70]
])

# The pipeline was fitted on a DataFrame with named columns, so give the new
# samples the same column names. scikit-learn warns when an estimator fitted
# with feature names is later handed a bare array.
new_samples = pd.DataFrame(
    new_data,
    columns=['HeartRate', 'BloodOxygenLevel', 'NoiseLevel'],
)

# Pipeline.predict runs poly -> scaler -> regressor in order, so there is no
# need to step through `named_steps` by hand.
predictions = best_model.predict(new_samples)
print(f"Predictions: {predictions}")

Predictions: [99.72121 88.96499]




In [1]:
import joblib

# Persist the whole pipeline object (poly, scaler and regressor steps together)
# to a single file in the current working directory. `best_model` must already
# exist in the kernel, i.e. the training cells have to be run first.
joblib_file = "driver_xgb_model.pkl"
joblib.dump(value=best_model, filename=joblib_file)
print(f"Model saved to {joblib_file}")

NameError: name 'best_model' is not defined

In [4]:
# Restore the saved pipeline. NOTE: joblib.load unpickles the file, which can
# execute arbitrary code, so only load model files from a trusted source.
loaded_model = joblib.load(joblib_file)
print("Model loaded from disk")

# One sample, in training column order: HeartRate, BloodOxygenLevel, NoiseLevel.
new_data = np.array([[78, 98, 40]])

# Attach the training column names so the input matches what the pipeline was
# fitted on. scikit-learn warns when an estimator fitted with feature names is
# later handed a bare array.
new_samples = pd.DataFrame(
    new_data,
    columns=['HeartRate', 'BloodOxygenLevel', 'NoiseLevel'],
)

# Pipeline.predict applies poly -> scaler -> regressor itself; stepping through
# `named_steps` manually duplicates that logic and is easy to get out of sync.
predictions = loaded_model.predict(new_samples)
print(f"Predictions: {predictions}")

Model loaded from disk
Predictions: [75.78229]


