In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, make_scorer
import pickle
import matplotlib as plt


In [None]:
# Load the pre-split feature matrices ...
X_train = pd.read_csv("data/features/X_train.csv")
X_val = pd.read_csv("data/features/X_val.csv")
X_test = pd.read_csv("data/features/X_test.csv")
# ... and the matching target tables.
y_train = pd.read_csv("data/features/y_train.csv")
y_val = pd.read_csv("data/features/y_val.csv")
y_test = pd.read_csv("data/features/y_test.csv")

# Fold the validation split into the training split; the model cells below
# tune hyper-parameters with cross-validation on this combined set.
X_train = pd.concat([X_train, X_val])
y_train = pd.concat([y_train, y_val])

# Bring features and targets into the same row order by sorting each on its
# key column, then discard that key so only model inputs/outputs remain.
# Features are keyed by 'pothole_id'; targets are keyed by the column named '1'
# (presumably the pothole id in the y files -- TODO confirm).
X_train = X_train.sort_values(by=['pothole_id']).drop(columns=['pothole_id'])
y_train = y_train.sort_values(by=['1']).drop(columns=['1'])
X_test = X_test.sort_values(by=['pothole_id']).drop(columns=['pothole_id'])
y_test = y_test.sort_values(by=['1']).drop(columns=['1'])

In [None]:
# Replace missing feature values with the per-column mean.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')
# The column means are learned from the training rows only and then reused
# unchanged for the test rows.
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [None]:
# Remove outlier rows: a training row is dropped when any of its features lies
# 5 or more standard deviations away from that feature's mean.
from scipy import stats

threshold = 5
z = np.abs(stats.zscore(X_train_imputed))

# A zero-variance column produces NaN z-scores. Testing `z < threshold` treats
# NaN as "not below the threshold" and would therefore discard EVERY row as
# soon as one constant column exists. Flagging rows with `z >= threshold`
# instead leaves NaN entries unflagged, so constant columns never cause drops.
keep_rows = ~(z >= threshold).any(axis=1)

# Apply the same positional mask to features and targets so they stay aligned.
X_train_imputed = X_train_imputed[keep_rows]
y_train = y_train[keep_rows]

In [None]:
# Scaling: standardise the features, min-max scale the target.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Features: zero mean / unit variance, with statistics taken from the
# training rows only.
feature_scaler = StandardScaler()
feature_scaler.fit(X_train_imputed)
X_train_scaled = feature_scaler.transform(X_train_imputed)
X_test_scaled = feature_scaler.transform(X_test_imputed)

# Target: min-max scaling fitted on the training target.
# `scaler` deliberately ends up bound to this target scaler: later cells call
# `scaler.inverse_transform(...)` to map predictions back to original units.
scaler = MinMaxScaler()
scaler.fit(y_train)
y_train_scaled = scaler.transform(y_train)
y_test_scaled = scaler.transform(y_test)

# PCA


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Fit a 50-component PCA on the standardised training features.
pca = PCA(n_components=50)
pca.fit(X_train_scaled)

# Share of variance captured by each component, and the running total.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Plot the running total against the number of components kept
# (x axis is 1-based: "first k components").
component_counts = np.arange(1, len(cumulative_explained_variance) + 1)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(component_counts, cumulative_explained_variance, marker='o', linestyle='--')
ax.set_title('Scree Plot')
ax.set_xlabel('Number of Components')
ax.set_ylabel('Cumulative Explained Variance Ratio')
ax.grid()
plt.show()


In [None]:
# Project the standardised features onto the first 50 principal components.
# The projection is learned from the training rows only and then applied
# unchanged to the test rows. `random_state` is pinned (42, like the models
# below) so the result is reproducible when a randomized SVD solver is chosen.
pca = PCA(n_components=50, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# XGBoost


In [None]:
# Hyper-parameter grid searched for the XGBoost regressor.
# 3 candidate values for each of 6 parameters -> 729 combinations per CV fold.
param_grid = dict(
    n_estimators=[1100, 1500, 2000],      # number of boosting rounds
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 0.9, 1.0],            # row fraction per tree
    colsample_bytree=[0.1, 0.3, 0.5],     # column fraction per tree
    gamma=[0, 0.1, 0.2],
)

In [None]:
# Tune the XGBoost regressor with an exhaustive, 5-fold cross-validated grid
# search over `param_grid`, then persist the winning model.
xgb_model = XGBRegressor(random_state=42)

# Scorer based on mean squared error; greater_is_better=False makes the
# search treat lower MSE as better.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

search_options = dict(scoring=mse_scorer, cv=5, n_jobs=-1, verbose=2)
grid_search = GridSearchCV(xgb_model, param_grid, **search_options)

# Train on the PCA-reduced features against the min-max scaled target.
grid_search.fit(X_train_pca, y_train_scaled)

# Report the best combination found by the search.
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Keep the best estimator exposed by the search and write it to disk so the
# evaluation cell can load it without repeating the search.
best_model = grid_search.best_estimator_
with open('best_xgb_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
from sklearn.metrics import r2_score

# Reload the tuned XGBoost model written by the grid-search cell.
# Unpickling executes arbitrary code: only load pickles this notebook produced.
with open('best_xgb_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict on the held-out test set (PCA features) and score the predictions
# against the scaled test target.
y_pred = loaded_model.predict(X_test_pca)
mse = mean_squared_error(y_test_scaled, y_pred)
r2 = r2_score(y_test_scaled, y_pred)
print(f"Mean Squared Error on the test set: {mse:.4f}")
print(f"R-squared value on the test set: {r2:.4f}")

# Undo the target scaling to get predictions in the original units;
# the scaler expects a 2-D column, hence the reshape.
y_output = scaler.inverse_transform(y_pred.reshape(-1, 1))

# Random Forest Regressor


In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [5000, 10000],
    'max_depth': [None], #5, 10, 20, 30],
    'min_samples_split': [2], #5, 10],
    'min_samples_leaf': [4, 8, 12],
    'max_features': ['sqrt'] #'log2'],
}

In [None]:
# Tune the random forest with an exhaustive, 5-fold cross-validated grid
# search over `param_grid`, then persist the winning model.
rf_model = RandomForestRegressor(random_state=42)

# Scorer based on mean squared error; greater_is_better=False makes the
# search treat lower MSE as better.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

search_options = dict(scoring=mse_scorer, cv=5, n_jobs=-1, verbose=2)
grid_search = GridSearchCV(rf_model, param_grid, **search_options)

# Train on the PCA-reduced features; the target is flattened to 1-D.
grid_search.fit(X_train_pca, y_train_scaled.ravel())

# Report the best combination found by the search.
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Keep the best estimator exposed by the search and write it to disk so the
# evaluation cell can load it without repeating the search.
best_model = grid_search.best_estimator_
with open('best_rf_model.pkl', 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
from sklearn.metrics import r2_score

# Reload the tuned random forest written by the grid-search cell.
# Unpickling executes arbitrary code: only load pickles this notebook produced.
with open('best_rf_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Predict on the held-out test set (PCA features) and score the predictions
# against the scaled test target.
y_pred = loaded_model.predict(X_test_pca)
mse = mean_squared_error(y_test_scaled, y_pred)
r2 = r2_score(y_test_scaled, y_pred)
print(f"Mean Squared Error on the test set: {mse:.4f}")
print(f"R-squared value on the test set: {r2:.4f}")

# Undo the target scaling to get predictions in the original units;
# the scaler expects a 2-D column, hence the reshape.
y_output = scaler.inverse_transform(y_pred.reshape(-1, 1))

# Support Vector Regression


In [None]:
from sklearn.svm import SVR

In [None]:
# Define the extended parameter grid for hyperparameter tuning
param_grid = {
    'C': [500, 600, 800, 1000],  # Increased range of C values
    'kernel': ['rbf', 'sigmoid'], #'poly', 'sigmoid'],  # Added more kernel types
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],  # Added specific gamma values
    'epsilon': [0.01, 0.1, 0.2, 0.5]  # Added more epsilon values
}

In [None]:
## Don't run. It runs for a long time

# Set up cross-validation and hyperparameter tuning
svr_model = SVR()

# Use mean squared error as the scoring metric
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=svr_model,
    param_grid=param_grid,
    scoring=mse_scorer,
    cv=10,
    n_jobs=-1,
    verbose=2
)

# Fit the model with hyperparameter tuning
grid_search.fit(X_train_lasso, y_train_scaled.ravel())

# Best parameters found
best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

# Use the best estimator to predict on the test set
best_model = grid_search.best_estimator_

# Save the model to a file
with open('best_svr_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [None]:
from sklearn.metrics import r2_score

# Load the model from the file
with open('best_svr_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Step 4: Evaluate the model

y_pred = loaded_model.predict(X_test_lasso)
mse = mean_squared_error(y_test_scaled, y_pred)
print(f"Mean Squared Error on the test set: {mse:.4f}")

# Get the r-squared value
r2 = r2_score(y_test_scaled, y_pred)
print(f"R-squared value on the test set: {r2:.4f}")

# Linear Regression


In [None]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares on the reduced feature set.
#
# NOTE(review): this cell used to fit and predict on `X_train_lasso` /
# `X_test_lasso`, but no cell in this notebook defines those names, so it
# raised NameError on a fresh kernel. It now uses the PCA features, like the
# XGBoost and random forest sections.
lr_model = LinearRegression()
lr_model.fit(X_train_pca, y_train_scaled)

# Use the model to predict on the test set
y_pred = lr_model.predict(X_test_pca)

# Get the mean squared error against the scaled test target
mse = mean_squared_error(y_test_scaled, y_pred)
print(f"Mean Squared Error on the test set: {mse:.4f}")

# Get the r-squared value
r2 = r2_score(y_test_scaled, y_pred)
print(f"R-squared value on the test set: {r2:.4f}")

# Save the model to a file
with open('lr_model.pkl', 'wb') as file:
    pickle.dump(lr_model, file)

# Undo the target scaling to get predictions in the original units;
# the scaler expects a 2-D column, hence the reshape.
y_output = scaler.inverse_transform(y_pred.reshape(-1, 1))

# Neural Network


In [None]:
from sklearn.metrics import r2_score
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

# Small fully connected regressor. Unlike the models above it is fed the
# standardised features directly (X_train_scaled), not the PCA projection.
# Both hidden layers use ReLU with an L2 weight penalty and are followed by
# 20% dropout; the output is a single linear unit.
model = Sequential([
    Dense(30, activation='relu', input_shape=(X_train_scaled.shape[1],), kernel_regularizer=l2(0.01)),
    Dropout(0.2),
    Dense(15, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.2),
    Dense(1, activation='linear'),
])
model.compile(optimizer='adam', loss='mean_squared_error')

# Stop once the validation loss has not improved for 70 epochs and roll the
# weights back to the best epoch seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=70, restore_best_weights=True)

# Train against the scaled target, one sample per gradient step, with 20% of
# the training rows held out for validation.
history = model.fit(
    X_train_scaled,
    y_train_scaled,
    epochs=500,
    batch_size=1,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Persist the trained network.
model.save('nn_model.h5')

# Score on the test set in ORIGINAL target units: predictions are mapped back
# through the target scaler and compared with the unscaled y_test.
y_pred_scaled = model.predict(X_test_scaled)
y_pred = scaler.inverse_transform(y_pred_scaled)
r2 = r2_score(y_test, y_pred)
print(f"R-squared value on the test set: {r2:.4f}")