In [None]:
import pandas as pd

# Load the dataset from processed_data.csv (path is relative to the working directory).
df = pd.read_csv('processed_data.csv')
# Preview the first rows; as the last expression it is rendered as the cell output.
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encode(df):
    """
    Integer-encode every object-dtype column of a DataFrame.

    Each object column is cast to ``str`` and then run through its own,
    freshly fitted ``LabelEncoder``. The input frame is left untouched and
    the fitted encoders are discarded once the column has been transformed.

    Parameters:
    - df: DataFrame containing the full dataset.

    Returns:
    - df_encoded: A copy of ``df`` whose object columns hold integer codes;
      all other columns are carried over unchanged.
    """
    result = df.copy()

    # Only object-dtype columns are treated as categorical.
    object_cols = df.select_dtypes(include=['object']).columns.tolist()

    for name in object_cols:
        # Cast to str first so mixed-type values can be handed to one encoder.
        as_text = result[name].astype(str)
        result[name] = LabelEncoder().fit_transform(as_text)

    return result

In [None]:
# Encode the object-dtype columns; label_encode works on a copy, so `df` keeps its raw values.
df_encoded = label_encode(df)
# Preview the encoded frame; as the last expression it is rendered as the cell output.
df_encoded.head()

In [None]:
def split_data(df, year_split=2022, year_column='Year'):
    """
    Split the data into training and test sets based on a year column.

    Rows whose year is less than or equal to ``year_split`` form the training
    set; rows with a strictly later year form the test set. Rows for which
    neither comparison holds (e.g. a missing year) end up in neither set.

    Parameters:
    - df: DataFrame containing the full dataset.
    - year_split: The last year that belongs to the training set (default is 2022).
    - year_column: The name of the column holding the year (default is 'Year').

    Returns:
    - train_data: DataFrame containing the training data.
    - test_data: DataFrame containing the test data.

    Raises:
    - KeyError: If ``year_column`` is not a column of ``df``.
    """
    if year_column not in df.columns:
        raise KeyError(f"Column '{year_column}' not found in the DataFrame.")

    years = df[year_column]

    # Copy the selections so later assignments on either split never touch
    # (or warn about) the original frame.
    train_data = df[years <= year_split].copy()
    test_data = df[years > year_split].copy()

    return train_data, test_data

In [None]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

def train_gradient_boosting(train_data, target_column, feature_columns, n_splits=5,
                            scoring='neg_root_mean_squared_error', param_grid=None):
    """
    Train a Gradient Boosting regressor using GridSearchCV with time-series cross-validation.

    Parameters:
    - train_data: DataFrame containing the training data.
    - target_column: The name of the target variable column (dependent variable).
    - feature_columns: List of column names to be used as features (independent variables).
    - n_splits: Number of splits for time-series cross-validation (default is 5).
    - scoring: The scoring metric for GridSearchCV (default is negated RMSE,
      so that a higher score is better).
    - param_grid: Optional hyperparameter grid handed to GridSearchCV. When None,
      the built-in grid below is used (3 values for each of 6 parameters,
      i.e. 729 candidates per cross-validation split).

    Returns:
    - best_model: The best trained model after hyperparameter tuning.
    - best_params: The best set of hyperparameters found.
    - r2_train: R² score on the training set.
    """
    if param_grid is None:
        param_grid = {
            'learning_rate': [0.05, 0.2, 0.1],
            'min_samples_split': [2, 3, 5],
            'max_depth': [3, 4, 5],
            'min_samples_leaf': [1, 3, 5],
            'subsample': [0.6, 0.8, 1.0],
            'n_estimators': [100, 200, 300],
        }

    X_train = train_data[feature_columns]
    y_train = train_data[target_column]

    gbr = GradientBoostingRegressor(random_state=42)
    # TimeSeriesSplit builds its folds from row position.
    # NOTE(review): this function does not sort train_data, so it presumably
    # expects the caller to pass rows in chronological order — confirm.
    tscv = TimeSeriesSplit(n_splits=n_splits)

    grid_search = GridSearchCV(
        estimator=gbr,
        param_grid=param_grid,
        cv=tscv,
        scoring=scoring,
        n_jobs=-1,
        verbose=2,
    )
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # R² is measured on the same rows the model was fitted on, so it is an
    # in-sample figure rather than an estimate of generalisation.
    y_train_pred = best_model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)

    return best_model, best_params, r2_train

In [None]:
from sklearn.metrics import mean_squared_error

def evaluate_model(model, test_data, target_column, feature_columns):
    """
    Evaluate the model using RMSE on the test data.

    Parameters:
    - model: The trained model to be evaluated; only its ``predict`` method is used.
    - test_data: DataFrame containing the test data.
    - target_column: The name of the target variable column.
    - feature_columns: List of column names to be used as features.

    Returns:
    - rmse: Root Mean Squared Error (RMSE) on the test set.
    """
    X_test = test_data[feature_columns]
    y_test = test_data[target_column]

    y_pred = model.predict(X_test)

    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so
    # relying on it raises TypeError on current releases.
    rmse = float(mean_squared_error(y_test, y_pred)) ** 0.5

    return rmse

In [None]:
# Split the encoded data by year, using split_data's default cut-off year.
train_data, test_data = split_data(df_encoded)
# Show the held-out rows; as the last expression it is rendered as the cell output.
test_data

In [None]:
# Step 2: Select the features — every column of the training frame except the target.
# The list is taken from train_data (the frame actually fed to the model) so it
# stays correct even if the encoding step ever changes the column set.
TARGET_COLUMN = "TSR"
feature_columns = [col for col in train_data.columns if col != TARGET_COLUMN]

# Step 3: Train the model with GridSearchCV
best_model, best_params, r2_train = train_gradient_boosting(
    train_data,
    target_column=TARGET_COLUMN,
    feature_columns=feature_columns,
)

# Step 4: Evaluate the model on the held-out rows
rmse = evaluate_model(
    best_model,
    test_data,
    target_column=TARGET_COLUMN,
    feature_columns=feature_columns,
)

# Output the results
print(f"\n Best Hyperparameters: {best_params}")
print(f"\n R2 on Train-set: {r2_train}")
print(f"\n Root Mean Squared Error (RMSE) on Test Set: {rmse}")

In [None]:
import joblib
def dump_model(best_model, model_path):
    """
    Persist a trained model to disk with joblib.

    Parameters:
    - best_model: The model object to serialise.
    - model_path: Destination handed to ``joblib.dump``. When it is a string or
      path-like object, any missing parent directories are created first;
      other kinds of destination are passed through untouched.
    """
    import os  # local import: nothing else in this cell needs it

    if isinstance(model_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(model_path))
        # An empty dirname means the current directory, which needs no creation.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(best_model, model_path)

# Destination for the serialised model, relative to the working directory.
# NOTE(review): the file is named "xgb" although best_model is produced by
# train_gradient_boosting — confirm downstream loaders before renaming.
model_path = "../model_folder/xgb.joblib"
dump_model(best_model, model_path)