In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Input files, resolved relative to the notebook's working directory
TRAIN_CSV = 'train_data_imputed.csv'
TEST_CSV = 'test_data.csv'

# Load both tables into DataFrames.
# NOTE(review): test_data is not referenced again in the cells visible here — confirm it is used later.
train_data = pd.read_csv(TRAIN_CSV)
test_data = pd.read_csv(TEST_CSV)

In [3]:
# Operate on UserID
# Drop the first character of each UserID string and convert the remainder to float
# so the column can be used as a numeric model input.
# (presumably the first character is a non-numeric prefix — verify against the CSV)
#
# The dtype guard makes this cell idempotent: after the first run the column is
# already float, and calling `.str` on it again would raise. Re-running the cell
# is therefore a no-op instead of an error.
if not pd.api.types.is_numeric_dtype(train_data['UserID']):
    train_data['UserID'] = train_data['UserID'].str[1:].astype(float)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Column the model is trained to predict, and the seven input columns used in this run
target = 'ResponseValue'
features = [
    'UserID',
    'QuestionTiming_binary',
    'LastTaskCompletedEncoded',
    'CurrentTaskEncoded',
    'CurrentGameModeEncoded',
    'LevelProgressionAmount',
    'CurrentSessionLength',
]

X, y = train_data[features], train_data[target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit a Gradient Boosting Regressor on the training portion
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
model.fit(X_train, y_train)

# Score the held-out rows with mean absolute error
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error on validation set: {mae}')

# Report the fitted model's importance score for each input column
feature_importances = model.feature_importances_
for idx, feature in enumerate(features):
    importance = feature_importances[idx]
    print(f'Feature: {feature}, Importance: {importance}')

Mean Absolute Error on validation set: 161.42871597199374
Feature: UserID, Importance: 0.29523568448330556
Feature: QuestionTiming_binary, Importance: 0.0008537552214922872
Feature: LastTaskCompletedEncoded, Importance: 0.39154696405879263
Feature: CurrentTaskEncoded, Importance: 0.04641982928051566
Feature: CurrentGameModeEncoded, Importance: 0.009068563899986067
Feature: LevelProgressionAmount, Importance: 0.009202940555985913
Feature: CurrentSessionLength, Importance: 0.24767226249992175


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error


# Same experiment with a reduced input set: LevelProgressionAmount and
# CurrentSessionLength are left out of this run
target = 'ResponseValue'
features = [
    'UserID',
    'QuestionTiming_binary',
    'LastTaskCompletedEncoded',
    'CurrentTaskEncoded',
    'CurrentGameModeEncoded',
]

X, y = train_data[features], train_data[target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit a Gradient Boosting Regressor on the training portion
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
model.fit(X_train, y_train)

# Score the held-out rows with mean absolute error
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error on validation set: {mae}')

# Report the fitted model's importance score for each input column
feature_importances = model.feature_importances_
for idx, feature in enumerate(features):
    importance = feature_importances[idx]
    print(f'Feature: {feature}, Importance: {importance}')

Mean Absolute Error on validation set: 165.72552356205136
Feature: UserID, Importance: 0.47473384301884725
Feature: QuestionTiming_binary, Importance: 0.014189967696993572
Feature: LastTaskCompletedEncoded, Importance: 0.3005001392426165
Feature: CurrentTaskEncoded, Importance: 0.18585125745099265
Feature: CurrentGameModeEncoded, Importance: 0.02472479259055004


In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error


# Third input set: the two task-encoding columns are dropped and
# CurrentSessionLength is included
target = 'ResponseValue'
features = [
    'UserID',
    'QuestionTiming_binary',
    'CurrentGameModeEncoded',
    'CurrentSessionLength',
]

X, y = train_data[features], train_data[target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
# The resulting X_train / X_val / y_train / y_val are reused by the grid-search cell below.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.2)

# Fit a Gradient Boosting Regressor on the training portion
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
model.fit(X_train, y_train)

# Score the held-out rows with mean absolute error
y_pred = model.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error on validation set: {mae}')

# Report the fitted model's importance score for each input column
feature_importances = model.feature_importances_
for idx, feature in enumerate(features):
    importance = feature_importances[idx]
    print(f'Feature: {feature}, Importance: {importance}')

Mean Absolute Error on validation set: 166.6193408715848
Feature: UserID, Importance: 0.5958701000424266
Feature: QuestionTiming_binary, Importance: 0.003427560500501896
Feature: CurrentGameModeEncoded, Importance: 0.015246607889255656
Feature: CurrentSessionLength, Importance: 0.38545573156781593


In [7]:
# Baseline: Gradient Boosting Regressor with library-default hyperparameters.
# X_train / X_val / y_train / y_val are the split produced by the previous cell.
gbr = GradientBoostingRegressor(random_state=42)
gbr.fit(X_train, y_train)

# Validation MAE of the baseline
y_pred = gbr.predict(X_val)
mae = mean_absolute_error(y_val, y_pred)
print(f'Mean Absolute Error on validation set: {mae}')

# Candidate values per hyperparameter; the search tries every combination
# (3 * 3 * 3 * 3 * 3 = 243 candidates)
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 3-fold cross-validated grid search scored by negated MAE; n_jobs=-1 runs fits in parallel
grid_search = GridSearchCV(
    estimator=gbr,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Winning hyperparameter combination
print(f'Best parameters found: {grid_search.best_params_}')

# Take the estimator for the winning combination; no extra fit call is made here
# (GridSearchCV refits the best candidate on the full training data by default)
best_gbr = grid_search.best_estimator_

# Validation MAE of the tuned model, for comparison with the baseline above
y_pred_best = best_gbr.predict(X_val)
mae_best = mean_absolute_error(y_val, y_pred_best)
print(f'Mean Absolute Error on validation set with best parameters: {mae_best}')


Mean Absolute Error on validation set: 166.6193408715848
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters found: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Mean Absolute Error on validation set with best parameters: 149.02215704945667
