# Light Gradient-Boosting Machine

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

filename = "encoded_data_03_12.csv"
df = pd.read_csv(f"../DataSets/EncodedData/{filename}", sep=",", encoding="UTF-8")
df.head()

Unnamed: 0,height_in_cm,year,goals_for,goals_against,goals,assists,red_cards,yellow_cards,minutes_played,age_at_evaluation,log_market_value_base10,country_of_citizenship_encoded,sub_position_encoded,club_id_encoded,domestic_competition_id_encoded
0,184.0,2012.75,31.0,14.0,11.0,1.0,0.0,6.0,1483.0,35,6.60206,6.445992,6.241155,6.659857,6.525779
1,184.0,2013.25,24.0,15.0,5.0,2.0,0.0,2.0,1102.0,35,6.30103,6.445992,6.241155,6.659857,6.525779
2,184.0,2013.75,14.0,21.0,4.0,2.0,0.0,1.0,950.0,36,6.0,6.445992,6.241155,6.659857,6.525779
3,184.0,2014.25,24.0,22.0,4.0,3.0,0.0,1.0,1270.0,36,6.0,6.445992,6.241155,6.659857,6.525779
4,184.0,2014.75,31.0,18.0,4.0,3.0,0.0,2.0,496.0,37,6.0,6.445992,6.241155,6.659857,6.525779


In [2]:
# Definerer parametrene for LightGBM-regresjonsmodellen
params = {
    'boosting_type': 'gbdt',    # Gradient Boosting Decision Tree
    'objective': 'regression',  # Regresjon
    'metric': 'l2',             # Målel2-feil (MSE)
    'num_leaves': 31,           # Maksimalt antall blader i et tre
    'learning_rate': 0.05,      # Læringshastighet
    'bagging_fraction': 0.8,    # Andel av data brukt for bagging
    'bagging_freq': 5,          # Frekvens for bagging
    'feature_fraction': 0.8,    # Andel av funksjoner brukt for tre-bygging
    'lambda_l1': 10             # L1-regulariseringsparameter
}

In [3]:
X = df.drop(columns=['log_market_value_base10'])
y = df['log_market_value_base10']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Konverterer datasettene til LightGBM-format
lgb_train = lgb.Dataset(X_train, y_train)
lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

# Trener LightGBM-regresjonsmodellen
num_round = 100
bst = lgb.train(params, lgb_train, num_round, valid_sets=[lgb_train, lgb_test])

# Gjør prediksjoner på testdataene
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# Reverse normalization for actual values
y_pred_original = np.power(10, y_pred)
y_test_original = np.power(10, y_test)


# Calculate the mean squared error
mse = mean_squared_error(y_test_original, y_pred_original)
print(f'Mean Squared Error: {mse}')

# Calculate the mean squared percentage error
percentage_error = (mse / y_test_original.mean()) * 100
print(f'Mean squared Percentage Error: {percentage_error}%')

# Initialize an empty list to store percentage errors
percentage_errors = []

# Calculate the percentage error for each prediction
for i in range(len(y_pred_original)):
    pred = y_pred_original[i]
    actual = y_test_original.values[i]  # Convert to array for indexing
    percentage_error = ((pred - actual) / actual) * 100
    percentage_errors.append(percentage_error)

# Calculate the mean percentage error
mean_percentage_error = np.mean(percentage_errors)
print(f'Mean Percentage Error: {mean_percentage_error}%')

# Calculate Mean Absolute Percentage Error (MAPE)
absolute_percentage_errors = np.abs((y_pred_original - y_test_original) / y_test_original) * 100
mape = np.mean(absolute_percentage_errors)

# Calculate Accuracy
accuracy = (1 - (mape / 100)) * 100
print(f'Accuracy: {accuracy}%')


from sklearn.metrics import mean_absolute_percentage_error
result = mean_absolute_percentage_error(y_pred=y_pred_original, y_true=y_test_original)
print("MAPE", result)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1014
[LightGBM] [Info] Number of data points in the train set: 114291, number of used features: 14
[LightGBM] [Info] Start training from score 6.205465
Mean Squared Error: 36774136020702.21
Mean squared Percentage Error: 733199713.0237198%
Mean Percentage Error: 27.769660718795922%
Accuracy: 37.53275266893321%
MAPE 0.6246724733106681
