In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [760]:
# we read in the our csv with new extracted features
df=pd.read_csv("featured_reviews.csv",parse_dates=["timestamp_created"])

We will try to model if we can predict if the review would be positive or negative based on the content of the review.

In [762]:
# we define our features and target
y = df['voted_up']
X = df['review']

In [764]:
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['voted_up'], test_size=0.2
)

The BERT model (Bidirectional Encoder Representations from Transformers) is a pre-trained transformer-based language model designed to understand text context bidirectionally. We will leverage it for text classification by fine-tuning it on our dataset to predict whether a review was positive or negative (voted_up).

Each review was: <br>
<ul>
    <li>Split into tokens</li>
    <li>Converted into unique token IDs (input_ids)</li>
    <li>Padded or truncated to a maximum length of 128 tokens</li>
    <li>Generated an attention_mask to indicate which tokens should be attended to</li>
</ul>

We created a ReviewDataset class to manage the tokenized inputs and corresponding labels. <br>
The dataset was then split into training and testing sets (X_train, y_train, X_test, y_test).

In [766]:
from transformers import BertTokenizer

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ReviewDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create datasets
train_dataset = ReviewDataset(X_train, y_train)
test_dataset = ReviewDataset(X_test, y_test)

In [780]:
# raw data before transformation ->
print("Original Train Data (First 5 Rows):")
raw_sample = pd.DataFrame({
    'review': X_train.iloc[:5],
    'voted_up': y_train.iloc[:5]
})
print(raw_sample)

Original Train Data (First 5 Rows):
                                                  review  voted_up
1288                                    yeah pretty good         1
31280  well thought made game complexity slowly build...         1
39225                                          hyvä peli         1
17598  god love game played bg3 need good play game l...         1
31369                          1010 best game ive played         1


In [776]:
# Display transformed data (tokenized)
def display_transformed_data(dataset, n=5):
    """
    Function to display the transform review data (tokenized)
    """
    print("\nTransformed Train Data (Tokenized):")
    transformed_samples = []
    for i in range(n):
        sample = dataset[i]
        transformed_samples.append({
            'input_ids': sample['input_ids'].tolist(),
            'attention_mask': sample['attention_mask'].tolist(),
            'label': sample['label'].item()
        })
    transformed_df = pd.DataFrame(transformed_samples)
    print(transformed_df)

# first 5 samples from the transformed dataset ->
display_transformed_data(train_dataset)


Transformed Train Data (Tokenized):
                                           input_ids  \
0  [101, 3398, 3492, 2204, 102, 0, 0, 0, 0, 0, 0,...   
1  [101, 2092, 2245, 2081, 2208, 11619, 3254, 164...   
2  [101, 1044, 2100, 3567, 21877, 3669, 102, 0, 0...   
3  [101, 2643, 2293, 2208, 2209, 1038, 2290, 2509...   
4  [101, 7886, 2692, 2190, 2208, 4921, 2063, 2209...   

                                      attention_mask  label  
0  [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...      1  
1  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1  
2  [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...      1  
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...      1  
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...      1  


In [17]:
from torch.utils.data import DataLoader

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [19]:
# Load pre-trained BERT model for binary classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
optimizer = AdamW(model.parameters(), lr=2e-5)

In [23]:
# Training loop
epochs = 3
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        
        # Move data to GPU
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

Epoch 1, Loss: 0.2058819322121209
Epoch 2, Loss: 0.13247972990714788
Epoch 3, Loss: 0.07910977544507167


In [25]:
# Evaluation loop
model.eval()
y_preds = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, axis=1)
        
        y_preds.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Calculate metrics
print("Accuracy:", accuracy_score(y_true, y_preds))
print("F1-Score:", f1_score(y_true, y_preds))
print("Classification Report:")
print(classification_report(y_true, y_preds))

Accuracy: 0.934370613008891
F1-Score: 0.9635785236642213
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.69      0.67       826
           1       0.97      0.96      0.96      7722

    accuracy                           0.93      8548
   macro avg       0.81      0.82      0.82      8548
weighted avg       0.94      0.93      0.94      8548



Text reviews (review) contain detailed feedback, but their raw form is difficult for machine learning models to interpret directly. By extracting a sentiment score, we provide a numerical representation of the review’s polarity, making it easier for the model to process.
VADER (Valence Aware Dictionary and sEntiment Reasoner) is a pre-trained rule-based sentiment analysis tool from the NLTK library, designed to handle social media text and other short reviews. <br>
It provides four sentiment scores for a given text:
<ul>
    <li>Positive: Proportion of positive words</li>
    <li>Neutral: Proportion of neutral words</li>
    <li>Negative: Proportion of negative words</li>
    <li>Compound: A single aggregated score that represents the overall sentiment of the text</li>
</ul>
For each review in the review column, we applied sia.polarity_scores(x) to compute sentiment scores. We selected the compound score, which is a normalized value between -1 (most negative) and 1 (most positive). This value is stored as the new column “sentiment_score”.

In [59]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [782]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# Apply sentiment analysis to each review
df['sentiment_score'] = df['review'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [784]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['review', 'review_length', 'sentiment_score']],  # Include all features
    df['voted_up'], 
    test_size=0.2, 
)

In [790]:
# Display raw data before transformation
X_test.head()

Unnamed: 0,review,review_length,sentiment_score
36847,best cod since og mw2 imho vastly underrated b...,115,0.807
36953,n ghbfrtvfrewdw2,16,0.0
32947,game surprised thought would pretty slow much ...,135,0.7876
28797,fun game,8,0.5106
42616,dota best ever game,19,0.6369


In [792]:
# Columns to normalize
numerical_cols = ['review_length', 'sentiment_score']

scaler = MinMaxScaler()

# Fit on training data and transform both train and test sets
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [794]:
X_train.head()

Unnamed: 0,review,review_length,sentiment_score
24083,first review maybe 15 years platform felt ough...,0.01913,0.969998
31917,great game lot think,0.002376,0.812441
36458,perfect,0.00075,0.785939
9535,really fun really cool,0.002626,0.877144
17997,good,0.000375,0.720186


Similar to the last model we created a custom Dataset class (ReviewDataset) to include:
<ul>
    <li>Text Features: Tokenized input IDs and attention masks</li>
    <li>Numerical Features: Normalized review_length and sentiment_score</li>
    <li>Labels: voted_up</li>
</ul>
The key difference is that the __getitem__ method was updated to return both numerical features and tokenized text features, making the dataset compatible with the combined BERT model

In [55]:
class ReviewDataset(Dataset):
    def __init__(self, texts, numerical_features, labels):
        self.texts = texts
        self.numerical_features = numerical_features
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        num_features = torch.tensor(self.numerical_features.iloc[idx], dtype=torch.float)
        label = self.labels.iloc[idx]
        
        # Tokenize the text
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'numerical_features': num_features,
            'label': torch.tensor(label, dtype=torch.long)
        }

In [57]:
# Create datasets
train_dataset = ReviewDataset(
    texts=X_train['review'],
    numerical_features=X_train[numerical_cols],
    labels=y_train
)

test_dataset = ReviewDataset(
    texts=X_test['review'],
    numerical_features=X_test[numerical_cols],
    labels=y_test
)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [63]:
class CustomBERTModel(nn.Module):
    def __init__(self, bert_model):
        super(CustomBERTModel, self).__init__()
        self.bert = bert_model
        self.fc = nn.Linear(768 + len(numerical_cols), 2)  # Adjusted for the number of numerical features
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, numerical_features):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  
        combined = torch.cat((cls_output, numerical_features), dim=1)  # Concatenated with numerical features
        logits = self.fc(self.dropout(combined))
        return logits

In [69]:
from transformers import BertModel

bert_model = BertModel.from_pretrained('bert-base-uncased')

model = CustomBERTModel(bert_model).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
epochs = 3
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numerical_features = batch['numerical_features'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, numerical_features)
        loss = nn.CrossEntropyLoss()(logits, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

  num_features = torch.tensor(self.numerical_features.iloc[idx], dtype=torch.float)


Epoch 1, Loss: 0.20206833520540718
Epoch 2, Loss: 0.1320957051658507
Epoch 3, Loss: 0.0781316688128422


In [71]:
model.eval()
y_preds = []
y_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        numerical_features = batch['numerical_features'].to(device)
        labels = batch['label'].to(device)

        logits = model(input_ids, attention_mask, numerical_features)
        preds = torch.argmax(logits, axis=1)
        y_preds.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

# Evaluate
from sklearn.metrics import classification_report
print("Classification Report:")
print(classification_report(y_true, y_preds))

  num_features = torch.tensor(self.numerical_features.iloc[idx], dtype=torch.float)


Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.63      0.66       866
           1       0.96      0.97      0.96      7682

    accuracy                           0.93      8548
   macro avg       0.82      0.80      0.81      8548
weighted avg       0.93      0.93      0.93      8548



We will now use a LightGBM classifier model to try to model the same relationship.

The review data is transformed for it to work with our classifier model. The transformation involves converting raw review text into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency). This step allows text data to be represented numerically for machine learning models.

In [77]:
numerical_cols = ['review_length', 'sentiment_score']

In [796]:
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [798]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

In [800]:
X_train, X_test, y_train, y_test = train_test_split(
    df[['review', 'review_length', 'sentiment_score']],
    df['voted_up'],
    test_size=0.2
)

In [802]:
scaler = MinMaxScaler()
X_train_numerical = scaler.fit_transform(X_train[numerical_cols])
X_test_numerical = scaler.transform(X_test[numerical_cols])

# Fit TF-IDF on training data and transform both train and test reviews
X_train_tfidf = vectorizer.fit_transform(X_train['review'])
X_test_tfidf = vectorizer.transform(X_test['review'])

# Combine TF-IDF and scaled numerical features
import scipy
X_train_combined = scipy.sparse.hstack((X_train_tfidf, X_train_numerical))
X_test_combined = scipy.sparse.hstack((X_test_tfidf, X_test_numerical))

In [808]:
# we display the transformed data ->
tfidf_dense = pd.DataFrame(
    X_train_tfidf[:5].toarray(), 
    columns=vectorizer.get_feature_names_out() 
)
print("TF-IDF Transformed Data (First 5 Rows):")
print(tfidf_dense.head())

TF-IDF Transformed Data (First 5 Rows):
   010   10  100  1000  10010  100ing      1010   11  110  1110  ...   на  \
0  0.0  0.0  0.0   0.0    0.0     0.0  0.000000  0.0  0.0   0.0  ...  0.0   
1  0.0  0.0  0.0   0.0    0.0     0.0  0.000000  0.0  0.0   0.0  ...  0.0   
2  0.0  0.0  0.0   0.0    0.0     0.0  0.000000  0.0  0.0   0.0  ...  0.0   
3  0.0  0.0  0.0   0.0    0.0     0.0  0.000000  0.0  0.0   0.0  ...  0.0   
4  0.0  0.0  0.0   0.0    0.0     0.0  0.530583  0.0  0.0   0.0  ...  0.0   

    не   но  послушай   то   ты  что  ミxノ  ヽ_ヽ___   二つ  
0  0.0  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
1  0.0  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
2  0.0  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
3  0.0  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  
4  0.0  0.0       0.0  0.0  0.0  0.0  0.0     0.0  0.0  

[5 rows x 5000 columns]


In [87]:
model = LGBMClassifier(class_weight='balanced')
model.fit(X_train_combined, y_train)

y_pred = model.predict(X_test_combined)

print("Classification Report:")
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 30751, number of negative: 3438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066701 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106902
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3048
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Classification Report:
              precision    recall  f1-score   support

           0       0.39      0.77      0.52       826
           1       0.97      0.87      0.92      7722

    accuracy                           0.86      8548
   macro avg       0.68      0.82      0.72      8548
weighted avg       0.92      0.86      0.88      8548





After training the model, we adjusted the decision threshold (default is 0.5) to optimize performance for different class priorities. We tested thresholds ranging from 0.1 to 0.6.

In [89]:
import numpy as np

y_pred_proba = model.predict_proba(X_test_combined)[:, 1]

for threshold in np.arange(0.1, 0.6, 0.1):
    y_pred = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred))


Threshold: 0.1
              precision    recall  f1-score   support

           0       0.80      0.25      0.38       826
           1       0.93      0.99      0.96      7722

    accuracy                           0.92      8548
   macro avg       0.86      0.62      0.67      8548
weighted avg       0.91      0.92      0.90      8548

Threshold: 0.2
              precision    recall  f1-score   support

           0       0.69      0.50      0.58       826
           1       0.95      0.98      0.96      7722

    accuracy                           0.93      8548
   macro avg       0.82      0.74      0.77      8548
weighted avg       0.92      0.93      0.92      8548

Threshold: 0.30000000000000004
              precision    recall  f1-score   support

           0       0.55      0.61      0.58       826
           1       0.96      0.95      0.95      7722

    accuracy                           0.91      8548
   macro avg       0.75      0.78      0.77      8548
weighted avg 



To improve the model’s performance, we conducted a Grid Search over a small parameter space to find the best combination of hyperparameters.

In [91]:
param_grid = {
    'learning_rate': [0.01, 0.1],         
    'n_estimators': [100, 200],           
    'max_depth': [3, 5],                  
    'subsample': [0.8, 1.0],              
    'colsample_bytree': [0.8, 1.0]        
}

In [93]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

model = LGBMClassifier(class_weight='balanced')

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1',  # Optimize for F1-score
    cv=3,          # 3-fold cross-validation
    verbose=2,
    n_jobs=-1      # Use all available cores
)

# Fit grid search
grid_search.fit(X_train_combined, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)
print("Best F1-Score:", grid_search.best_score_)


Fitting 3 folds for each of 32 candidates, totalling 96 fits
[LightGBM] [Info] Number of positive: 30751, number of negative: 3438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074901 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106902
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3048
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
Best F1-Score: 0.9089000849347499


In [95]:
# Get best parameters
best_params = grid_search.best_params_ 

# Retrain model with best parameters
tuned_model = LGBMClassifier(**best_params, class_weight='balanced', random_state=42)
tuned_model.fit(X_train_combined, y_train)

# Evaluate
y_pred = tuned_model.predict(X_test_combined)
print("Classification Report:")
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 30751, number of negative: 3438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.077570 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106902
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3048
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.76      0.49       826
           1       0.97      0.86      0.91      7722

    accuracy                           0.85      8548
   macro avg       0.67      0.81      0.70      8548
weighted avg       0.91      0.85      0.87      8548





We try to use an ensemble model with LightGBM, Logistic Regression and Naive Bayes for modelling the same relationship. 

In [101]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier

In [103]:
# we define the base models of LightGBM, logistic regression and naive bayes
base_models = [
    ('lightgbm', LGBMClassifier(class_weight='balanced', random_state=42)),
    ('logreg', LogisticRegression(max_iter=1000)),
    ('nb', MultinomialNB())
]

In [105]:
# Logistic Regression would be the meta model
meta_model = LogisticRegression()

In [107]:
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=3 
)

In [109]:
# fit the ensemble model
stacked_model.fit(X_train_combined, y_train)

[LightGBM] [Info] Number of positive: 30751, number of negative: 3438
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076698 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106902
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 3048
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 20500, number of negative: 2292
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036832 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71096
[LightGBM] [Info] Number of data points in the train set: 22792, number of used features: 2273
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM

In [111]:
y_pred = stacked_model.predict(X_test_combined)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.51      0.60       826
           1       0.95      0.98      0.96      7722

    accuracy                           0.93      8548
   macro avg       0.84      0.74      0.78      8548
weighted avg       0.93      0.93      0.93      8548





Next we make an ensemble model by bagging the LightGBM Classifier and try to model the same relationship

In [812]:
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the base LightGBM estimator
base_estimator = LGBMClassifier(class_weight='balanced', random_state=42)

# Define the BaggingClassifier
bagging_model = BaggingClassifier(
    estimator=base_estimator,
    random_state=42
)

param_grid = {
    'n_estimators': [5, 10],          
    'max_samples': [0.6, 0.8],       
    'max_features': [0.6, 0.8]
}

grid_search = GridSearchCV(
    estimator=bagging_model,
    param_grid=param_grid,
    scoring='f1_weighted', 
    cv=3,                  
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_combined, y_train)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_combined)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best Parameters: {'max_features': 0.8, 'max_samples': 0.6, 'n_estimators': 5}
Best F1-Score: 0.8950582797469929
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.45      0.71      0.55       826
           1       0.97      0.91      0.94      7722

    accuracy                           0.89      8548
   macro avg       0.71      0.81      0.74      8548
weighted avg       0.92      0.89      0.90      8548



We will try to predict if a review mentions difficulty using review meta data like length and sentiment score. We will try to model review_length and sentiment_score to mentions_difficulty.

In [303]:
df.columns

Index(['game_name', 'review', 'voted_up', 'timestamp_created',
       'author_num_games_owned', 'author_num_reviews',
       'author_playtime_at_review', 'author_playtime_last_two_weeks',
       'author_playtime_forever', 'review_length', 'difficulty_word_count',
       'mentions_difficulty', 'roguelike', 'co_op', 'base_building',
       'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
       'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
       'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
       'open_world', 'strategy', 'survival', 'adventure', 'crafting',
       'third_person', 'turn_based', '2d', 'experience_level_experienced',
       'experience_level_intermediate', 'sentiment_score'],
      dtype='object')

In [814]:
# we take review_length and sentiment_score as our inputs and mentions_difficulty as our output
features = [
    'review_length', 
    'sentiment_score',   
]

In [816]:
X = df[features]
y = df['mentions_difficulty']

In [824]:
X.head()

Unnamed: 0,review_length,sentiment_score
0,100,0.6369
1,4,0.0
2,55,0.4215
3,15,0.0
4,43,-0.6597


In [818]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [820]:
# both are numerical columns so we scale them
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [838]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)

import pandas as pd
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)


In [840]:
X_train_scaled

Unnamed: 0,review_length,sentiment_score
0,0.033847,0.986899
1,0.000398,0.818441
2,0.002256,0.525776
3,0.004380,0.261613
4,0.000265,0.710736
...,...,...
29910,0.003186,0.840392
29911,0.022564,0.954048
29912,0.003584,0.920096
29913,0.002787,0.251313


In [830]:
y.value_counts()

mentions_difficulty
0    36788
1     5949
Name: count, dtype: int64

To combat unbalanced classes, we do sampling. We undersample the majority class and oversample the minority class.

In [832]:
undersampler = RandomUnderSampler(sampling_strategy=0.2)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train_scaled, y_train)
y_train_resampled.value_counts()

mentions_difficulty
0    20910
1     4182
Name: count, dtype: int64

In [834]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_resampled, y_train_resampled)
y_train_resampled.value_counts()

mentions_difficulty
0    20910
1    20910
Name: count, dtype: int64

In [541]:
# train a LightGBM model 
model = LGBMClassifier(class_weight='balanced')
model.fit(X_train_resampled, y_train_resampled)

In [543]:
y_pred = model.predict(X_test_scaled)

print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.77      0.85     11028
           1       0.34      0.73      0.46      1794

    accuracy                           0.76     12822
   macro avg       0.64      0.75      0.66     12822
weighted avg       0.86      0.76      0.79     12822



In [842]:
# run the same model but with some hyperparameter tuning
model = LGBMClassifier(class_weight='balanced', random_state=42)

param_grid = {
    'n_estimators': [50, 100],         
    'max_depth': [3, 5],             
    'learning_rate': [0.05, 0.1],     
    'colsample_bytree': [0.8, 1.0],   
}


grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
best_model.fit(X_train_resampled, y_train_resampled)

y_pred = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
Best F1-Score: 0.778229432215042
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.95      0.76      0.85     11060
           1       0.33      0.75      0.46      1762

    accuracy                           0.76     12822
   macro avg       0.64      0.76      0.65     12822
weighted avg       0.87      0.76      0.79     12822



We will now use an XGBoost Classifier model to try to model the same relationship as the last one.

In [844]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the base XGBoost model
model = XGBClassifier(
    scale_pos_weight=2,        # Adjust for class imbalance
    use_label_encoder=False,  
    eval_metric='logloss'    
)

param_grid = {
    'n_estimators': [100, 200],      
    'learning_rate': [0.05, 0.1],   
    'max_depth': [4, 6],            
    'subsample': [0.8, 1.0],        
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',    
    cv=3,                     
    verbose=2,               
    n_jobs=-1                 
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 32 candidates, totalling 96 fits


Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Best F1-Score: 0.7659379446556777
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.97      0.63      0.76     11060
           1       0.27      0.86      0.41      1762

    accuracy                           0.66     12822
   macro avg       0.62      0.75      0.59     12822
weighted avg       0.87      0.66      0.71     12822



We will now use an ensemble model of XGBoost, Logistic Regression and Naïve Bayes to model the same relationship

In [846]:
xgb = XGBClassifier(
    scale_pos_weight=2,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
logreg = LogisticRegression(max_iter=1000, random_state=42)
nb = GaussianNB()

stacking_model = StackingClassifier(
    estimators=[('xgb', xgb), ('logreg', logreg), ('nb', nb)],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=3 
)

stacking_model.fit(X_train_resampled, y_train_resampled)

y_pred = stacking_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.76      0.84     11060
           1       0.33      0.73      0.45      1762

    accuracy                           0.76     12822
   macro avg       0.64      0.75      0.65     12822
weighted avg       0.86      0.76      0.79     12822



In [549]:
param_grid = {
    'xgb__n_estimators': [100, 150],     
    'xgb__max_depth': [4, 6],            
    'xgb__learning_rate': [0.05, 0.1],   
    'final_estimator__C': [0.1, 1, 10]
}

In [551]:
grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='f1',  
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))

Fitting 3 folds for each of 24 candidates, totalling 72 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Best Parameters: {'final_estimator__C': 10, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__n_estimators': 150}
Best Score: 0.7691706834535003
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.95      0.75      0.83     11028
           1       0.32      0.74      0.45      1794

    accuracy                           0.75     12822
   macro avg       0.63      0.74      0.64     12822
weighted avg       0.86      0.75      0.78     12822



Thresholds between 0.3 and 0.9 were tested to improve performance metrics for class 1.

In [555]:
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Adjust thresholds and evaluate
for threshold in np.arange(0.3, 0.9, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))

Threshold: 0.3
              precision    recall  f1-score   support

           0       0.96      0.59      0.73     11028
           1       0.25      0.86      0.39      1794

    accuracy                           0.63     12822
   macro avg       0.61      0.72      0.56     12822
weighted avg       0.86      0.63      0.68     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.95      0.68      0.79     11028
           1       0.29      0.79      0.42      1794

    accuracy                           0.70     12822
   macro avg       0.62      0.73      0.61     12822
weighted avg       0.86      0.70      0.74     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.95      0.75      0.83     11028
           1       0.32      0.74      0.45      1794

    accuracy                           0.75     12822
   macro avg       0.63      0.74      0.64     12822
weighted avg       0.86      

We will now use an ensemble model of LightGBM, Logistic Regression and Naïve Bayes to model the same relationship

In [557]:
lgbm = LGBMClassifier(
    class_weight='balanced',  # Automatically handles class imbalance
    n_estimators=100,         
    max_depth=6,              
    learning_rate=0.1,        
    subsample=0.8,            
    colsample_bytree=0.8,     
    random_state=42
)

logreg = LogisticRegression(max_iter=1000, random_state=42)
nb = GaussianNB()

stacking_model = StackingClassifier(
    estimators=[('lgbm', lgbm), ('logreg', logreg), ('nb', nb)],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=3 
)

stacking_model.fit(X_train_resampled, y_train_resampled)

y_pred = stacking_model.predict(X_test_scaled)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.74      0.83     11028
           1       0.32      0.75      0.45      1794

    accuracy                           0.74     12822
   macro avg       0.64      0.75      0.64     12822
weighted avg       0.86      0.74      0.78     12822



In [564]:
param_grid = {
    'lgbm__n_estimators': [100, 150],       
    'lgbm__max_depth': [4, 6],             
    'lgbm__learning_rate': [0.05, 0.1],    
    'final_estimator__C': [0.1, 1, 10]
}

grid_search = GridSearchCV(
    estimator=stacking_model,
    param_grid=param_grid,
    scoring='f1', 
    cv=3,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train_resampled, y_train_resampled)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test_scaled)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred_best))

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best Parameters: {'final_estimator__C': 10, 'lgbm__learning_rate': 0.1, 'lgbm__max_depth': 6, 'lgbm__n_estimators': 150}
Best Score: 0.8111193862210401
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.95      0.75      0.84     11028
           1       0.33      0.73      0.45      1794

    accuracy                           0.75     12822
   macro avg       0.64      0.74      0.65     12822
weighted avg       0.86      0.75      0.78     12822



In [566]:
# Predict probabilities for threshold tuning
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

for threshold in np.arange(0.3, 0.9, 0.1):
    y_pred_adjusted = (y_pred_proba >= threshold).astype(int)
    print(f"Threshold: {threshold}")
    print(classification_report(y_test, y_pred_adjusted))

Threshold: 0.3
              precision    recall  f1-score   support

           0       0.96      0.66      0.78     11028
           1       0.28      0.84      0.42      1794

    accuracy                           0.68     12822
   macro avg       0.62      0.75      0.60     12822
weighted avg       0.87      0.68      0.73     12822

Threshold: 0.4
              precision    recall  f1-score   support

           0       0.95      0.71      0.81     11028
           1       0.30      0.78      0.44      1794

    accuracy                           0.72     12822
   macro avg       0.63      0.75      0.63     12822
weighted avg       0.86      0.72      0.76     12822

Threshold: 0.5
              precision    recall  f1-score   support

           0       0.95      0.75      0.84     11028
           1       0.33      0.73      0.45      1794

    accuracy                           0.75     12822
   macro avg       0.64      0.74      0.65     12822
weighted avg       0.86      

In [570]:
from scipy.stats import pointbiserialr

corr_length, _ = pointbiserialr(df['review_length'], df['mentions_difficulty'])
corr_sentiment, _ = pointbiserialr(df['sentiment_score'], df['mentions_difficulty'])

print(f"Correlation (review_length, mentions_difficulty): {corr_length:.4f}")
print(f"Correlation (sentiment_score, mentions_difficulty): {corr_sentiment:.4f}")

Correlation (review_length, mentions_difficulty): 0.3980
Correlation (sentiment_score, mentions_difficulty): 0.1132


Next, we would try to model the relationship between the genres of the game in the review and if it mentions difficulty or not.

In [50]:
df.columns

Index(['game_name', 'review', 'voted_up', 'timestamp_created',
       'author_num_games_owned', 'author_num_reviews',
       'author_playtime_at_review', 'author_playtime_last_two_weeks',
       'author_playtime_forever', 'review_length', 'difficulty_word_count',
       'mentions_difficulty', 'roguelike', 'co_op', 'base_building',
       'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
       'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
       'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
       'open_world', 'strategy', 'survival', 'adventure', 'crafting',
       'third_person', 'turn_based', '2d', 'experience_level_experienced',
       'experience_level_intermediate'],
      dtype='object')

In [52]:
# we define the list of genre columns
genre_columns=['roguelike', 'co_op', 'base_building',
       'soulslike', 'deckbuilding', 'puzzle', 'metroidvania', 'rpg',
       'competitive', 'first_person', 'crpg', 'multiplayer', 'action',
       'sandbox', 'fantasy', 'simulation', 'platformer', 'shooter',
       'open_world', 'strategy', 'survival', 'adventure', 'crafting',
       'third_person', 'turn_based', '2d']

In [84]:
# take all the genre columns as input and mentions_difficulty as output
X=df[genre_columns]
y=df["mentions_difficulty"]
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [86]:
# balance the classes by oversampling
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [88]:
# build a decision tree classifier model
model = DecisionTreeClassifier(
    class_weight='balanced'
)

model.fit(X_train_resampled, y_train_resampled)

In [90]:
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.56      0.69     11030
           1       0.19      0.65      0.30      1792

    accuracy                           0.57     12822
   macro avg       0.55      0.60      0.49     12822
weighted avg       0.81      0.57      0.64     12822



In [92]:
param_grid = {
    'criterion': ['gini', 'entropy'],    
    'max_depth': [5, 10, 15, None],         
    'min_samples_split': [2, 5, 10],        
    'min_samples_leaf': [1, 5],         
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best F1-Score: 0.605562997308152
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.56      0.69     11030
           1       0.19      0.65      0.30      1792

    accuracy                           0.57     12822
   macro avg       0.55      0.60      0.49     12822
weighted avg       0.81      0.57      0.64     12822



In [104]:
# then we move to a random forest classifier to try to improve this
model = RandomForestClassifier(
    class_weight='balanced'
)

model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.56      0.69     11030
           1       0.19      0.65      0.30      1792

    accuracy                           0.57     12822
   macro avg       0.55      0.60      0.49     12822
weighted avg       0.81      0.57      0.64     12822



In [108]:
# we try to tune the hyperparameters again
param_grid = {
    'n_estimators': [100, 200],          
    'max_depth': [5, 10, None],         
    'min_samples_split': [2, 5, 10],     
    'min_samples_leaf': [1, 5],         
    'max_features': ['sqrt', 'log2']     
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best F1-Score: 0.6070320394187442
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.56      0.69     11030
           1       0.19      0.65      0.30      1792

    accuracy                           0.57     12822
   macro avg       0.55      0.60      0.49     12822
weighted avg       0.81      0.57      0.64     12822



In [110]:
# we now use a logistic regression model for the same relationship
model = LogisticRegression(
    max_iter=1000
)

model.fit(X_train_resampled, y_train_resampled)

y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.56      0.69     11030
           1       0.19      0.65      0.30      1792

    accuracy                           0.57     12822
   macro avg       0.55      0.60      0.49     12822
weighted avg       0.81      0.57      0.64     12822



In [112]:
# try to tune this model
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  
    'penalty': ['l1', 'l2'],       
    'solver': ['liblinear', 'saga'],  
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='f1_weighted',  
    cv=3,                   
    verbose=2,
    n_jobs=-1               
)

grid_search.fit(X_train_resampled, y_train_resampled)

best_params = grid_search.best_params_
print("Best Parameters:", best_params)
print("Best F1-Score:", grid_search.best_score_)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)
print("Classification Report (Best Model):")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best F1-Score: 0.6070320394187442
Classification Report (Best Model):
              precision    recall  f1-score   support

           0       0.91      0.56      0.69     11030
           1       0.19      0.65      0.30      1792

    accuracy                           0.57     12822
   macro avg       0.55      0.60      0.49     12822
weighted avg       0.81      0.57      0.64     12822



Since it produces nearly identical results with all three models, even after tuning, we can conclude that there isn't strong enough evidence in the data to model the mentioned relationship