### Libraries

In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
import lightgbm as lgb
from sklearn.svm import SVC
from scipy.stats import uniform, loguniform, randint

### Import data

In [2]:
stock_pumper_data = pd.read_csv("../data/combined_stock_pumper_dataset.csv")

# in case NA
stock_pumper_data = stock_pumper_data.dropna(subset=['Known_Pumper'])

In [None]:
tfidf_features = stock_pumper_data.columns[stock_pumper_data.columns.str.islower()]  
other_features = ['Sentiment', 'num_mentioned_handles_lag_1', 'Close_lag_1', 'Volume']

columns_to_exclude = ['user.screen_name', 'id_str',"mentioned_handle_frequencies",'mentioned_handles', 'text', "mentioned_saamon2500_count",
"mentioned_yocchan60_count", "degree_pumper_interaction"]

#for col in stock_pumper_data.columns:
#    if "pumper" in col:
#        columns_to_exclude.append(col)


# Create a new DataFrame 'X' containing only the columns you want to use as features
# We use the .drop() method to remove the specified columns

# Fill NaNs with 0 or another imputation
X = stock_pumper_data[list(tfidf_features) + other_features].fillna(0)
X = X.drop(columns=columns_to_exclude, axis=1)
y = stock_pumper_data['Known_Pumper']



#### Dataset creation

In [4]:
X.head()

Unnamed: 0,time_since_inflection,1hr_tweet_count,1day_tweet_count,hour,day_of_week,month,year,degree_centrality,betweenness_centrality,eigenvector_centrality,...,vmhg,volume,wdlf,week,xrp,youre,Sentiment,num_mentioned_handles_lag_1,Close_lag_1,Volume
0,0.0,2,2,17,3,3,2021,0.000925,5.710304e-07,8.050834e-21,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,3787332.0
1,0.0,2,2,17,3,3,2021,0.000925,5.710304e-07,8.050834e-21,...,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,0.011,3787332.0
2,0.0,1,1,18,3,3,2021,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,5.0,0.011,3787332.0
3,0.0,1,1,20,3,3,2021,0.002159,0.004172865,0.0007185859,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.011,3787332.0
4,0.0,1,1,20,3,3,2021,0.002468,0.008832524,0.00435422,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.011,3787332.0


#### Train test split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=69, stratify=y)

# Remove duplicate columns (keeping only the first occurrence)
X_train = X_train.loc[:, ~X_train.columns.duplicated()]
X_test = X_test.loc[:, ~X_test.columns.duplicated()]


# Random Forest

### Model training

In [6]:
model_RF = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=69)
model_RF.fit(X_train, y_train)

### Model prediction

##### On training

In [7]:
y_pred = model_RF.predict(X_train)

print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7772
           1       1.00      1.00      1.00       288

    accuracy                           1.00      8060
   macro avg       1.00      1.00      1.00      8060
weighted avg       1.00      1.00      1.00      8060

[[7771    1]
 [   0  288]]


##### On test

In [8]:
y_pred = model_RF.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1944
           1       0.95      0.56      0.70        72

    accuracy                           0.98      2016
   macro avg       0.97      0.78      0.85      2016
weighted avg       0.98      0.98      0.98      2016

[[1942    2]
 [  32   40]]


# Logistic regression

### Model training

In [9]:
model_LR = LogisticRegression(class_weight='balanced', max_iter=1000)

### Model prediction

#### Train

In [10]:
model_LR.fit(X_train,y_train)

y_pred = model_LR.predict(X_train)

print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.39      0.56      7772
           1       0.05      0.78      0.09       288

    accuracy                           0.41      8060
   macro avg       0.51      0.59      0.32      8060
weighted avg       0.95      0.41      0.54      8060

[[3055 4717]
 [  63  225]]


#### Test

In [11]:
y_pred = model_LR.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.38      0.55      1944
           1       0.04      0.71      0.08        72

    accuracy                           0.39      2016
   macro avg       0.51      0.54      0.31      2016
weighted avg       0.94      0.39      0.53      2016

[[ 741 1203]
 [  21   51]]


As we can see from the baseline logistics regression, the results are really bad, with precision being terrible. This could be due to a variety of issues, which come down to lack of normalization in the dataset, as well as class inbalance

### Logistics regression (With scaling)

Scale only the non TF-IDF variables as it does not make sense to normalize the TF-IDF variables

In [15]:
numeric_features_to_scale = [
    'time_since_inflection',
    '1hr_tweet_count',
    '1day_tweet_count',
    'hour',
    'day_of_week',
    'month',
    'year',
    'degree_centrality',
    'betweenness_centrality',
    'eigenvector_centrality',
    'num_mentioned_handles',
    'sentiment_x_num_mentioned',
    'num_mentioned_handles_lag_1',
    'num_mentioned_handles_lag_2',
    'Sentiment',
    'Close_lag_1',
    'Volume'
]

# All other columns = TF-IDF or token features
all_features = X_train.columns.tolist()
tfidf_and_other_features = [f for f in all_features if f not in numeric_features_to_scale]

# Step 2: Column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features_to_scale),
        ('tfidf', 'passthrough', tfidf_and_other_features)
    ]
)

# Step 3: Pipeline
model_LR_scaled = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000))
])

# Step 4: Fit and evaluate
model_LR_scaled.fit(X_train, y_train)

#### Model Prediction

##### Train

In [16]:
y_pred = model_LR_scaled.predict(X_train)

# Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.86      0.93      7772
           1       0.20      0.92      0.33       288

    accuracy                           0.87      8060
   macro avg       0.60      0.89      0.63      8060
weighted avg       0.97      0.87      0.90      8060

[[6708 1064]
 [  23  265]]


##### Test

In [17]:
y_pred = model_LR_scaled.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.84      0.91      1944
           1       0.17      0.85      0.28        72

    accuracy                           0.84      2016
   macro avg       0.58      0.85      0.60      2016
weighted avg       0.96      0.84      0.89      2016

[[1640  304]
 [  11   61]]


As we can see, the results are slightly better with the precision improving several fold, but this is not enough as the precision is still very low. What remains is trying dimensionality reduction and regularization

### Logistics regression (With normalization)

#### L1 normalization

In [18]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features_to_scale),
        ('tfidf', 'passthrough', tfidf_and_other_features)
    ]
)

# L1-regularized Logistic Regression pipeline
pipeline_L1 = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(
        penalty='l1',
        solver='saga',
        class_weight='balanced',
        max_iter=1000
    ))
])

# Fit model on scaled data
pipeline_L1.fit(X_train, y_train)



##### Model prediction

###### Train

In [19]:
y_pred = pipeline_L1.predict(X_train)

# Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.82      0.90      7772
           1       0.17      0.97      0.28       288

    accuracy                           0.82      8060
   macro avg       0.58      0.89      0.59      8060
weighted avg       0.97      0.82      0.88      8060

[[6369 1403]
 [   9  279]]


###### Test

In [20]:
y_pred = pipeline_L1.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.79      0.88      1944
           1       0.14      0.92      0.24        72

    accuracy                           0.80      2016
   macro avg       0.57      0.85      0.56      2016
weighted avg       0.97      0.80      0.86      2016

[[1539  405]
 [   6   66]]


#### L2 normalization

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features_to_scale),
        ('tfidf', 'passthrough', tfidf_and_other_features)
    ]
)

# L2-regularized Logistic Regression pipeline
pipeline_L2 = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(
        penalty='l2',
        solver='saga',
        class_weight='balanced',
        max_iter=1000
    ))
])

# Fit model on scaled data
pipeline_L2.fit(X_train, y_train)



##### Model predictions

###### Train

In [22]:
y_pred = pipeline_L2.predict(X_train)

# Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.87      0.93      7772
           1       0.20      0.89      0.33       288

    accuracy                           0.87      8060
   macro avg       0.60      0.88      0.63      8060
weighted avg       0.97      0.87      0.91      8060

[[6762 1010]
 [  33  255]]


###### Test

In [23]:
y_pred = pipeline_L2.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.85      0.91      1944
           1       0.17      0.85      0.28        72

    accuracy                           0.85      2016
   macro avg       0.58      0.85      0.60      2016
weighted avg       0.96      0.85      0.89      2016

[[1648  296]
 [  11   61]]


Regularization has helped to improve the results for the recall, and the f1-score has gone up as well, but it is still not sufficient as a model to predict this stock data, perhaps the domination of the class inbalance is affecting this too heavily. In fact, model wrongly classifies NON-pumpers as pumpers which is a critical mistake to make

# XGBoost

In [39]:
# Rough estimate
inbalance_ratio = (y_train == 0).sum() / (y_train == 1).sum()

xgb_model = XGBClassifier(
    scale_pos_weight= inbalance_ratio,        # imbalance handling
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=69
)

# Step 2: Fit the model
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


### Model predictions

#### Train

In [40]:
y_pred = xgb_model.predict(X_train)

# Step 4: Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7772
           1       0.99      1.00      0.99       288

    accuracy                           1.00      8060
   macro avg       0.99      1.00      1.00      8060
weighted avg       1.00      1.00      1.00      8060

[[7769    3]
 [   0  288]]


#### Test

In [41]:
y_pred = xgb_model.predict(X_test)

# Step 4: Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1944
           1       0.92      0.82      0.87        72

    accuracy                           0.99      2016
   macro avg       0.96      0.91      0.93      2016
weighted avg       0.99      0.99      0.99      2016

[[1939    5]
 [  13   59]]


### XGBoost with scaling

In [42]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features_to_scale),
    ('tfidf', 'passthrough', tfidf_and_other_features)
])

xgb_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('xgb', XGBClassifier(scale_pos_weight=inbalance_ratio, use_label_encoder=False, eval_metric='logloss'))
])

xgb_pipeline.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


#### Model predictions

##### Train

In [43]:
y_pred = xgb_pipeline.predict(X_train)

# Step 4: Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7772
           1       0.98      1.00      0.99       288

    accuracy                           1.00      8060
   macro avg       0.99      1.00      0.99      8060
weighted avg       1.00      1.00      1.00      8060

[[7766    6]
 [   0  288]]


##### Test

In [44]:
y_pred = xgb_pipeline.predict(X_test)

# Step 4: Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1944
           1       0.97      0.83      0.90        72

    accuracy                           0.99      2016
   macro avg       0.98      0.92      0.95      2016
weighted avg       0.99      0.99      0.99      2016

[[1942    2]
 [  12   60]]


### XGBoost with scaling and parameter selection

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features_to_scale),
        ('tfidf', 'passthrough', tfidf_and_other_features)
    ]
)

# Step 3: Define class imbalance ratio
inbalance_ratio = (y_train == 0).sum() / (y_train == 1).sum()

# Step 4: Set up base XGBoost model
xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=69
)

# Step 5: Wrap it all in a pipeline
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('xgb', xgb)
])

# Step 6: Define hyperparameter grid
param_grid = {
    'xgb__max_depth': [3, 5, 7],
    'xgb__learning_rate': [0.05, 0.1, 0.2],
    'xgb__n_estimators': [100, 200],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0],
    'xgb__scale_pos_weight': [inbalance_ratio]  # use imbalance ratio
}

# Step 7: GridSearchCV
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',  # good for imbalanced binary
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Step 8: Fit
grid_search.fit(X_train, y_train)

# Step 9: Best model and evaluation
best_model = grid_search.best_estimator_
print("Best Params:", grid_search.best_params_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best Params: {'xgb__colsample_bytree': 0.8, 'xgb__learning_rate': 0.2, 'xgb__max_depth': 7, 'xgb__n_estimators': 200, 'xgb__scale_pos_weight': 26.98611111111111, 'xgb__subsample': 1.0}


#### Model predictions

##### Train

In [37]:
y_pred = best_model.predict(X_train)

# Step 4: Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7772
           1       1.00      1.00      1.00       288

    accuracy                           1.00      8060
   macro avg       1.00      1.00      1.00      8060
weighted avg       1.00      1.00      1.00      8060

[[7771    1]
 [   0  288]]


##### Test

In [38]:
y_pred = best_model.predict(X_test)

# Step 4: Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1944
           1       0.95      0.83      0.89        72

    accuracy                           0.99      2016
   macro avg       0.97      0.92      0.94      2016
weighted avg       0.99      0.99      0.99      2016

[[1941    3]
 [  12   60]]


### XGBoost with scaling AND SMOTE

In [46]:
from imblearn.pipeline import Pipeline

In [None]:

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features_to_scale),
    ('tfidf', 'passthrough', tfidf_and_other_features)
])

xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Use SMOTETomek for combined oversampling and undersampling
smote_tomek = SMOTETomek(random_state=42)

pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('sampling', smote_tomek),  # Changed 'smote' to 'sampling' and using SMOTETomek
    ('xgb', xgb)
])

param_grid = {
    'sampling__sampling_strategy': ['auto'], # SMOTETomek handles the strategy
    'xgb__max_depth': [3, 5],
    'xgb__learning_rate': [0.05, 0.1],
    'xgb__n_estimators': [100, 200],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0],
    'xgb__scale_pos_weight': [inbalance_ratio]
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

Fitting 3 folds for each of 32 candidates, totalling 96 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


#### Model prediction

##### Train

In [53]:
y_pred = best_model.predict(X_train)

# Step 4: Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      7772
           1       0.72      1.00      0.84       288

    accuracy                           0.99      8060
   macro avg       0.86      0.99      0.92      8060
weighted avg       0.99      0.99      0.99      8060

[[7662  110]
 [   0  288]]


##### Test

In [54]:
y_pred = best_model.predict(X_test)

# Step 4: Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1944
           1       0.49      0.86      0.62        72

    accuracy                           0.96      2016
   macro avg       0.74      0.91      0.80      2016
weighted avg       0.98      0.96      0.97      2016

[[1879   65]
 [  10   62]]


Applying SMOTE (Synthetic Minority Over-sampling Technique) can sometimes lead to a decrease in overall model performance metrics like accuracy, precision, or recall. This can occur for several reasons:

* **Introduction of Noise:** SMOTE generates synthetic minority class samples by interpolating between existing minority instances. If the original minority class data contains noise or outliers, SMOTE might amplify this noise by creating synthetic samples based on these problematic instances.

* **Overlapping Classes:** In cases where the decision boundary between the majority and minority classes is complex or there is significant overlap, SMOTE might create synthetic minority samples that fall within the majority class region. This can confuse the classifier and lead to misclassifications.

* **Overgeneralization on Limited Information:** While SMOTE increases the number of minority class samples, it doesn't add truly new information. If the original minority class is not well-defined or lacks sufficient variability, the synthetic samples might lead the model to overgeneralize patterns that aren't truly representative of unseen minority instances.

* **Distortion of the Original Data Distribution:** By artificially balancing the class distribution, SMOTE can sometimes distort the underlying relationships within the data, potentially making it harder for the model to learn the true decision boundaries.

Therefore, while SMOTE can be beneficial in many imbalanced learning scenarios, it's not always guaranteed to improve performance and should be evaluated carefully on a separate test set using appropriate metrics.

# LightGBM

In [58]:
from sklearn.pipeline import Pipeline

In [None]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features_to_scale),
    ('tfidf', 'passthrough', tfidf_and_other_features)
])

# LightGBM Classifier
lgbm = lgb.LGBMClassifier(
    random_state=69
)

pipeline = Pipeline([  # Using ImbPipeline for consistency
    ('preprocessing', preprocessor),
    ('lgbm', lgbm)  # LightGBM directly after preprocessing
])

param_grid_lgbm = {
    'lgbm__n_estimators': [100, 200, 300],
    'lgbm__learning_rate': [0.01, 0.05, 0.1],
    'lgbm__max_depth': [3, 5, 7],
    'lgbm__num_leaves': [20, 31, 40],
    'lgbm__subsample': [0.8, 1.0],
    'lgbm__colsample_bytree': [0.8, 1.0],
    'lgbm__objective': ['binary'],  # Assuming binary classification
    'lgbm__boosting_type': ['gbdt', 'dart'], # Try different boosting types
    'lgbm__scale_pos_weight': [inbalance_ratio] # If you still want to use it
}

param_grid_lgbm_reduced = {
    'lgbm__n_estimators': [100, 250],
    'lgbm__learning_rate': [0.01, 0.1],
    'lgbm__max_depth': [3, 7],
    'lgbm__num_leaves': [20, 40],
    'lgbm__subsample': [0.8, 1.0],
    'lgbm__colsample_bytree': [0.8, 1.0],
    'lgbm__objective': ['binary'],
    'lgbm__boosting_type': ['gbdt'],
    'lgbm__scale_pos_weight': [inbalance_ratio]
}

param_grid_lgbm_focused = {
    'lgbm__n_estimators': [150, 250, 350],
    'lgbm__learning_rate': [0.005, 0.01, 0.02],
    'lgbm__num_leaves': [25, 35, 45],
    'lgbm__max_depth': [5, 7],
    'lgbm__subsample': [0.9],
    'lgbm__colsample_bytree': [0.9],
    'lgbm__objective': ['binary'],
    'lgbm__boosting_type': ['gbdt'],
    'lgbm__scale_pos_weight': [inbalance_ratio]
}

grid_search_lgbm = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_lgbm_focused,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Step 5: Train LightGBM
grid_search_lgbm.fit(X_train, y_train)

# Get the best LightGBM model
best_model_lgbm = grid_search_lgbm.best_estimator_

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Number of positive: 192, number of negative: 5182
[LightGBM] [Info] Number of positive: 192, number of negative: 5182
[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010996 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4025
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011916 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-

## Grid Search

### Model Prediction

#### Train

In [66]:
y_pred = best_model_lgbm.predict(X_train)

# Step 4: Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7772
           1       0.90      1.00      0.95       288

    accuracy                           1.00      8060
   macro avg       0.95      1.00      0.97      8060
weighted avg       1.00      1.00      1.00      8060

[[7740   32]
 [   0  288]]


#### Test

In [67]:
y_pred = best_model_lgbm.predict(X_test)

# Step 4: Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1944
           1       0.84      0.86      0.85        72

    accuracy                           0.99      2016
   macro avg       0.92      0.93      0.92      2016
weighted avg       0.99      0.99      0.99      2016

[[1932   12]
 [  10   62]]


In [74]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

param_distributions_lgbm = {
    'lgbm__n_estimators': randint(100, 400),
    'lgbm__learning_rate': uniform(0.005, 0.1),
    'lgbm__max_depth': randint(3, 8),
    'lgbm__num_leaves': randint(20, 50),
    'lgbm__subsample': uniform(0.7, 1.0),
    'lgbm__colsample_bytree': uniform(0.7, 1.0),
    'lgbm__objective': ['binary'],
    'lgbm__boosting_type': ['gbdt', 'dart'],
    'lgbm__scale_pos_weight': [inbalance_ratio]
}

random_search_lgbm = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions_lgbm,
    n_iter=50,  # Number of random combinations to try
    scoring='f1',
    cv=3,
    verbose=0,
    n_jobs=-1,
    random_state=42
)

random_search_lgbm.fit(X_train, y_train)
best_model_lgbm_2 = random_search_lgbm.best_estimator_

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work

[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Number of positive: 192, number of negative: 5182
[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4038
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012926 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4025
[LightGBM] [Info] Number of data points in the train set: 5373, number of used features: 115
[LightGBM] [Info] Number of data points in the train set: 5374, number of used features: 116
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.01

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4025
[LightGBM] [Info] Number of data points in the train set: 5374, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035728 -> initscore=-3.295451
[LightGBM] [Info] Start training from score -3.295451


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work



[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .





[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work




[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work

[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Number of positive: 192, number of negative: 5182
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.150670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4038
[LightGBM] [Info] Number of data points in the train set: 5373, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035734 -> initscore=-3.295258
[LightGBM] [Info] Start training from score -3.295258
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.103563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4025
[LightGBM] [Info] Number of data points in the train set: 5374, number of used features: 116
[LightGBM] [Info] [binar

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work



[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work

[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Number of positive: 192, number of negative: 5182
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031864 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4038
[LightGBM] [Info] Number of data points in the train set: 5373, number of used features: 115

[LightGBM] [Info] Start training from score -3.295258


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .

[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4025
[LightGBM] [Info] Number of data points in the train set: 5374, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035728 -> initscore=-3.295451
[LightGBM] [Info] Start training from score -3.295451




[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3951
[LightGBM] [Info] Number of data points in the train set: 5373, number of used features: 117
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035734 -> initscore=-3.295258
[LightGBM] [Info] Start training from score -3.295258


[LightGBM] [Fatal] Check failed: (bagging_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 367 .






[LightGBM] [Info] Number of positive: 192, number of negative: 5181
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008838 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3951
[LightGBM] [Info] Number of data points in the train set: 5373, number of used features: 117
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035734 -> initscore=-3.295258
[LightGBM] [Info] Start training from score -3.295258



[LightGBM] [Fatal] Check failed: (feature_fraction) <= (1.0) at /Users/runner/work/LightGBM/LightGBM/lightgbm-python/src/io/config_auto.cpp, line 385 .





135 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estima

[LightGBM] [Info] Number of positive: 288, number of negative: 7772
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5189
[LightGBM] [Info] Number of data points in the train set: 8060, number of used features: 117
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035732 -> initscore=-3.295322
[LightGBM] [Info] Start training from score -3.295322


## RandomSearchCV

## Model prediction

### Train 

In [75]:
y_pred = best_model_lgbm_2.predict(X_train)

# Step 4: Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7772
           1       0.95      1.00      0.97       288

    accuracy                           1.00      8060
   macro avg       0.97      1.00      0.99      8060
weighted avg       1.00      1.00      1.00      8060

[[7756   16]
 [   0  288]]


### Test

In [76]:
y_pred = best_model_lgbm_2.predict(X_test)

# Step 4: Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1944
           1       0.92      0.83      0.88        72

    accuracy                           0.99      2016
   macro avg       0.96      0.92      0.94      2016
weighted avg       0.99      0.99      0.99      2016

[[1939    5]
 [  12   60]]


LightGBM often performs well on datasets like this due to several of its inherent characteristics:

* **Gradient-Based One-Side Sampling (GOSS):** LightGBM is designed to efficiently handle large datasets by focusing on the data points that contribute the most to gradient calculation. GOSS samples the majority of instances with small gradients and keeps all instances with large gradients. This reduces the number of data instances to consider for training without significantly compromising accuracy, leading to faster training times and potentially better generalization, especially if the dataset is large.

* **Exclusive Feature Bundling (EFB):** If your dataset has many sparse features, LightGBM can bundle mutually exclusive features together, reducing the dimensionality of the data and further accelerating training and reducing memory usage. This can be particularly beneficial for datasets derived from sparse representations like TF-IDF.

* **Leaf-Wise Tree Growth:** Unlike many other tree-based algorithms that grow trees level by level (depth-wise), LightGBM grows trees leaf-wise. This means it chooses to split the leaf with the largest loss, which can lead to faster convergence and better accuracy, especially for complex datasets, as it focuses on reducing the loss more effectively.

* **Efficient Handling of Categorical Features:** LightGBM has built-in support for handling categorical features directly (if specified), often outperforming one-hot encoding in terms of both speed and memory efficiency. If your dataset contains categorical features, LightGBM's ability to handle them natively can be a significant advantage.

* **Regularization Techniques:** LightGBM includes various regularization techniques (like L1 and L2 regularization, and controlling tree complexity through `max_depth` and `num_leaves`) that help prevent overfitting, leading to better performance on unseen data.

**Possible Improvements Given What We Have Already Done:**

Based on our experimentation with LightGBM using `RandomizedSearchCV`, here are potential avenues for further improvement:

1.  **Expand the Hyperparameter Search Space:** While `RandomizedSearchCV` is efficient, the initial `param_distributions_lgbm` might not have covered the absolute optimal ranges for all hyperparameters. We could:
    * **Widen the ranges:** For example, try a larger range for `n_estimators`, `num_leaves`, or `learning_rate`.
    * **Adjust the distributions:** Consider using different statistical distributions (e.g., normal distribution around promising values found in the initial search) if you have reason to believe certain ranges are more likely to yield better results.
    * **Add more hyperparameters to tune:** Explore tuning parameters like `min_child_samples`, `min_child_weight`, or different regularization strengths (`reg_alpha`, `reg_lambda`).

2.  **Increase the Number of Randomized Search Iterations (`n_iter`):** Increasing `n_iter` allows `RandomizedSearchCV` to sample and evaluate more hyperparameter combinations, increasing the chances of finding a better set of parameters.

3.  **Feature Engineering:** Explore creating new features from the existing ones or transforming them in different ways. For example:
    * Creating interaction terms between features.
    * Applying different scaling methods or transformations to numerical features.
    * Further analyzing and processing the TF-IDF features.

4.  **Address Class Imbalance More Explicitly (If Still Relevant):** Even if SMOTE wasn't used in the final LightGBM run, you could experiment with:
    * **Adjusting `scale_pos_weight` more aggressively:** Try different values for `inbalance_ratio` or even a range of values in the hyperparameter search.
    * **Using other techniques within LightGBM:** LightGBM has parameters like `is_unbalance` which can be set to `True` to automatically handle class imbalance.

5.  **Ensemble Methods:** Consider ensembling the best LightGBM model with other well-performing models you have trained (e.g., Random Forest, potentially a well-tuned SVM if it showed promise). Techniques like voting or stacking could potentially improve overall performance and robustness.

6.  **More Rigorous Cross-Validation:** While 3-fold CV is a good starting point, increasing the number of folds (e.g., to 5 or 10) can provide a more robust estimate of the model's generalization performance during hyperparameter tuning. However, this will also increase the runtime of the search.

7.  **Analyze Feature Importance:** LightGBM provides a way to assess the importance of different features. Analyzing these importances might give insights into which features are most predictive and whether any less important features could be removed or if new related features could be engineered.

By exploring these potential improvements, you can further refine your LightGBM model and potentially achieve even better performance on your dataset. Remember to evaluate any changes on a separate validation or test set to ensure that the improvements generalize to unseen data.

# SVM with RBF Kernel

## Model training

In [78]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features_to_scale),
    ('tfidf', 'passthrough', tfidf_and_other_features)
])

# SVM Classifier with RBF kernel
svm_rbf = SVC(kernel='rbf', probability=True, random_state=69) # probability=True for predict_proba if needed

pipeline_svm = Pipeline([
    ('preprocessing', preprocessor),
    ('svm', svm_rbf)
])

param_distributions_svm = {
    'svm__C': loguniform(1e-3, 1e3),  # Regularization parameter
    'svm__gamma': loguniform(1e-3, 1e1),  # Kernel coefficient
    'svm__class_weight': [None, 'balanced'] # Handle class imbalance if needed
}

random_search_svm = RandomizedSearchCV(
    estimator=pipeline_svm,
    param_distributions=param_distributions_svm,
    n_iter=50,  # You can adjust the number of iterations
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=69
)

# Step 5: Train SVM with RandomizedSearchCV
random_search_svm.fit(X_train, y_train)

# Get the best SVM model
best_model_svm_rbf = random_search_svm.best_estimator_

Fitting 3 folds for each of 50 candidates, totalling 150 fits


### Model prediction

#### Train

In [79]:
y_pred = best_model_svm_rbf.predict(X_train)

# Step 4: Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7772
           1       1.00      0.81      0.89       288

    accuracy                           0.99      8060
   macro avg       0.99      0.90      0.94      8060
weighted avg       0.99      0.99      0.99      8060

[[7771    1]
 [  55  233]]


#### Test

In [80]:
y_pred = best_model_svm_rbf.predict(X_test)

# Step 4: Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1944
           1       0.52      0.44      0.48        72

    accuracy                           0.97      2016
   macro avg       0.75      0.71      0.73      2016
weighted avg       0.96      0.97      0.96      2016

[[1915   29]
 [  40   32]]


While Support Vector Machines (SVM), particularly with the RBF kernel, are powerful models capable of capturing complex non-linear relationships, they might not perform well on this specific dataset for several potential reasons:

* **Sensitivity to Feature Scaling:** SVMs are highly sensitive to the scale of the input features. If the features in this dataset have widely varying ranges and haven't been properly scaled (e.g., using StandardScaler), the features with larger values might dominate the distance calculations, leading to suboptimal performance.

* **Curse of Dimensionality:** If this dataset has a high number of features compared to the number of samples, SVMs can struggle. The RBF kernel maps data into a high-dimensional (potentially infinite) space, and in high-dimensional sparse spaces, the concept of distance can become less meaningful, potentially hindering the effectiveness of the kernel.

* **Overfitting Tendency (with RBF):** The RBF kernel is very flexible and can easily overfit the training data, especially if the hyperparameters `C` and `gamma` are not carefully tuned. If the dataset contains noise or outliers, an overfit SVM might perform poorly on unseen data.

* **Computational Cost for Large Datasets:** SVMs with the RBF kernel can be computationally expensive to train, especially on large datasets. If the dataset is substantial, the training process might have been limited, preventing the model from fully learning the underlying patterns.

* **Nature of the Decision Boundary:** The optimal decision boundary for this specific problem might be better captured by other types of models (e.g., tree-based models like Random Forest or gradient boosting methods like XGBoost and LightGBM) that can handle feature interactions and non-linearities in different ways. For instance, if the decision boundary is highly axis-aligned or involves complex hierarchical rules, tree-based models might be more naturally suited.

* **Hyperparameter Optimization Challenges:** Finding the optimal hyperparameters (`C` and `gamma`) for RBF SVM can be challenging. If the hyperparameter search was not extensive enough or didn't explore the most relevant regions of the parameter space, the resulting SVM model might be suboptimal.

* **Class Imbalance Issues (If Present):** While techniques like setting `class_weight='balanced'` can help, SVMs can still be sensitive to class imbalance, especially if the minority class is very small and the decision boundary is complex.

It's important to note that the poor performance of SVM on this dataset doesn't inherently mean it's a bad algorithm, but rather that its characteristics might not align well with the specific properties and underlying structure of this particular data.

# GBM

## Model training

In [90]:
# Preprocessor for GBM
preprocessor_gbm = ColumnTransformer([
    ('num', StandardScaler(), [col for col in numeric_features_to_scale if col in X_train.columns]),
    ('tfidf', 'passthrough', [col for col in tfidf_and_other_features if col in X_train.columns])
], remainder='passthrough')

# Gradient Boosting Classifier
gbm = GradientBoostingClassifier(
    random_state=69,
    verbose=0, # Set to 1 or higher for more output during training
    loss='log_loss' # For binary classification
)

pipeline_gbm = Pipeline([
    ('preprocessing', preprocessor_gbm),
    ('gbm', gbm)
])

param_distributions_gbm = {
    'gbm__n_estimators': randint(100, 400),
    'gbm__learning_rate': uniform(0.01, 0.2),
    'gbm__max_depth': randint(3, 8),
    'gbm__min_samples_split': randint(2, 10),
    'gbm__min_samples_leaf': randint(1, 5),
    'gbm__subsample': uniform(0.01, 0.99) # Sample between 0.01 and 0.99 (exclusive of 1.0 at the very edge can sometimes be safer with floating point)
    # OR
    # 'gbm__subsample': uniform(0.7, 0.3) # To sample between 0.7 and 1.0 (0.7 + 0.3)
    # OR
    # 'gbm__subsample': uniform(0.1, 0.9) # To sample between 0.1 and 1.0 (0.1 + 0.9)
}

random_search_gbm = RandomizedSearchCV(
    estimator=pipeline_gbm,
    param_distributions=param_distributions_gbm,
    n_iter=50, # Adjust as needed
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=69,
    error_score='raise'
)

# Step 5: Train GBM with RandomizedSearchCV
random_search_gbm.fit(X_train, y_train)

# Get the best GBM model
best_model_gbm = random_search_gbm.best_estimator_

Fitting 3 folds for each of 50 candidates, totalling 150 fits


### Model prediction

#### Train

In [92]:
y_pred = best_model_gbm.predict(X_train)

# Step 4: Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7772
           1       1.00      1.00      1.00       288

    accuracy                           1.00      8060
   macro avg       1.00      1.00      1.00      8060
weighted avg       1.00      1.00      1.00      8060

[[7772    0]
 [   1  287]]


#### Test

In [91]:
y_pred = best_model_gbm.predict(X_test)

# Step 4: Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1944
           1       0.96      0.74      0.83        72

    accuracy                           0.99      2016
   macro avg       0.98      0.87      0.91      2016
weighted avg       0.99      0.99      0.99      2016

[[1942    2]
 [  19   53]]


Based on our discussion, XGBoost and LightGBM appear to be achieving better precision and recall than the basic Gradient Boosting Machine (GBM) from scikit-learn. Here's a concise explanation:

* **Algorithmic Optimizations:** Both XGBoost and LightGBM incorporate significant algorithmic and engineering optimizations compared to a standard GBM. These include more efficient tree building strategies (leaf-wise in LightGBM, level-wise with pruning in XGBoost), advanced regularization techniques (L1 and L2), and optimized handling of data and computations. These enhancements often lead to better generalization and thus improved precision and recall.

* **Effective Handling of Numerical and TF-IDF Features:** Your current feature set, after preprocessing, consists primarily of scaled numerical features and TF-IDF outputs. XGBoost and LightGBM are known for their strong performance with this type of data, effectively capturing complex relationships and feature importance.

* **Hyperparameter Tuning:** Through `RandomizedSearchCV`, you are likely exploring a wider and more effective range of hyperparameters for XGBoost and LightGBM, allowing you to find configurations that are better suited to your specific data and optimization goals (precision and recall).

* **Computational Efficiency:** While not directly impacting precision and recall, the efficiency of XGBoost and LightGBM allows for more extensive experimentation with different hyperparameters and larger datasets, potentially leading to the discovery of better performing models.

In essence, the superior performance of XGBoost and LightGBM likely stems from their more advanced algorithms, efficient implementations, and the benefits of hyperparameter tuning, allowing them to learn more effectively from your numerical and TF-IDF features compared to the basic GBM.

# Extra trees

## Model training

In [94]:
preprocessor_et = ColumnTransformer([
    ('num', StandardScaler(), [col for col in numeric_features_to_scale if col in X_train.columns]),
    ('tfidf', 'passthrough', [col for col in tfidf_and_other_features if col in X_train.columns])
], remainder='passthrough')

# Extra Trees Classifier
extra_trees = ExtraTreesClassifier(
    random_state=69,
    n_jobs=-1 # Use all available cores
)

pipeline_et = Pipeline([
    ('preprocessing', preprocessor_et),
    ('extra_trees', extra_trees)
])

param_distributions_et = {
    'extra_trees__n_estimators': randint(100, 400),
    'extra_trees__max_depth': [None] + list(randint(5, 30).rvs(5)), # None for no max depth
    'extra_trees__min_samples_split': randint(2, 10),
    'extra_trees__min_samples_leaf': randint(1, 5),
    'extra_trees__criterion': ['gini', 'entropy']
}

random_search_et = RandomizedSearchCV(
    estimator=pipeline_et,
    param_distributions=param_distributions_et,
    n_iter=50, # Adjust as needed
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1,
    random_state=69,
    error_score='raise'
)

# Step 5: Train Extra Trees with RandomizedSearchCV
random_search_et.fit(X_train, y_train)

# Get the best Extra Trees model
best_model_et = random_search_et.best_estimator_

Fitting 3 folds for each of 50 candidates, totalling 150 fits


### Model prediction

#### Train

In [95]:
y_pred = best_model_et.predict(X_train)

# Step 4: Evaluate
print(classification_report(y_train, y_pred))
print(confusion_matrix(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      7772
           1       1.00      0.83      0.91       288

    accuracy                           0.99      8060
   macro avg       1.00      0.91      0.95      8060
weighted avg       0.99      0.99      0.99      8060

[[7772    0]
 [  49  239]]


#### Test

In [96]:
y_pred = best_model_et.predict(X_test)

# Step 4: Evaluate
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1944
           1       0.97      0.40      0.57        72

    accuracy                           0.98      2016
   macro avg       0.97      0.70      0.78      2016
weighted avg       0.98      0.98      0.97      2016

[[1943    1]
 [  43   29]]
