# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import RandomizedSearchCV

from imblearn.over_sampling import SMOTE
from scipy.stats import randint, uniform
from catboost import CatBoostClassifier, Pool, cv

# Load the Data

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print("Train Data Shape:", train_df.shape)
print("Test Data Shape:", test_df.shape)

Train Data Shape: (6736, 10)
Test Data Shape: (2164, 9)


In [3]:
train_df.head()

Unnamed: 0,ID,Distributor,Product,Duration,Destination,Sales,Commission,Gender,Age,Target
0,fffe3800370038003900,7,1,22,122,31.0,0.0,,20,0
1,fffe34003200370037003500,7,1,26,52,22.0,0.0,,36,0
2,fffe32003100320030003200,7,10,15,83,63.0,0.0,,34,0
3,fffe34003400310037003000,8,25,24,55,62.0,24.8,0.0,118,0
4,fffe32003400390038003000,6,16,12,122,19.8,11.88,,26,0


In [4]:
test_df.head()

Unnamed: 0,ID,Distributor,Product,Duration,Destination,Sales,Commission,Gender,Age
0,fffe31003600330038003500,6,16,8,60,69.3,41.58,,51
1,fffe33003600300031003400,2,4,368,112,161.0,40.25,0.0,51
2,fffe320033003300,2,4,387,112,291.75,72.94,0.0,51
3,fffe390039003800,7,10,4,25,18.0,0.0,,36
4,fffe3500350031003000,11,20,40,59,39.5,25.68,0.0,38


# Data Preprocessing

## Feature Selection

In [5]:
train_df = train_df.drop(['ID'], axis=1)
test_ids = test_df['ID']
test_df = test_df.drop(['ID'], axis=1)

## Handling Missing Values

In [6]:
print("Missing Values in Training Data:")
train_df.isnull().sum()

Missing Values in Training Data:


Distributor       0
Product           0
Duration          0
Destination       0
Sales             0
Commission        0
Gender         4704
Age               0
Target            0
dtype: int64

In [7]:
print("Percentage of Missing Values in column 'Gender' in Training Data:")
missing_percentage = train_df['Gender'].isnull().mean() * 100
print(f"{missing_percentage:.2f}%")

Percentage of Missing Values in column 'Gender' in Training Data:
69.83%


In [8]:
print("Missing Values in Testing Data:")
test_df.isnull().sum()

Missing Values in Testing Data:


Distributor       0
Product           0
Duration          0
Destination       0
Sales             0
Commission        0
Gender         1494
Age               0
dtype: int64

In [9]:
print("Percentage of Missing Values in column 'Gender' in Testing Data:")
missing_percentage = test_df['Gender'].isnull().mean() * 100
print(f"{missing_percentage:.2f}%")

Percentage of Missing Values in column 'Gender' in Testing Data:
69.04%


In [10]:
train_df = train_df.drop(['Gender'], axis=1)
test_df = test_df.drop(['Gender'], axis=1)

## Summary Statistics

In [11]:
print("Summary Statistics for Numerical Features:")
train_df.describe()

Summary Statistics for Numerical Features:


Unnamed: 0,Distributor,Product,Duration,Destination,Sales,Commission,Age,Target
count,6736.0,6736.0,6736.0,6736.0,6736.0,6736.0,6736.0,6736.0
mean,6.563539,9.4038,51.588034,81.681413,42.802316,10.469831,39.880344,0.046912
std,2.440587,6.62581,79.504738,39.530726,52.408053,20.342999,13.872811,0.211466
min,0.0,0.0,-1.0,0.0,-277.2,0.0,1.0,0.0
25%,6.0,2.0,10.0,55.0,18.0,0.0,35.0,0.0
50%,7.0,10.0,23.0,86.0,28.0,0.0,36.0,0.0
75%,7.0,16.0,54.0,112.0,49.5,11.88,44.0,0.0
max,15.0,25.0,444.0,139.0,666.0,262.76,118.0,1.0


In [12]:
target_counts = train_df['Target'].value_counts(normalize=True) * 100
print("\nPercentage Distribution of Target Variable:")
target_counts


Percentage Distribution of Target Variable:


Target
0    95.308789
1     4.691211
Name: proportion, dtype: float64

In [13]:
categorical_features = ['Distributor', 'Product', 'Destination']
train_df[categorical_features] = train_df[categorical_features].astype(int)

## Separate Features and Target

In [14]:
X = train_df.drop(['Target'], axis=1)
y = train_df['Target']

X_test = test_df.copy()

## Split the Data into Training and Validation Sets

In [15]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training Set Shape: {X_train.shape}")
print(f"Validation Set Shape: {X_val.shape}")


Training Set Shape: (5388, 7)
Validation Set Shape: (1348, 7)


# Models

## CatBoost

In [19]:
catboost_clf = CatBoostClassifier(
    iterations=1000,              
    learning_rate=0.1,            
    depth=6,                      
    eval_metric='F1',             
    loss_function='Logloss',      
    early_stopping_rounds=50,     
    random_seed=42,
    verbose=100                   
)

train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_features
)

val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=categorical_features
)

catboost_clf.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True
)


0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 184ms	remaining: 3m 3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0
bestIteration = 0

Shrink model to first 1 iterations.


<catboost.core.CatBoostClassifier at 0x22c29699510>

In [20]:
y_pred = catboost_clf.predict(X_val)


f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score on Validation Set: {f1 * 100:.2f}")


Weighted F1 Score on Validation Set: 93.05


In [21]:
print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=['Useful', 'Not Useful']))


Classification Report:
              precision    recall  f1-score   support

      Useful       0.95      1.00      0.98      1285
  Not Useful       0.00      0.00      0.00        63

    accuracy                           0.95      1348
   macro avg       0.48      0.50      0.49      1348
weighted avg       0.91      0.95      0.93      1348



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
feature_importances = catboost_clf.get_feature_importance()
feature_names = X_train.columns

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})
feature_importance_df


Unnamed: 0,Feature,Importance
0,Distributor,0.0
1,Product,0.0
2,Duration,50.142267
3,Destination,0.0
4,Sales,0.0
5,Commission,30.279287
6,Age,19.578446


In [23]:
param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [500, 1000, 1500],
    'l2_leaf_reg': [1, 3, 5]
}

catboost = CatBoostClassifier(
    eval_metric='F1',
    loss_function='Logloss',
    early_stopping_rounds=50,
    random_seed=42,
    verbose=0
)

grid_search = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
    n_jobs=-1
)

grid_search.fit(
    X_train, y_train,
    cat_features=categorical_features,
    eval_set=(X_val, y_val)
)

print("\nBest Parameters from Grid Search:")
print(grid_search.best_params_)

best_catboost = grid_search.best_estimator_

y_pred_best = best_catboost.predict(X_val)
best_f1 = f1_score(y_val, y_pred_best, average='weighted')
print(f"\nOptimized Weighted F1 Score on Validation Set: {best_f1 * 100:.2f}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits

Best Parameters from Grid Search:
{'depth': 8, 'iterations': 500, 'l2_leaf_reg': 1, 'learning_rate': 0.1}

Optimized Weighted F1 Score on Validation Set: 93.19


In [24]:
test_predictions = best_catboost.predict(X_test)

submission = pd.DataFrame({
    'ID': test_ids,
    'Target': test_predictions
})

submission = submission.set_index('ID').sort_index()

submission.to_csv('catboost_submission.csv')

print("\nSubmission file 'catboost_submission.csv' created successfully.")


Submission file 'catboost_submission.csv' created successfully.


In [25]:
full_pool = Pool(
    data=X,
    label=y,
    cat_features=categorical_features
)

params = {
    'depth': grid_search.best_params_['depth'],
    'learning_rate': grid_search.best_params_['learning_rate'],
    'iterations': grid_search.best_params_['iterations'],
    'l2_leaf_reg': grid_search.best_params_['l2_leaf_reg'],
    'eval_metric': 'F1',
    'loss_function': 'Logloss',
    'random_seed': 42
}

cv_results = cv(
    pool=full_pool,
    params=params,
    fold_count=5,
    partition_random_seed=42,
    shuffle=True,
    plot=True
)

print("\nCross-Validation Results:")
print(cv_results.head())


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 0.0157480	test: 0.0000000	best: 0.0000000 (0)	total: 28.6ms	remaining: 14.3s
1:	learn: 0.0079051	test: 0.0000000	best: 0.0000000 (0)	total: 59.5ms	remaining: 14.8s
2:	learn: 0.0235294	test: 0.0000000	best: 0.0000000 (0)	total: 96.9ms	remaining: 16s
3:	learn: 0.0235294	test: 0.0000000	best: 0.0000000 (0)	total: 117ms	remaining: 14.6s
4:	learn: 0.0079051	test: 0.0000000	best: 0.0000000 (0)	total: 127ms	remaining: 12.5s
5:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 158ms	remaining: 13s
6:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 190ms	remaining: 13.4s
7:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 204ms	remaining: 12.5s
8:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 241ms	remaining: 13.2s
9:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 258ms	remaining: 12.6s
10:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 279ms	remaining: 12.4s
11:	learn: 0.0000000	test: 0.00

In [26]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
print("\nClass Weights:", class_weights_dict)

catboost_clf = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    eval_metric='F1',
    loss_function='Logloss',
    early_stopping_rounds=50,
    random_seed=42,
    class_weights=class_weights_dict,
    verbose=100
)

catboost_clf.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True
)

y_pred = catboost_clf.predict(X_val)


f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score on Validation Set: {f1 * 100:.2f}")


Class Weights: {0: 0.5246348588120741, 1: 10.648221343873518}
0:	learn: 0.7459483	test: 0.6858916	best: 0.6858916 (0)	total: 16.9ms	remaining: 16.9s
100:	learn: 0.8483658	test: 0.6858916	best: 0.7180459 (72)	total: 2.04s	remaining: 18.2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7180458762
bestIteration = 72

Shrink model to first 73 iterations.

Weighted F1 Score on Validation Set: 87.41


In [27]:
smote = SMOTE(random_state=42)

X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE, counts of label '0': {}".format(sum(y_train_res == 0)))
print("After SMOTE, counts of label '1': {}".format(sum(y_train_res == 1)))

train_res_pool = Pool(
    data=X_train_res,
    label=y_train_res,
    cat_features=categorical_features
)

catboost_clf.fit(
    train_res_pool,
    eval_set=val_pool,
    use_best_model=True
)

y_pred = catboost_clf.predict(X_val)


f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score on Validation Set: {f1 * 100:.2f}")


After SMOTE, counts of label '0': 5135
After SMOTE, counts of label '1': 5135
0:	learn: 0.9760260	test: 0.6664366	best: 0.6664366 (0)	total: 26.2ms	remaining: 26.1s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7749917269
bestIteration = 33

Shrink model to first 34 iterations.

Weighted F1 Score on Validation Set: 69.96


## LightGBM

In [38]:
help(lgb.LGBMClassifier)

Help on class LGBMClassifier in module lightgbm.sklearn:

class LGBMClassifier(sklearn.base.ClassifierMixin, LGBMModel)
 |  LGBMClassifier(boosting_type: str = 'gbdt', num_leaves: int = 31, max_depth: int = -1, learning_rate: float = 0.1, n_estimators: int = 100, subsample_for_bin: int = 200000, objective: Union[str, Callable[[Optional[numpy.ndarray], numpy.ndarray], Tuple[numpy.ndarray, numpy.ndarray]], Callable[[Optional[numpy.ndarray], numpy.ndarray, Optional[numpy.ndarray]], Tuple[numpy.ndarray, numpy.ndarray]], Callable[[Optional[numpy.ndarray], numpy.ndarray, Optional[numpy.ndarray], Optional[numpy.ndarray]], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = None, class_weight: Union[Dict, str, NoneType] = None, min_split_gain: float = 0.0, min_child_weight: float = 0.001, min_child_samples: int = 20, subsample: float = 1.0, subsample_freq: int = 0, colsample_bytree: float = 1.0, reg_alpha: float = 0.0, reg_lambda: float = 0.0, random_state: Union[int, numpy.random.mtrand.RandomS

In [47]:
lgb_clf = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.05,
    n_estimators=1000,
    random_state=42,
    verbose=-1,
    min_child_samples=20,
    subsample=1.0,
    colsample_bytree=1.0,
    reg_alpha=0.0,
    reg_lambda=0.0,
)

lgb_clf.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='binary_logloss',
    categorical_feature=categorical_features,
)

y_pred = catboost_clf.predict(X_val)


f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score on Validation Set: {f1 * 100:.2f}")



Weighted F1 Score on Validation Set: 69.96


In [43]:
print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=['Useful', 'Not Useful']))


Classification Report:
              precision    recall  f1-score   support

      Useful       0.99      0.57      0.73      1285
  Not Useful       0.09      0.90      0.17        63

    accuracy                           0.59      1348
   macro avg       0.54      0.74      0.45      1348
weighted avg       0.95      0.59      0.70      1348



In [45]:
feature_importances = lgb_clf.feature_importances_
feature_names = X_train.columns

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})
feature_importance_df

Unnamed: 0,Feature,Importance
0,Distributor,15
1,Product,231
2,Duration,11108
3,Destination,378
4,Sales,6981
5,Commission,4545
6,Age,6742


In [49]:
param_dist = {
    'num_leaves': randint(20, 150),
    'learning_rate': uniform(0.01, 0.2),
    'n_estimators': randint(100, 2000),
    'max_depth': randint(3, 15),
    'min_child_samples': randint(5, 100),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(0, 1)
}

random_search = RandomizedSearchCV(
    estimator=lgb_clf,
    param_distributions=param_dist,
    n_iter=100,
    scoring='f1_weighted',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

random_search.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='binary_logloss',
    categorical_feature=categorical_features,
)

print("\nBest Parameters from Randomized Search:")
print(random_search.best_params_)

best_lgb = random_search.best_estimator_

y_pred_best = best_lgb.predict(X_val)
best_f1 = f1_score(y_val, y_pred_best, average='weighted')
print(f"\nOptimized Weighted F1 Score on Validation Set: {best_f1 * 100:.2f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits

Best Parameters from Randomized Search:
{'colsample_bytree': 0.6704017701265089, 'learning_rate': 0.19615146512071296, 'max_depth': 7, 'min_child_samples': 16, 'n_estimators': 1492, 'num_leaves': 32, 'reg_alpha': 0.7399087604473745, 'reg_lambda': 0.23823615240397944, 'subsample': 0.6888644430881474}

Optimized Weighted F1 Score on Validation Set: 92.98


In [50]:
test_predictions = best_lgb.predict(X_test)

submission = pd.DataFrame({
    'ID': test_ids,
    'Target': test_predictions
})

submission = submission.set_index('ID').sort_index()

submission.to_csv('lightgbm_submission.csv')

print("\nSubmission file 'lightgbm_submission.csv' created successfully.")



Submission file 'lightgbm_submission.csv' created successfully.


In [55]:
train_data = lgb.Dataset(X, label=y, categorical_feature=categorical_features, free_raw_data=False)

params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': random_search.best_params_.get('num_leaves', 31),
    'learning_rate': random_search.best_params_.get('learning_rate', 0.05),
    'n_estimators': random_search.best_params_.get('n_estimators', 1000),
    'max_depth': random_search.best_params_.get('max_depth', -1),
    'min_child_samples': random_search.best_params_.get('min_child_samples', 20),
    'subsample': random_search.best_params_.get('subsample', 1.0),
    'colsample_bytree': random_search.best_params_.get('colsample_bytree', 1.0),
    'reg_alpha': random_search.best_params_.get('reg_alpha', 0.0),
    'reg_lambda': random_search.best_params_.get('reg_lambda', 0.0),
    'random_state': 42
}

cv_results = lgb.cv(
    params,
    train_data,
    num_boost_round=1000,
    nfold=5,
    stratified=True,
    shuffle=True,
    metrics='binary_logloss',
    seed=42,
)

print(f"\nBest number of boosting rounds: {len(cv_results['valid binary_logloss-mean'])}")





Best number of boosting rounds: 1492


In [None]:
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
print("\nClass Weights:", class_weights_dict)

lgb_clf_balanced = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=1000,
    random_state=42,
    verbose=-1,
    class_weight=class_weights_dict
)

lgb_clf_balanced.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='binary_logloss',
    categorical_feature=categorical_features,
)

y_pred_balanced = lgb_clf_balanced.predict(X_val)

f1_balanced = f1_score(y_val, y_pred_balanced, average='weighted')
print(f"\nWeighted F1 Score on Validation Set with Class Weights: {f1_balanced * 100:.2f}")



Class Weights: {0: 0.5246348588120741, 1: 10.648221343873518}

Weighted F1 Score on Validation Set with Class Weights: 92.63


In [16]:
smote = SMOTE(random_state=42)

X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE, counts of label '0': {}".format(sum(y_train_res == 0)))
print("After SMOTE, counts of label '1': {}".format(sum(y_train_res == 1)))

lgb_clf_smote = lgb.LGBMClassifier(
    objective='binary',
    metric='binary_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=1000,
    random_state=42,
    verbose=-1
)

lgb_clf_smote.fit(
    X_train_res, y_train_res,
    eval_set=[(X_val, y_val)],
    eval_metric='binary_logloss',
    categorical_feature=categorical_features,
)

y_pred_smote = lgb_clf_smote.predict(X_val)

f1_smote = f1_score(y_val, y_pred_smote, average='weighted')
print(f"\nWeighted F1 Score on Validation Set with SMOTE: {f1_smote * 100:.2f}")



After SMOTE, counts of label '0': 5135
After SMOTE, counts of label '1': 5135

Weighted F1 Score on Validation Set with SMOTE: 91.35


## XGBoost

In [17]:
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

train_encoded = pd.DataFrame(ohe.fit_transform(train_df[categorical_features]))
train_encoded.columns = ohe.get_feature_names_out(categorical_features)

test_encoded = pd.DataFrame(ohe.transform(test_df[categorical_features]))
test_encoded.columns = ohe.get_feature_names_out(categorical_features)

train_encoded.reset_index(drop=True, inplace=True)
test_encoded.reset_index(drop=True, inplace=True)

train_df = train_df.drop(categorical_features, axis=1)
test_df = test_df.drop(categorical_features, axis=1)

train_df = pd.concat([train_df, train_encoded], axis=1)
test_df = pd.concat([test_df, test_encoded], axis=1)

In [19]:
X = train_df.drop(['Target'], axis=1)
y = train_df['Target']

X_test = test_df.copy()

In [20]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training Set Shape: {X_train.shape}")
print(f"Validation Set Shape: {X_val.shape}")


Training Set Shape: (5388, 145)
Validation Set Shape: (1348, 145)


In [23]:
xgb_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

xgb_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_val)


f1 = f1_score(y_val, y_pred, average='weighted')
print(f"\nWeighted F1 Score on Validation Set: {f1 * 100:.2f}")



Weighted F1 Score on Validation Set: 93.23


In [24]:
print("\nClassification Report:")
print(classification_report(y_val, y_pred, target_names=['Useful', 'Not Useful']))


Classification Report:
              precision    recall  f1-score   support

      Useful       0.96      0.99      0.97      1285
  Not Useful       0.27      0.05      0.08        63

    accuracy                           0.95      1348
   macro avg       0.61      0.52      0.53      1348
weighted avg       0.92      0.95      0.93      1348



In [26]:
feature_importances = xgb_clf.feature_importances_
feature_names = X_train.columns

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})
feature_importance_df

Unnamed: 0,Feature,Importance
0,Duration,0.013398
1,Sales,0.016425
2,Commission,0.012760
3,Age,0.013909
4,Distributor_0,0.000000
...,...,...
140,Destination_132,0.019811
141,Destination_135,0.000000
142,Destination_136,0.000000
143,Destination_137,0.015257


In [27]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

best_xgb = grid_search.best_estimator_

y_pred_best = best_xgb.predict(X_val)
best_f1 = f1_score(y_val, y_pred_best, average='weighted')
print(f"Optimized Weighted F1 Score: {best_f1 * 100:.2f}")


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.7}
Optimized Weighted F1 Score: 93.12


In [None]:
test_predictions = best_xgb.predict(X_test)

submission = pd.DataFrame({
    'ID': test_ids,
    'Target': test_predictions
})

submission = submission.set_index('ID').sort_index()

submission.to_csv('submission.csv')