In [1]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from lightgbm import plot_importance
from lightgbm import early_stopping
from lightgbm import log_evaluation

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.ensemble import ExtraTreesClassifier

In [7]:
# Importing the dataset
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
test_df.drop(columns=['row_id'])

y = train_df['target']

X = train_df.drop(columns=['row_id','target'])

# Label Encoder
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

# Dataset Spliting for training/ testing set
from sklearn.model_selection import train_test_split
X_train, X_valid , y_train , y_valid = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [8]:
N_SPLITS = 10
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

In [9]:
et_params = {
    'n_estimators': 1000,
    'n_jobs': -1,
    'random_state': 42
}

In [10]:
# Number holders for the accuracy
pred_validation_all_et = []
validation_all = []
validation_ids_all = []

y_pred_test_et = []
y_pred_test_prob_et = []

importances_et = []
accs_et = []

In [6]:
%%time

for fold, (trn_idx, val_idx) in enumerate(kf.split(X=X_train, y = y_train)):
    validation_ids_all.append(val_idx)
    print("===== Number of Fold {} =====".format(fold))
    X_tr = X_train.iloc[trn_idx]
    y_tr = y_train[trn_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train[val_idx]
    sample_weight_tr = sample_weights.iloc[trn_idx].values
    sample_weight_val = sample_weights.iloc[val_idx].values
    
    model_et = ExtraTreesClassifier(**et_params)
    
    model_et.fit(X_tr, y_tr, sample_weight_tr)
        
    importances_et.append(model_et.feature_importances_)
    
    pred_val_et = model_et.predict(X_val)
    pred_validation_all_et.append(pred_val_et)
    validation_all.append(y_val)
    
    acc_et = accuracy_score(y_true = y_val, y_pred = pred_val_et, sample_weight=sample_weight_val)
    accs_et.append(acc_et)
    
    print("FOLD", fold, "ETC Accuracy:", acc_et)
    
    # Test data predictions
    y_pred_test_et.append(model_et.predict(X_test))
    y_pred_test_prob_et.append(model_et.predict_proba(X_test))
    
print("======================================")
print("Mean Accuracy (all folds) - ETC:", np.mean(accs_et))

===== Number of Fold 0 =====


NameError: name 'sample_weights' is not defined

In [25]:
y_pred_et = model_et.predict(X_test)
y_pred_et = encoder.inverse_transform(y_pred_et)

In [28]:
submission = pd.read_csv('sample_submission.csv')
submission['target'] = y_pred_et
submission.to_csv("submission.csv", index=False)

## LGBM setup

In [33]:
lgbm_params = {
    'objective' : 'multiclass',
    'n_estimators': 300,
    'random_state': 42,
    'learning_rate': 0.05,
    'n_jobs' : -1
}

In [34]:
pred_validation_all_lgbm = []
validation_all = []
validation_ids_all = []

y_pred_test_lgbm = []
y_pred_test_prob_lgbm = []

importances_lgbm = []
accs_lgbm = []

In [35]:
%%time
for fold, (trn_idx, val_idx) in enumerate(kf.split(X=X_train, y = y_train)):
    validation_ids_all.append(val_idx)
    print("===== Fold", fold," =====")
    X_tr = X_train.iloc[trn_idx]
    y_tr = y_train[trn_idx]
    X_val = X_train.iloc[val_idx]
    y_val = y_train[val_idx]
    sample_weight_tr = sample_weights.iloc[trn_idx].values
    sample_weight_val = sample_weights.iloc[val_idx].values
    
    model_lgbm = LGBMClassifier(**lgbm_params)
    
    model_lgbm.fit(
        X_tr, 
        y_tr,
        sample_weight = sample_weight_tr,
        eval_sample_weight = [sample_weight_val],
        eval_set=[(X_val, y_val)],
        eval_metric = ['multi_logloss', 'multi_error'],
        callbacks = [early_stopping(30), log_evaluation(period=50)]
    )
    
    importances_lgbm.append(model_lgbm.feature_importances_)
    
    pred_val_lgbm = model_lgbm.predict(X_val)
    
    pred_validation_all_lgbm.append(pred_val_lgbm)
    validation_all.append(y_val)
    
    acc_lgbm = accuracy_score(y_true = y_val, y_pred = pred_val_lgbm, sample_weight=sample_weight_val)
    accs_lgbm.append(acc_lgbm)
    
    print("FOLD", fold, "LGBM Accuracy:", acc_lgbm)

    # Test data predictions
    y_pred_test_lgbm.append(model_lgbm.predict(X_test))
    y_pred_test_prob_lgbm.append(model_lgbm.predict_proba(X_test))
    
print("======================================")
print("Mean Accuracy - LGBM:", np.mean(accs_lgbm))

===== Fold 0  =====
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.35475	valid_0's multi_error: 0.0854189
[100]	valid_0's multi_logloss: 0.227124	valid_0's multi_error: 0.0597487
[150]	valid_0's multi_logloss: 0.174847	valid_0's multi_error: 0.0460975
[200]	valid_0's multi_logloss: 0.144429	valid_0's multi_error: 0.0364527
[250]	valid_0's multi_logloss: 0.126114	valid_0's multi_error: 0.0343258
[300]	valid_0's multi_logloss: 0.113299	valid_0's multi_error: 0.0319517
Did not meet early stopping. Best iteration is:
[300]	valid_0's multi_logloss: 0.113299	valid_0's multi_error: 0.0319517
FOLD 0 LGBM Accuracy: 0.968048273815412
===== Fold 1  =====
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.351561	valid_0's multi_error: 0.083677
[100]	valid_0's multi_logloss: 0.226967	valid_0's multi_error: 0.0586845
[150]	valid_0's multi_logloss: 0.178666	valid_0's multi_error: 0.0488786
[200]	valid_0's multi_lo

In [36]:
y_pred_lgbm = model_et.predict(X_test)
y_pred_lgbm = encoder.inverse_transform(y_pred_lgbm)

In [37]:
submission = pd.read_csv('sample_submission.csv')
submission['target'] = y_pred_lgbm
submission.to_csv("submission.csv", index=False)