# How to leverage Optuna


**This notebook focuses on leveraging Optuna for model tuning, using your previous Optuna results (tuned model hyperparameters) as a starting point for further model optimization. In this example, a LightGBM model was tuned for 6000 iterations.**

# Data 
**Libraries are imported and the train and test data are loaded. The 'id' column is saved for submission and then dropped from the the test and train dataframe. Shape of train and test data is printed.**

In [1]:
# Importing libraries

# Data processing
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Visualizations
import matplotlib.pyplot as plt

# Model building
from lightgbm import LGBMClassifier
import lightgbm as lgb
import optuna

# Model evaluations
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score


# Loading the data
file_path_train = '/kaggle/input/playground-series-s4e6/train.csv'
file_path_test = '/kaggle/input/playground-series-s4e6/test.csv'


df_train = pd.read_csv(file_path_train)
df_test = pd.read_csv(file_path_test)

# Dropping the first column
submission_id = df_test['id'].reset_index(drop=True)
df_train = df_train.drop(columns='id')
df_test = df_test.drop(columns='id')

# Shape of train and test data
print("train data shape: ", df_train.shape)
print("test data shape: ", df_test.shape)

train data shape:  (76518, 37)
test data shape:  (51012, 36)


# Data Overview

In [2]:
# DataFrame information
print(df_train.info())

# Summary statistics 
df_train.describe().T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76518 entries, 0 to 76517
Data columns (total 37 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  76518 non-null  int64  
 1   Application mode                                76518 non-null  int64  
 2   Application order                               76518 non-null  int64  
 3   Course                                          76518 non-null  int64  
 4   Daytime/evening attendance                      76518 non-null  int64  
 5   Previous qualification                          76518 non-null  int64  
 6   Previous qualification (grade)                  76518 non-null  float64
 7   Nacionality                                     76518 non-null  int64  
 8   Mother's qualification                          76518 non-null  int64  
 9   Father's qualification                 

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Marital status,76518.0,1.111934,0.441669,1.0,1.0,1.0,1.0,6.0
Application mode,76518.0,16.054419,16.682337,1.0,1.0,17.0,39.0,53.0
Application order,76518.0,1.64441,1.229645,0.0,1.0,1.0,2.0,9.0
Course,76518.0,9001.286377,1803.438531,33.0,9119.0,9254.0,9670.0,9991.0
Daytime/evening attendance,76518.0,0.915314,0.278416,0.0,1.0,1.0,1.0,1.0
Previous qualification,76518.0,3.65876,8.623774,1.0,1.0,1.0,1.0,43.0
Previous qualification (grade),76518.0,132.378766,10.995328,95.0,125.0,133.1,140.0,190.0
Nacionality,76518.0,1.2266,3.392183,1.0,1.0,1.0,1.0,109.0
Mother's qualification,76518.0,19.837633,15.399456,1.0,1.0,19.0,37.0,44.0
Father's qualification,76518.0,23.425076,14.921164,1.0,4.0,19.0,37.0,44.0


# Data Optimization

In [3]:
# Correcting the datatype of the categorical data
category_columns_train = [0,1,3,4,5,7,8,9,10,11,13,14,15,16,17,18,20,36]
for col in category_columns_train:
    df_train.iloc[:, col] = df_train.iloc[:, col].astype('str').astype('category')

category_columns_test = [0,1,3,4,5,7,8,9,10,11,13,14,15,16,17,18,20]
for col in category_columns_test:
    df_test.iloc[:, col] = df_test.iloc[:, col].astype('str').astype('category')


X = df_train.drop(columns='Target')
y = df_train['Target']

1        1
2        1
3        1
4        1
        ..
76513    1
76514    1
76515    5
76516    1
76517    1
Name: Marital status, Length: 76518, dtype: category
Categories (6, object): ['1', '2', '3', '4', '5', '6']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_train.iloc[:, col] = df_train.iloc[:, col].astype('str').astype('category')
1        17
2        17
3         1
4         1
         ..
76513    17
76514     1
76515    17
76516     1
76517     1
Name: Application mode, Length: 76518, dtype: category
Categories (22, object): ['1', '10', '12', '15', ..., '51', '53', '7', '9']' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_train.iloc[:, col] = df_train.iloc[:, col].astype('str').astype('category')
1        9238
2        9254
3        9500
4        9500
         ... 
76513    9254
76514    9254
76515    9085
76516    9070
76517    9773
Name: Course, Length: 76518, dtype: category
Categori

# Leveraging Optuna for hyperparameter tuning (LightGBM)

In [4]:
# Run the code below to optimize Light GBM. With this code, you can use the best result of the previous runs and continue the model optimization.


# le = LabelEncoder()
# ye = le.fit_transform(y1)

# # initial params (I got these after 6000 optuna cycles):
# initial_params = {'num_leaves': 51, 'max_depth': 14, 'learning_rate': 0.05146180924674936, 'min_data_in_leaf': 68, 'feature_fraction': 0.5168869829633201, 'bagging_fraction': 0.7761541789352951, 'bagging_freq': 4, 'lambda_l1': 0.7181555315085657, 'lambda_l2': 0.7883444126577582, 'min_gain_to_split': 0.018279206881056703, 'min_child_samples': 75}

# # Objective function to be minimized
# def objective(trial):
#     param = {
#         'objective': 'multiclass',
#         'num_class': 3,
#         'metric': 'multi_logloss',
#         'num_leaves': trial.suggest_int('num_leaves', 30, 100),
#         'max_depth': trial.suggest_int('max_depth', 5, 15),
#         'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.13, log=True),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 80),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.8),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 0.8),
#         'bagging_freq': trial.suggest_int('bagging_freq', 3, 10),
#         'lambda_l1': trial.suggest_float('lambda_l1', 0.2, 0.8),
#         'lambda_l2': trial.suggest_float('lambda_l2', 0.2, 0.8),
#         'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.01, 0.1),
#         'min_child_samples': trial.suggest_int('min_child_samples', 10, 80),
#         'verbose': -1,
#         'random_state': 1,
#         'n_jobs': -1
#     }
    
#     # 10-fold cross-validation
#     kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

#     scores = []
    
#     for train_index, valid_index in kf.split(X, ye):
#         X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
#         y_train, y_valid = ye[train_index], ye[valid_index]

#         cat_cols = X.select_dtypes(include='category').columns.tolist()
        
#         train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols)
#         valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data, categorical_feature=cat_cols)
        
#         bst = lgb.train(param, train_data, 4000, valid_sets=valid_data, callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
#         y_pred = bst.predict(X_valid, num_iteration=bst.best_iteration)
#         y_pred_max = [np.argmax(line) for line in y_pred]
#         scores.append(accuracy_score(y_valid, y_pred_max))
    
#     return np.mean(scores)

# # Create a study object and optimize the objective function
# study = optuna.create_study(direction='maximize')
# study.enqueue_trial(initial_params)  # Enqueue default parameters as the first trial
# study.optimize(objective, n_trials=100, n_jobs=-1, show_progress_bar=True)

# # Print the best parameters and the best score
# print("Best parameters found: ", study.best_params)
# print("Best cross-validation accuracy: ", study.best_value)



# initial params, after 6000 optuna cycles:
best_params = {'num_leaves': 51, 'max_depth': 14, 'learning_rate': 0.05146180924674936, 'min_data_in_leaf': 68, 'feature_fraction': 0.5168869829633201, 'bagging_fraction': 0.7761541789352951, 'bagging_freq': 4, 'lambda_l1': 0.7181555315085657, 'lambda_l2': 0.7883444126577582, 'min_gain_to_split': 0.018279206881056703, 'min_child_samples': 75}


# Feature Selection

In [5]:
# feature importance

model = LGBMClassifier(verbose=-1, random_state=1, **best_params)
model.fit(X, y)

# Get feature importances
importances = model.feature_importances_

# Create a DataFrame for better handling
feature_importances = pd.DataFrame({'feature': X.columns, 'importance': importances})

# Sort features by importance
feature_importances = feature_importances.sort_values(by='importance', ascending=False)

# Print feature importances
print("\nFeature Importances:")
print(feature_importances)



# I ran the code below; removing 1 feature gives the best accuracy



# kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
# scores = cross_val_score(model, X, y, scoring='accuracy', cv=kf, n_jobs=-1)
# accuracy = np.mean(scores)
# print(f"accuracy before removing any features: {accuracy}")

# best_i = -1
# best_score = accuracy
# for i in range(1,15):
#     # Remove the least important features
#     selected_features = feature_importances[:-i]
    
#     # Extract the column names of the selected features
#     selected_columns = selected_features['feature']
    
#     # Create the new DataFrame with only the selected features
#     X2 = X[selected_columns]
#     scores = cross_val_score(model, X2, y, scoring='accuracy', cv=kf, n_jobs=-1)
#     accuracy = np.mean(scores)
#     print(f"accuracy for removing {i} least important features: {accuracy}")
    
#     if accuracy > best_score:
#         best_score = accuracy
#         best_i = i

# print(f"\nBest accuracy is {best_score} and {best_i} least important features were removed\n")




# Remove extra features
selected_features = feature_importances[:-1]
X = X[selected_features['feature']]
df_test = df_test[selected_features['feature']]


Feature Importances:
                                           feature  importance
31                Curricular units 2nd sem (grade)        1212
12                                 Admission grade        1116
3                                           Course        1108
25                Curricular units 1st sem (grade)         910
30             Curricular units 2nd sem (approved)         895
29          Curricular units 2nd sem (evaluations)         816
11                             Father's occupation         726
6                   Previous qualification (grade)         714
24             Curricular units 1st sem (approved)         699
19                               Age at enrollment         661
23          Curricular units 1st sem (evaluations)         622
28             Curricular units 2nd sem (enrolled)         508
10                             Mother's occupation         502
33                               Unemployment rate         496
18                              S

# Modeling and Submission

In [6]:
# Modeling 
model = LGBMClassifier(verbose=-1, random_state=1, **best_params)
model.fit(X, y)

# Make predictions on the test dataset
predictions = model.predict(df_test)

# Reshape predictions array to be 1-dimensional
predictions_flat = predictions.flatten()

# Prepare the submission dataframe
submission = pd.DataFrame({
    'id': submission_id,
    'Target': predictions_flat
})

print('Submission head:', submission.head(10))

# Save the submission dataframe to a CSV file
submission.to_csv('submission.csv', index=False)

Submission head:       id    Target
0  76518   Dropout
1  76519  Graduate
2  76520  Graduate
3  76521  Graduate
4  76522  Enrolled
5  76523  Graduate
6  76524  Graduate
7  76525  Graduate
8  76526   Dropout
9  76527  Graduate
