Loading Data

In [4]:
import pandas as pd

# Read the three competition files (training data, test data, sample
# submission) from the working directory, in that order.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train_dataset.csv', 'test_dataset.csv', 'sample_submission.csv')
)

Understanding Data

In [5]:
# Show the training frame; the notebook renders a truncated preview of it.
train

Unnamed: 0,id,user_id,age,Gender,Date_Registered,Is_current_loyalty_program_member,loyalty_points_redeemed,loyalty_tier,Received_tier_discount_percentage,Received_card_discount_percentage,...,payment_datetime,purchased_datetime,purchase_medium,final_payment,released_date,estimated_delivery_date,received_date,shipping_method,tracking_number,customer_experience
0,0,****589084,44,O,2020-01-01,NO,5,,,3.0,...,2020-01-05 22:27:16,2020-01-05 22:27:16,online,1293.00,2020-01-12,2020-01-17,2020-01-17,standard,***9AWDD64SYI,neutral
1,1,****494191,36,O,2020-01-04,YES,4,1.0,3.0,4.0,...,2020-01-06 00:37:51,2020-01-06 00:37:51,in-store,4522.44,2020-01-07,2020-01-12,2020-01-09,express,***3SSRORRZ0X,bad
2,2,****216469,40,F,2020-01-02,NO,3,,,3.0,...,2020-01-07 03:02:35,2020-01-07 03:02:35,online,5628.00,2020-01-12,2020-01-18,2020-01-18,express,***2VSB7MH7FN,good
3,3,****707170,33,M,2020-01-06,YES,1,1.0,3.0,2.0,...,2020-01-09 22:05:39,2020-01-09 22:05:39,in-store,2073.32,2020-01-13,2020-01-15,2020-01-17,express,***9XENHE2PKZ,bad
4,4,****066329,43,O,2020-01-06,YES,1,1.0,3.0,4.0,...,2020-01-11 08:32:22,2020-01-11 08:32:22,online,310.86,2020-01-15,2020-01-22,2020-01-20,express,***QTWLMEL0PE,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206964,206964,****708759,19,M,2024-12-18,NO,4,,,3.0,...,2024-12-28 23:51:36,2024-12-28 23:51:36,online,5533.00,2024-12-31,2025-01-02,2024-12-31,standard,***OD1GASFD9M,bad
206965,206965,****074598,54,M,2024-12-27,NO,2,,,1.0,...,2024-12-28 23:54:24,2024-12-28 23:54:24,online,4910.00,2024-12-30,2025-01-01,2025-01-02,standard,***TGSHGSXIL3,bad
206966,206966,****016861,25,O,2024-12-02,YES,4,4.0,10.0,4.0,...,2024-12-28 23:57:17,2024-12-28 23:57:17,in-store,4558.99,2025-01-04,2025-01-11,2025-01-10,express,***0C2DHORJKD,bad
206967,206967,****741534,54,M,2024-12-12,NO,0,,,,...,2024-12-28 23:57:28,2024-12-28 23:57:28,online,8900.00,2024-12-30,2025-01-04,2025-01-06,standard,***AXVOZIFGOZ,good


In [6]:
# Inspect the dtype pandas inferred for each column. The date/time columns
# are still plain `object` (string) columns at this point.
train.dtypes

id                                       int64
user_id                                 object
age                                      int64
Gender                                  object
Date_Registered                         object
Is_current_loyalty_program_member       object
loyalty_points_redeemed                  int64
loyalty_tier                           float64
Received_tier_discount_percentage      float64
Received_card_discount_percentage      float64
Received_coupon_discount_percentage      int64
product_category                        object
Product_value                            int64
transaction_id                          object
order_id                                object
payment_method                          object
payment_datetime                        object
purchased_datetime                      object
purchase_medium                         object
final_payment                          float64
released_date                           object
estimated_del

In [7]:
# Count the missing values in every column.
train.isna().sum()

id                                          0
user_id                                     0
age                                         0
Gender                                      0
Date_Registered                             0
Is_current_loyalty_program_member           0
loyalty_points_redeemed                     0
loyalty_tier                           102067
Received_tier_discount_percentage      103379
Received_card_discount_percentage      156879
Received_coupon_discount_percentage         0
product_category                            0
Product_value                               0
transaction_id                              0
order_id                                    0
payment_method                              0
payment_datetime                            0
purchased_datetime                          0
purchase_medium                             0
final_payment                               0
released_date                               0
estimated_delivery_date           

In [8]:
# Number of rows per `customer_experience` value (the column later used as
# the prediction target).
train['customer_experience'].value_counts()

customer_experience
good       91388
bad        68538
neutral    47043
Name: count, dtype: int64

Data Cleaning

In [9]:
# Parse every date/time column from text into datetime values so that the
# `.dt` accessor can be used on them in the following cells.
for date_field in (
    'Date_Registered',
    'payment_datetime',
    'purchased_datetime',
    'released_date',
    'estimated_delivery_date',
    'received_date',
):
    train[date_field] = pd.to_datetime(train[date_field])

In [10]:
# Re-check the dtypes: the parsed columns now report datetime64[ns].
train.dtypes

id                                              int64
user_id                                        object
age                                             int64
Gender                                         object
Date_Registered                        datetime64[ns]
Is_current_loyalty_program_member              object
loyalty_points_redeemed                         int64
loyalty_tier                                  float64
Received_tier_discount_percentage             float64
Received_card_discount_percentage             float64
Received_coupon_discount_percentage             int64
product_category                               object
Product_value                                   int64
transaction_id                                 object
order_id                                       object
payment_method                                 object
payment_datetime                       datetime64[ns]
purchased_datetime                     datetime64[ns]
purchase_medium             

Adding new features

In [11]:
# Calendar features taken from the payment timestamp.
paid_at = train['payment_datetime']
train['purchase_hour'] = paid_at.dt.hour
train['purchase_day'] = paid_at.dt.day_name()
train['purchase_month'] = paid_at.dt.month

# Whole days between registering and purchasing.
account_age = train['purchased_datetime'] - train['Date_Registered']
train['days_since_registration'] = account_age.dt.days

# Weekday names of the promised and the actual delivery date.
for source_field, weekday_field in (
    ('estimated_delivery_date', 'estimated_delivery_day'),
    ('received_date', 'received_day'),
):
    train[weekday_field] = train[source_field].dt.day_name()

In [12]:
# Convert date columns to Unix timestamps (float seconds since 1970-01-01 UTC).
date_columns = ['Date_Registered', 'payment_datetime', 'purchased_datetime',
                'released_date', 'estimated_delivery_date', 'received_date']

for col in date_columns:
    # Re-run guard: once converted, the column holds float seconds. Passing
    # those numbers through pd.to_datetime again would reinterpret them as
    # nanosecond offsets and silently corrupt the feature.
    if pd.api.types.is_numeric_dtype(train[col]):
        continue

    # utc=True treats naive values as UTC and converts aware ones to UTC, so
    # the subtraction below is always between compatible timestamps.
    stamps = pd.to_datetime(train[col], errors='coerce', utc=True)

    # Epoch arithmetic instead of `.astype(int) / 10**9`: it does not depend
    # on the column's internal time resolution, and unparseable values (NaT)
    # become NaN instead of raising or turning into a huge negative integer.
    train[col] = (stamps - pd.Timestamp('1970-01-01', tz='UTC')) / pd.Timedelta(seconds=1)

Categorical Encoding

In [14]:
# Encode the target as ordered integer codes: bad -> 0, neutral -> 1,
# good -> 2 (values outside these three categories get code -1).
train['customer_experience'] = (
    train['customer_experience']
    .astype(pd.CategoricalDtype(categories=['bad', 'neutral', 'good'], ordered=True))
    .cat.codes
)

Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text-valued feature columns that are turned into integer codes.
categorical_columns = [ 'Gender', 'Is_current_loyalty_program_member', 
                       'product_category',
                       'payment_method', 'purchase_medium', 'shipping_method', 'purchase_day','estimated_delivery_day','received_day']

# One fitted encoder per column, kept so that the exact same
# string -> integer mapping can later be applied to other data or inverted.
label_encoders = {}

for col in categorical_columns:
    # Use a fresh encoder for every column: re-fitting a single shared
    # instance overwrites the mapping learned for the previous column.
    label_encoder = LabelEncoder()
    train[col] = label_encoder.fit_transform(train[col].astype(str))  # Convert categorical to numeric
    label_encoders[col] = label_encoder

Adding more new features

In [15]:
# Delivery/payment features. The date columns are Unix timestamps in seconds
# at this point, so every difference below is a duration in seconds.
received = train['received_date']
paid = train['payment_datetime']
released = train['released_date']

delay = received - train['estimated_delivery_date']   # > 0 means late
waiting = received - paid                             # payment -> delivery

train['Delivery_time'] = received - released
train['Delivery_delay'] = delay
train['Waiting_time'] = waiting
train['Additional_charge'] = train['final_payment'] - train['Product_value']
# Ratios: a zero denominator would produce +/-inf, which the later
# fillna(0) does not remove and which scikit-learn estimators reject.
# Mask those rows to NaN so they are handled like any other missing value.
train['Waiting_percentage'] = (delay / waiting).where(waiting != 0)
train['Processing_time'] = released - paid
train['Loyalty_engagement'] = (
    train['loyalty_points_redeemed'] / train['Product_value']
).where(train['Product_value'] != 0)

In [16]:
import numpy as np

# Replace every string cell that contains at least one non-digit character
# with NaN, so that no free text is left in the frame.
# NOTE(review): presumably aimed at the masked identifier columns (user_id,
# tracking_number, ...) - confirm that no useful text column is wiped too.
train.replace(r'[^0-9]+', np.nan, regex=True, inplace=True)

  train.replace(r'[^0-9]+', np.nan, regex=True, inplace=True)


In [None]:
# Fill every remaining gap with 0, then coerce each column to a numeric dtype.
train = train.fillna(0).apply(pd.to_numeric)

In [19]:
# Separate the target column from the predictors.
y = train['customer_experience']
X = train.drop(columns=['customer_experience'])

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Mutual information between every feature and the target (seeded, so the
# estimate is reproducible), listed from most to least informative.
mi_scores = mutual_info_classif(X, y, random_state=42)
mi_scores_df = pd.DataFrame(
    {
        'Feature': X.columns,
        'MI Score': mi_scores,
    }
)

print(mi_scores_df.sort_values('MI Score', ascending=False))

Dropping least important columns

In [None]:
# Rebuild the feature matrix without the target and without the columns
# chosen for removal in this section.
y = train['customer_experience']
X = train.drop(
    columns=[
        'customer_experience',
        'tracking_number',
        'user_id',
        'loyalty_tier',
        'purchase_medium',
        'shipping_method',
        'Gender',
        'order_id',
        'Received_tier_discount_percentage',
        'Is_current_loyalty_program_member',
        'transaction_id',
    ]
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

Training the model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with class weights balanced against the label frequencies.
model = RandomForestClassifier(
    # reproducibility / class imbalance
    random_state=42,
    class_weight='balanced',
    # ensemble size
    n_estimators=300,
    # shape of the individual trees
    max_depth=None,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=2,
)

In [None]:
# # Define a list of hyperparameter combinations
# param_list = [
#     {'n_estimators': 100, 'max_depth': 10},
#     {'n_estimators': 200, 'max_depth': None},
#     {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 5},
#     {'n_estimators': 100, 'max_depth': 15, 'min_samples_split': 10}
# ]

# best_model = None
# best_score = 0

# # Iterate through parameters and test each combination
# for params in param_list:
#     model = RandomForestClassifier(**params, class_weight='balanced', random_state=42)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     score = accuracy_score(y_test, y_pred)
#     print(f"Params: {params}, Accuracy: {score}")
#     if score > best_score:
#         best_model = model
#         best_score = score

# print("Best Parameters:", best_model.get_params())

In [None]:
# Fit the random forest on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict the held-out validation rows with the fitted model.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Overall accuracy plus the per-class precision / recall / F1 table for the
# held-out split.
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)
print("Accuracy:", holdout_accuracy)
print("Classification Report:\n", holdout_report)

In [None]:
from sklearn.metrics import f1_score

# F1 averaged over the classes, weighted by how many rows each class has.
f1 = f1_score(
    y_true=y_test,
    y_pred=y_pred,
    average='weighted',
)
print(f'Weighted F1 Score: {f1}')

In [None]:
# import optuna
# from lightgbm import LGBMClassifier
# from sklearn.model_selection import cross_val_score

# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 100, 500),
#         'max_depth': trial.suggest_int('max_depth', -1, 15),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
#         'num_leaves': trial.suggest_int('num_leaves', 31, 100),
#         'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
#         'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
#         'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
#         'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
#     }

#     model = LGBMClassifier(random_state=42, **params)
#     scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
#     return scores.mean()

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=20)

# print("Best Parameters:", study.best_params)
# print("Best Score:", study.best_value)

In [None]:
from lightgbm import LGBMClassifier

# Gradient-boosted trees (LightGBM) with fixed hyper-parameters.
# NOTE(review): according to the LightGBM parameter docs, bagging_fraction
# only takes effect when bagging_freq is > 0; it is not set here, so row
# subsampling may be inactive - confirm whether that is intended.
model = LGBMClassifier(
    random_state=42,
    # boosting schedule
    n_estimators=100,
    learning_rate=0.1,
    # tree shape
    max_depth=9,
    num_leaves=57,
    # column / row subsampling
    feature_fraction=0.7233,
    bagging_fraction=0.7492,
    # L1 / L2 regularisation
    lambda_l1=1.9796,
    lambda_l2=8.1072,
)

In [None]:
# Fit the LightGBM classifier on the training split.
model.fit(X_train, y_train)

In [None]:
# Predict the held-out validation rows with the fitted model.
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Overall accuracy plus the per-class precision / recall / F1 table for the
# held-out split.
holdout_accuracy = accuracy_score(y_test, y_pred)
holdout_report = classification_report(y_test, y_pred)
print("Accuracy:", holdout_accuracy)
print("Classification Report:\n", holdout_report)

In [None]:
from sklearn.metrics import f1_score

# F1 averaged over the classes, weighted by how many rows each class has.
f1 = f1_score(
    y_true=y_test,
    y_pred=y_pred,
    average='weighted',
)
print(f'Weighted F1 Score: {f1}')

In [None]:
# Show the test frame before it goes through the same preparation steps.
test

In [None]:
# Parse every date/time column of the test data from text into datetime
# values so that the `.dt` accessor can be used on them.
for date_field in (
    'Date_Registered',
    'payment_datetime',
    'purchased_datetime',
    'released_date',
    'estimated_delivery_date',
    'received_date',
):
    test[date_field] = pd.to_datetime(test[date_field])

In [None]:
# Calendar features taken from the payment timestamp.
paid_at = test['payment_datetime']
test['purchase_hour'] = paid_at.dt.hour
test['purchase_day'] = paid_at.dt.day_name()
test['purchase_month'] = paid_at.dt.month

# Whole days between registering and purchasing.
account_age = test['purchased_datetime'] - test['Date_Registered']
test['days_since_registration'] = account_age.dt.days

# Weekday names of the promised and the actual delivery date.
for source_field, weekday_field in (
    ('estimated_delivery_date', 'estimated_delivery_day'),
    ('received_date', 'received_day'),
):
    test[weekday_field] = test[source_field].dt.day_name()

In [None]:
# Convert the test date columns to Unix timestamps (float seconds since
# 1970-01-01 UTC).
date_columns = ['Date_Registered', 'payment_datetime', 'purchased_datetime',
                'released_date', 'estimated_delivery_date', 'received_date']

for col in date_columns:
    # Re-run guard: once converted, the column holds float seconds. Passing
    # those numbers through pd.to_datetime again would reinterpret them as
    # nanosecond offsets and silently corrupt the feature.
    if pd.api.types.is_numeric_dtype(test[col]):
        continue

    # utc=True treats naive values as UTC and converts aware ones to UTC, so
    # the subtraction below is always between compatible timestamps.
    stamps = pd.to_datetime(test[col], errors='coerce', utc=True)

    # Epoch arithmetic instead of `.astype(int) / 10**9`: it does not depend
    # on the column's internal time resolution, and unparseable values (NaT)
    # become NaN instead of raising or turning into a huge negative integer.
    test[col] = (stamps - pd.Timestamp('1970-01-01', tz='UTC')) / pd.Timedelta(seconds=1)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Text-valued feature columns that are turned into integer codes.
categorical_columns = [
    'Gender',
    'Is_current_loyalty_program_member',
    'product_category',
    'payment_method',
    'purchase_medium',
    'shipping_method',
    'purchase_day',
    'estimated_delivery_day',
    'received_day',
]

# NOTE(review): the encoder is fit on `test` itself, so the integer codes
# only line up with the codes used for training if every column has exactly
# the same set of categories in both files - verify, or reuse encoders that
# were fitted on the training data.
label_encoder = LabelEncoder()

for col in categorical_columns:
    column_as_text = test[col].astype(str)
    test[col] = label_encoder.fit_transform(column_as_text)

In [None]:
# Delivery/payment features. The date columns are Unix timestamps in seconds
# at this point, so every difference below is a duration in seconds.
received = test['received_date']
paid = test['payment_datetime']
released = test['released_date']

delay = received - test['estimated_delivery_date']   # > 0 means late
waiting = received - paid                            # payment -> delivery

test['Delivery_time'] = received - released
test['Delivery_delay'] = delay
test['Waiting_time'] = waiting
test['Additional_charge'] = test['final_payment'] - test['Product_value']
# Ratios: a zero denominator would produce +/-inf, which the later
# fillna(0) does not remove. Mask those rows to NaN so they are handled
# like any other missing value.
test['Waiting_percentage'] = (delay / waiting).where(waiting != 0)
test['Processing_time'] = released - paid
test['Loyalty_engagement'] = (
    test['loyalty_points_redeemed'] / test['Product_value']
).where(test['Product_value'] != 0)

In [None]:
# Replace every string cell that contains at least one non-digit character
# with NaN, so that no free text is left in the frame.
# NOTE(review): presumably aimed at the masked identifier columns (user_id,
# tracking_number, ...) - confirm that no useful text column is wiped too.
test.replace(r'[^0-9]+', np.nan, regex=True, inplace=True)

In [None]:
# Fill every remaining gap in the test data with 0.
test = test.fillna(0)

In [None]:
# Coerce every column of the test frame to a numeric dtype.
test = test.apply(pd.to_numeric)

In [None]:
# Remove the same feature columns that were removed from the training
# matrix, so the model sees an identical set of inputs.
new_test = test.drop(
    columns=[
        'tracking_number',
        'user_id',
        'loyalty_tier',
        'purchase_medium',
        'shipping_method',
        'Gender',
        'order_id',
        'Received_tier_discount_percentage',
        'Is_current_loyalty_program_member',
        'transaction_id',
    ]
)

In [None]:
# Final LightGBM classifier with fixed hyper-parameters.
# NOTE(review): according to the LightGBM parameter docs, bagging_fraction
# only takes effect when bagging_freq is > 0; it is not set here, so row
# subsampling may be inactive - confirm whether that is intended.
final_model = LGBMClassifier(
    random_state=42,
    # boosting schedule
    n_estimators=100,
    learning_rate=0.1,
    # tree shape
    max_depth=9,
    num_leaves=57,
    # column / row subsampling
    feature_fraction=0.7233,
    bagging_fraction=0.7492,
    # L1 / L2 regularisation
    lambda_l1=1.9796,
    lambda_l2=8.1072,
)

In [None]:
# Fit on the complete feature matrix X / target y (not just the training
# split) before predicting the test file.
final_model.fit(X, y)

In [None]:
# Predict the integer class code for every row of the prepared test data.
y_pred = final_model.predict(new_test)

In [None]:
# Translate the integer class codes back into the original label names.
label_mapping = {0: "bad", 1: "neutral", 2: "good"}
y_pred_labels = list(map(label_mapping.__getitem__, y_pred))

# Pair every test id with its predicted label and write the submission file
# (without the DataFrame index).
submission = pd.DataFrame(
    {
        "id": new_test["id"],
        "customer_experience": y_pred_labels,
    }
)
submission.to_csv("submission.csv", index=False)

In [None]:
# Read the written file back from disk so the saved result can be inspected.
submission = pd.read_csv("submission.csv")

In [None]:
# Display the submission frame that was read back from disk.
submission