In [107]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, mean_squared_error

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

from scipy.stats import uniform, randint
import matplotlib.pyplot as plt

In [2]:
mrbilit = pd.read_csv('data/mrbilit_dataset.csv')

In [3]:
def count_family(x):
    if len(x) > 1:
        return True
    else:
        return False


In [4]:
mrbilit.groupby('Cancel')['Created'].count()

Cancel
0    85716
1    15301
Name: Created, dtype: int64

In [5]:
class Preprocessor:
    
    def __init__(self):
        self.selected_cols = ['Created', 'DepartureTime', 'BillID', 'TicketID', 'ReserveStatus', 
                'Male', 'Price', 'CouponDiscount', 'From', 'To', 'Domestic',
                'VehicleClass', 'Vehicle', 'BuyerMobile', 'TripReason']

        self.final_features = ['ReserveStatus',
       'Male', 'Domestic', 'Vehicle', 'TripReason',
       'TimeDiff', 'TicketPerOrder', 'is_with_familiy', 'net_price',
       'From_encoded', 'To_encoded', 'has_discount']
        

    def select_useful_cols (self) : 
        self.df = self.dataset[self.selected_cols]


    def is_with_family(self):
        self.df['is_with_familiy'] = self.df.groupby('BillID')['Male'].transform(count_family)


    def datetime_cols (self) : 
        self.df[['Created', 'DepartureTime']] = self.df[['Created', 'DepartureTime']].apply(pd.to_datetime)        
        self.df['TimeDiff'] = (self.df['DepartureTime'] - self.df['Created']).dt.days
        self.df.drop(['Created', 'DepartureTime'], axis=1, inplace=True)
    
    def ticket_per_order (self) : 
        self.df['TicketPerOrder'] = self.df.groupby('BillID').TicketID.transform('count')
    
    def handle_monetary (self) : 
        self.df['net_price'] = self.df.Price - self.df.CouponDiscount
        self.df['net_price'] /= self.df['net_price'].max()
        self.df['has_discount'] = self.df.CouponDiscount > 0

    
    def encode_cities (self, is_train) : 
        if is_train : 
            cities = list(set(self.df.From.unique().tolist()).union(set(self.df.To.unique().tolist())))
            self.city_encoder = LabelEncoder().fit(cities)
        try : 
            city2idx = dict(zip(self.city_encoder.classes_, self.city_encoder.transform(self.city_encoder.classes_)))
            
            self.df['From_encoded'] = self.df.From.map(city2idx).fillna(-1).astype(int)
            self.df['To_encoded'] = self.df.To.map(city2idx).fillna(-1).astype(int)
            
            self.df.drop(['To', 'From'], axis=1, inplace=True)
        
        except KeyError as exc:  
            raise Exception('Something went wrong. Maybe this class is used for test date before fit on train data!')
    
    def encode_categorical_cols (self, is_train) : 
        if is_train : 
            self.cat_cols = self.df.select_dtypes(exclude=['int','float']).columns
            self.cat_les = {}

            for col in self.cat_cols : 
                le = LabelEncoder().fit(self.df[col])
                self.cat_les.update({col:le})
        try:
            for col in self.cat_cols :
                self.df[col] = self.cat_les[col].transform(self.df[col])
        except Exception as exc: 
            raise Exception('Something went wrong. Maybe this class is used for test date before fit on train data!')
        
    def select_final_features(self):
        self.df = self.df[self.final_features]

        
    def transform(self, dataset:pd.DataFrame, is_train=True) : 
        self.dataset = dataset.copy()
        
        self.select_useful_cols()
        self.datetime_cols()
        self.ticket_per_order()
        self.is_with_family()
        self.handle_monetary()
        self.encode_cities(is_train)
        self.encode_categorical_cols(is_train)
        self.select_final_features()
        
        return self.df


In [72]:
x = mrbilit.drop(columns='Cancel')
y = mrbilit['Cancel']

x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, test_size=0.01, shuffle=True, stratify=y)

x_train, x_val, y_train, y_val  = train_test_split(x_train_val, y_train_val, test_size=0.05, shuffle=True, stratify=y_train_val)

In [73]:
preprocessor = Preprocessor()

X_train = preprocessor.transform(x_train, is_train = True)
X_val = preprocessor.transform(x_val, is_train = False)
X_test = preprocessor.transform(x_test, is_train = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[['Created', 'DepartureTime']] = self.df[['Created', 'DepartureTime']].apply(pd.to_datetime)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['TimeDiff'] = (self.df['DepartureTime'] - self.df['Created']).dt.days
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.drop(['Created', 'DepartureTime'], axis=1, inplace=True)
A value is trying t

In [74]:
# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)

In [108]:
y_train.value_counts()

Cancel
0    80615
1    14390
Name: count, dtype: int64

In [124]:
models = []

In [125]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_distributions = {
    'n_estimators': randint(50, 500),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_child_weight': uniform(1, 10),
    'gamma': uniform(0, 5),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)



Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'colsample_bytree': 0.6632703844029177, 'gamma': 2.852219872026997, 'learning_rate': 0.1662502780077471, 'max_depth': 8, 'min_child_weight': 4.253303307632644, 'n_estimators': 257, 'subsample': 0.8736600550686904}
Best Cross-Validation Accuracy: 0.9835166570180516


In [126]:
model = random_search.best_estimator_
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f1_score(y_val, y_pred))
models.append({
    'name': 'XGBClassifier',
    'f1_score': f1_score(y_val, y_pred),
    'model' : model
})

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4243
           1       0.99      0.87      0.93       758

    accuracy                           0.98      5001
   macro avg       0.98      0.93      0.96      5001
weighted avg       0.98      0.98      0.98      5001

0.9256661991584852


In [127]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

model.fit(X_train, y_train)

In [128]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f1_score(y_val, y_pred))
models.append({
    'name': 'RandomForest',
    'f1_score': f1_score(y_val, y_pred),
    'model' : model
})

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4243
           1       0.99      0.87      0.93       758

    accuracy                           0.98      5001
   macro avg       0.98      0.94      0.96      5001
weighted avg       0.98      0.98      0.98      5001

0.9270687237026648


In [129]:
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

In [130]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f1_score(y_val, y_pred))
models.append({
    'name': 'GradientBoosting',
    'f1_score': f1_score(y_val, y_pred),
    'model' : model
})

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4243
           1       1.00      0.87      0.93       758

    accuracy                           0.98      5001
   macro avg       0.99      0.93      0.96      5001
weighted avg       0.98      0.98      0.98      5001

0.9287226534932957


In [131]:
model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 14390, number of negative: 80615
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001579 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 730
[LightGBM] [Info] Number of data points in the train set: 95005, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.151466 -> initscore=-1.723151
[LightGBM] [Info] Start training from score -1.723151


In [132]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f1_score(y_val, y_pred))
models.append({
    'name': 'LGBM',
    'f1_score': f1_score(y_val, y_pred),
    'model' : model
})

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      4243
           1       1.00      0.87      0.93       758

    accuracy                           0.98      5001
   macro avg       0.99      0.93      0.96      5001
weighted avg       0.98      0.98      0.98      5001

0.9287226534932957


In [133]:
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train, y_train)

In [134]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f1_score(y_val, y_pred))
models.append({
    'name': 'KNeighbors',
    'f1_score': f1_score(y_val, y_pred),
    'model' : model
})

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      4243
           1       0.96      0.76      0.85       758

    accuracy                           0.96      5001
   macro avg       0.96      0.88      0.91      5001
weighted avg       0.96      0.96      0.96      5001

0.8457564575645756


In [135]:
models.sort(key=lambda x: x['f1_score'])
models

[{'name': 'KNeighbors',
  'f1_score': 0.8457564575645756,
  'model': KNeighborsClassifier(n_neighbors=2)},
 {'name': 'XGBClassifier',
  'f1_score': 0.9256661991584852,
  'model': XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
                colsample_bylevel=1, colsample_bynode=1,
                colsample_bytree=0.6632703844029177, early_stopping_rounds=None,
                enable_categorical=False, eval_metric='logloss',
                feature_types=None, gamma=2.852219872026997, gpu_id=-1,
                grow_policy='depthwise', importance_type=None,
                interaction_constraints='', learning_rate=0.1662502780077471,
                max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
                max_delta_step=0, max_depth=8, max_leaves=0,
                min_child_weight=4.253303307632644, missing=nan,
                monotone_constraints='()', n_estimators=257, n_jobs=0,
                num_parallel_tree=1, predictor='auto', random_state=0, .

In [136]:
f1_score(models[-1]['model'].predict(X_test), y_test)

0.9375

In [137]:
for model in models:
    print(f1_score(model['model'].predict(X_test), y_test))

0.8581818181818183
0.9411764705882353
0.9342560553633218
0.9342560553633218
0.9375
