In [21]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, mean_squared_error

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

import matplotlib.pyplot as plt


In [2]:
mrbilit = pd.read_csv('data/mrbilit_dataset.csv')

In [3]:
def count_family(x):
    if len(x) > 1:
        return True
    else:
        return False


In [4]:
mrbilit.groupby('Cancel')['Created'].count()

Cancel
0    85716
1    15301
Name: Created, dtype: int64

In [5]:
class Preprocessor:
    
    def __init__(self):
        self.selected_cols = ['Created', 'DepartureTime', 'BillID', 'TicketID', 'ReserveStatus', 
                'Male', 'Price', 'CouponDiscount', 'From', 'To', 'Domestic',
                'VehicleClass', 'Vehicle', 'BuyerMobile', 'TripReason']

        self.final_features = ['ReserveStatus',
       'Male', 'Domestic', 'Vehicle', 'TripReason',
       'TimeDiff', 'TicketPerOrder', 'is_with_familiy', 'net_price',
       'From_encoded', 'To_encoded', 'has_discount']
        

    def select_useful_cols (self) : 
        self.df = self.dataset[self.selected_cols]


    def is_with_family(self):
        self.df['is_with_familiy'] = self.df.groupby('BillID')['Male'].transform(count_family)


    def datetime_cols (self) : 
        self.df[['Created', 'DepartureTime']] = self.df[['Created', 'DepartureTime']].apply(pd.to_datetime)        
        self.df['TimeDiff'] = (self.df['DepartureTime'] - self.df['Created']).dt.days
        self.df.drop(['Created', 'DepartureTime'], axis=1, inplace=True)
    
    def ticket_per_order (self) : 
        self.df['TicketPerOrder'] = self.df.groupby('BillID').TicketID.transform('count')
    
    def handle_monetary (self) : 
        self.df['net_price'] = self.df.Price - self.df.CouponDiscount
        self.df['net_price'] /= self.df['net_price'].max()
        self.df['has_discount'] = self.df.CouponDiscount > 0

    
    def encode_cities (self, is_train) : 
        if is_train : 
            cities = list(set(self.df.From.unique().tolist()).union(set(self.df.To.unique().tolist())))
            self.city_encoder = LabelEncoder().fit(cities)
        try : 
            city2idx = dict(zip(self.city_encoder.classes_, self.city_encoder.transform(self.city_encoder.classes_)))
            
            self.df['From_encoded'] = self.df.From.map(city2idx).fillna(-1).astype(int)
            self.df['To_encoded'] = self.df.To.map(city2idx).fillna(-1).astype(int)
            
            self.df.drop(['To', 'From'], axis=1, inplace=True)
        
        except KeyError as exc:  
            raise Exception('Something went wrong. Maybe this class is used for test date before fit on train data!')
    
    def encode_categorical_cols (self, is_train) : 
        if is_train : 
            self.cat_cols = self.df.select_dtypes(exclude=['int','float']).columns
            self.cat_les = {}

            for col in self.cat_cols : 
                le = LabelEncoder().fit(self.df[col])
                self.cat_les.update({col:le})
        try:
            for col in self.cat_cols :
                self.df[col] = self.cat_les[col].transform(self.df[col])
        except Exception as exc: 
            raise Exception('Something went wrong. Maybe this class is used for test date before fit on train data!')
        
    def select_final_features(self):
        self.df = self.df[self.final_features]

        
    def transform(self, dataset:pd.DataFrame, is_train=True) : 
        self.dataset = dataset.copy()
        
        self.select_useful_cols()
        self.datetime_cols()
        self.ticket_per_order()
        self.is_with_family()
        self.handle_monetary()
        self.encode_cities(is_train)
        self.encode_categorical_cols(is_train)
        self.select_final_features()
        
        return self.df


In [6]:
x = mrbilit.drop(columns='Cancel')
y = mrbilit['Cancel']

x_train_val, x_test, y_train_val, y_test = train_test_split(x, y, test_size=0.1, shuffle=True, stratify=y)

x_train, x_val, y_train, y_val  = train_test_split(x_train_val, y_train_val, test_size=0.15, shuffle=True, stratify=y_train_val)

In [7]:
preprocessor = Preprocessor()

X_train = preprocessor.transform(x_train, is_train = True)
X_val = preprocessor.transform(x_val, is_train = False)
X_test = preprocessor.transform(x_test, is_train = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df[['Created', 'DepartureTime']] = self.df[['Created', 'DepartureTime']].apply(pd.to_datetime)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['TimeDiff'] = (self.df['DepartureTime'] - self.df['Created']).dt.days
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df.drop(['Created', 'DepartureTime'], axis=1, inplace=True)
A value is trying t

In [8]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)




In [9]:
y_train.value_counts()

Cancel
1    65572
0    65572
Name: count, dtype: int64

In [10]:
models = []

In [11]:
model = RandomForestClassifier(n_estimators=100, random_state=42)

model.fit(X_train, y_train)

In [12]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f1_score(y_val, y_pred))
models.append({
    'name': 'RandomForest',
    'f1_score': f1_score(y_val, y_pred),
    'model' : model
})

              precision    recall  f1-score   support

           0       0.98      0.98      0.98     11572
           1       0.91      0.91      0.91      2066

    accuracy                           0.97     13638
   macro avg       0.95      0.95      0.95     13638
weighted avg       0.97      0.97      0.97     13638

0.9138014527845036


In [13]:
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

In [14]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f1_score(y_val, y_pred))
models.append({
    'name': 'GradientBoosting',
    'f1_score': f1_score(y_val, y_pred),
    'model' : model
})

              precision    recall  f1-score   support

           0       0.99      0.97      0.98     11572
           1       0.87      0.92      0.89      2066

    accuracy                           0.97     13638
   macro avg       0.93      0.95      0.94     13638
weighted avg       0.97      0.97      0.97     13638

0.8917586287861


In [15]:
model = LGBMClassifier(random_state=42)
model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 65572, number of negative: 65572
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002987 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 677
[LightGBM] [Info] Number of data points in the train set: 131144, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




In [16]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f1_score(y_val, y_pred))
models.append({
    'name': 'LGBM',
    'f1_score': f1_score(y_val, y_pred),
    'model' : model
})

              precision    recall  f1-score   support

           0       0.99      0.98      0.98     11572
           1       0.89      0.92      0.90      2066

    accuracy                           0.97     13638
   macro avg       0.94      0.95      0.94     13638
weighted avg       0.97      0.97      0.97     13638

0.9048414023372288


In [27]:
model = KNeighborsClassifier(n_neighbors=2)
model.fit(X_train, y_train)

In [28]:
y_pred = model.predict(X_val)
print(classification_report(y_val, y_pred))
print(f1_score(y_val, y_pred))
models.append({
    'name': 'KNeighbors',
    'f1_score': f1_score(y_val, y_pred),
    'model' : model
})

              precision    recall  f1-score   support

           0       0.97      0.98      0.98     11572
           1       0.88      0.84      0.86      2066

    accuracy                           0.96     13638
   macro avg       0.93      0.91      0.92     13638
weighted avg       0.96      0.96      0.96     13638

0.8611799702528508


In [29]:
models.sort(key=lambda x: x['f1_score'])
models

[{'name': 'KNeighbors',
  'f1_score': 0.8611799702528508,
  'model': KNeighborsClassifier(n_neighbors=2)},
 {'name': 'GradientBoosting',
  'f1_score': 0.8917586287861,
  'model': GradientBoostingClassifier(random_state=42)},
 {'name': 'LGBM',
  'f1_score': 0.9048414023372288,
  'model': LGBMClassifier(random_state=42)},
 {'name': 'RandomForest',
  'f1_score': 0.9138014527845036,
  'model': RandomForestClassifier(random_state=42)}]

In [30]:
f1_score(models[-1]['model'].predict(X_test), y_test)

0.9133289560078792