In [75]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, f1_score, mean_squared_error

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

from scipy.stats import uniform, randint
import plotly.express as px
import plotly.graph_objects as go


|Column|Description|
|:------:|:---:|
|Created|Time of ticket registration|
|CancelTime|When the passenger canceled the ticket|
|DepartureTime|Time of departure|
|BillID|Purchase ID|
|TicketID|Ticket ID|
|ReserveStatus|Customer payment status|
|UserID|User ID|
|Male|Whether the ticket belongs to a woman or a man|
|Price|Undiscounted ticket price|
|CouponDiscount|Discount that the person applied to the ticket|
|From|Origin of the trip|
|To|Destination of the trip|
|Domestic|Whether the trip is domestic or international|
|VehicleType|Identifies vehicle details|
|VehicleClass|Whether the vehicle is first class or not|
|Vehicle|Vehicle type|
|HashPassportNumber_p|Hashed passport number|
|HashEmail|Hashed Email|
|BuyerMobile|Hashed Mobile Number|
|NationalCode|Hashed National Number|
|TripReason|Reason for Trip|
|Cancel|Whether the Ticket is Cancelled or Not|

In [76]:
# Load the raw ticket dataset; the path is relative to the notebook's working directory.
mrbilit = pd.read_csv('data/mrbilit_dataset.csv')

In [None]:
# Overview of columns, dtypes and non-null counts.
mrbilit.info()

In [None]:
# Number of missing values in each column.
mrbilit.isna().sum()

In [None]:
# Number of distinct values in each column.
mrbilit.nunique()

In [80]:
# Remove fully duplicated rows; `mrbilit` is rebound to the de-duplicated frame.
mrbilit = mrbilit.drop_duplicates()

In [None]:
# Row counts per TripReason, with bars colored by the Male column.
fig = px.histogram(mrbilit, x="TripReason", color="Male", title="")
fig.show()

In [None]:
# Row counts per TripReason, with bars colored by the Cancel column.
fig = px.histogram(mrbilit, x="TripReason", color="Cancel", title="")
fig.show()

In [None]:
# Row counts per TripReason, with bars colored by the VehicleClass column.
fig = px.histogram(mrbilit, x="TripReason", color="VehicleClass")
fig.show()

In [None]:
# Row counts per Domestic value, with bars colored by TripReason.
fig = px.histogram(mrbilit, x="Domestic", color="TripReason", title="")
fig.show()

In [None]:
# Row counts per Vehicle value, with bars colored by TripReason.
fig = px.histogram(mrbilit, x="Vehicle", color="TripReason", title="")
fig.show()

In [86]:
# Parse both timestamp columns to datetime so the `.dt` accessor can be used in later cells.
mrbilit[['Created', 'DepartureTime']] = mrbilit[['Created', 'DepartureTime']].apply(pd.to_datetime)        

In [None]:
# Monthly ticket counts per trip reason, keyed on the departure month.
# .copy() makes this an independent frame, so the column assignments below do not
# write into a slice of `mrbilit` (which triggers SettingWithCopyWarning).
mrbilit_month = mrbilit[['DepartureTime', 'TripReason']].copy()
mrbilit_month['year'] = mrbilit_month['DepartureTime'].dt.year
mrbilit_month['month'] = mrbilit_month['DepartureTime'].dt.month

# Rows are (TripReason, year) pairs and columns are months; computed once and reused.
monthly_counts = mrbilit_month.groupby(['TripReason', 'year', 'month']).count().unstack()

# NOTE(review): rows are selected by position. Which (TripReason, year) pair sits at
# positions 0 and 2 depends on the years present in the data — confirm these are the
# intended rows.
Int = monthly_counts.iloc[0]
Work = monthly_counts.iloc[2]

df_line_trip_by_month = pd.DataFrame({
    # NOTE(review): assumes all 12 months appear as columns of `monthly_counts`.
    'month': np.arange(1, 13),
    'Int': Int.values,
    'work': Work.values
})
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_line_trip_by_month['month'], y=df_line_trip_by_month['Int'], name='Int',
                         line=dict(color='firebrick', width=4)))
fig.add_trace(go.Scatter(x=df_line_trip_by_month['month'], y=df_line_trip_by_month['work'], name = 'Work',
                         line=dict(color='royalblue', width=4)))
fig.show()

In [None]:
# Whole days between booking (Created) and departure.
mrbilit['TimeDiff'] = (mrbilit['DepartureTime'] - mrbilit['Created']).dt.days

# Bill counts per (TripReason, TimeDiff), pivoted so each TimeDiff value is a column;
# only the first 50 TimeDiff columns are kept.
df_timediff = (
    mrbilit
    .groupby(['TripReason', 'TimeDiff'])['BillID']
    .count()
    .unstack()
    .iloc[:, 0:50]
)
df_timediff

In [None]:
# Recompute the booking-to-departure gap (in whole days) so this cell runs on its own.
mrbilit['TimeDiff'] = (mrbilit['DepartureTime'] - mrbilit['Created']).dt.days

# Bill counts per (TripReason, TimeDiff); keep the first 50 TimeDiff columns.
df_timediff = (
    mrbilit
    .groupby(['TripReason', 'TimeDiff'])['BillID']
    .count()
    .unstack()
    .iloc[:, 0:50]
)

# First and second rows of the pivot; missing combinations count as zero.
Int = df_timediff.iloc[0].fillna(0)
Work = df_timediff.iloc[1].fillna(0)

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df_timediff.columns,
        y=Int,
        name='Int',
        line=dict(color='blue', width=4),
    )
)
# The last expression returns the figure, which is what the cell displays.
fig.add_trace(
    go.Scatter(
        x=df_timediff.columns,
        y=Work,
        name='Work',
        line=dict(color='red', width=4),
    )
)

In [None]:
# Flag rows where a positive coupon discount was applied, then plot the flag by trip reason.
mrbilit['has_discount'] = mrbilit['CouponDiscount'].gt(0)
fig = px.histogram(
    mrbilit,
    x="has_discount",
    color="TripReason",
    title="",
)
fig.show()

In [None]:
# Class balance of the target: number of rows per TripReason value.
mrbilit.TripReason.value_counts()

In [92]:
# Union of the ten most frequent destinations and the ten most frequent origins.
top_destinations = set(mrbilit.To.value_counts().iloc[0:10].index)
top_origins = set(mrbilit.From.value_counts().iloc[0:10].index)
cities = list(top_destinations.union(top_origins))

In [None]:
# Restrict to rows whose origin is one of the selected `cities`, then plot
# row counts per origin colored by TripReason.
filter_df_From = mrbilit[mrbilit['From'].isin(cities)]
fig = px.histogram(filter_df_From , x="From", color="TripReason", title="")
fig.show()

In [None]:
# Restrict to rows whose destination is one of the selected `cities`, then plot
# row counts per destination colored by TripReason.
filter_df_to = mrbilit[mrbilit['To'].isin(cities)]
fig = px.histogram(filter_df_to , x="To", color="TripReason", title="")
fig.show()

In [None]:
# Average Price for each TripReason value.
mrbilit.groupby('TripReason')['Price'].mean()

In [None]:
# Row counts per ReserveStatus value, with bars colored by TripReason.
fig = px.histogram(mrbilit, x="ReserveStatus", color="TripReason", title="")
fig.show()

In [97]:
# UserIDs of the ten users with the most rows in the dataset.
indexes = mrbilit.UserID.value_counts().iloc[:10].index

In [None]:
# For the users listed in `indexes`, count TicketIDs per (UserID, TripReason) and
# pivot so each TripReason value becomes a column.
mrbilit[mrbilit['UserID'].isin(indexes)].groupby(['UserID', 'TripReason'])['TicketID'].count().unstack()

In [99]:
# Encode the target as integers: 'Work' -> 0, 'Int' -> 1.
# Values not listed in the mapping become NaN, so re-running this cell on the
# already-encoded column turns every value into NaN.
reason_codes = {'Work': 0, 'Int': 1}
mrbilit['TripReason'] = mrbilit['TripReason'].map(reason_codes)

In [None]:
# Class balance of the target after integer encoding.
mrbilit['TripReason'].value_counts()

In [101]:
def count_family(x):
    """Return True when the group `x` holds more than one element, else False."""
    return len(x) > 1

In [138]:
class Preprocessor:
    """Turn raw ticket rows into the numeric feature frame used by the models.

    Usage: call ``transform(df, is_train=True)`` once on the training split to
    learn the city/categorical encoders and the price scale, then call
    ``transform(df, is_train=False)`` on other splits to reuse what was learned.

    NOTE(review): ``TicketPerOrder`` and ``is_with_familiy`` are group sizes
    computed inside the frame passed to ``transform``; a bill whose tickets are
    spread over several splits is therefore counted per split.
    """

    _NOT_FITTED_MSG = (
        'Preprocessor is not fitted: call transform(..., is_train=True) '
        'on the training data before transforming other data.'
    )

    def __init__(self):
        # Raw columns needed to build the features.
        self.selected_cols = ['Created', 'DepartureTime', 'BillID', 'TicketID', 'ReserveStatus',
                              'Male', 'Price', 'CouponDiscount', 'From', 'To', 'Domestic',
                              'VehicleClass', 'Vehicle', 'Cancel', 'BuyerMobile']

        # Columns (and their order) of the frame returned by transform().
        self.final_features = ['ReserveStatus',
                               'Male', 'Domestic', 'Vehicle', 'Cancel',
                               'TimeDiff', 'TicketPerOrder', 'is_with_familiy', 'net_price',
                               'From_encoded', 'To_encoded', 'has_discount']

    def select_useful_cols(self):
        """Keep only ``selected_cols`` as the working frame ``self.df``."""
        # .copy() gives an independent frame, so the column assignments made by the
        # later steps never write into a slice of ``self.dataset``.
        self.df = self.dataset[self.selected_cols].copy()

    def is_with_family(self):
        """Flag tickets whose bill (BillID) contains more than one ticket."""
        self.df['is_with_familiy'] = self.df.groupby('BillID')['Male'].transform(count_family)

    def datetime_cols(self):
        """Add ``TimeDiff`` (whole days from Created to DepartureTime) and drop both timestamps."""
        self.df[['Created', 'DepartureTime']] = self.df[['Created', 'DepartureTime']].apply(pd.to_datetime)
        self.df['TimeDiff'] = (self.df['DepartureTime'] - self.df['Created']).dt.days
        self.df = self.df.drop(columns=['Created', 'DepartureTime'])

    def ticket_per_order(self):
        """Add ``TicketPerOrder``: number of TicketIDs sharing the row's BillID."""
        self.df['TicketPerOrder'] = self.df.groupby('BillID').TicketID.transform('count')

    def handle_monetary(self, is_train=True):
        """Add ``net_price`` (scaled) and ``has_discount``.

        Args:
            is_train: when True, learn the scaling constant (max net price) from
                this frame; when False, reuse the constant learned on the training
                frame so every split is on the same scale.

        Raises:
            RuntimeError: if ``is_train`` is False and no scale has been learned yet.
        """
        self.df['net_price'] = self.df.Price - self.df.CouponDiscount
        if is_train:
            self.net_price_max = self.df['net_price'].max()
        elif not hasattr(self, 'net_price_max'):
            raise RuntimeError(self._NOT_FITTED_MSG)
        self.df['net_price'] /= self.net_price_max
        self.df['has_discount'] = self.df.CouponDiscount > 0

    def encode_cities(self, is_train):
        """Replace ``From``/``To`` with integer codes from one shared city encoder.

        Cities not seen while fitting are encoded as -1.

        Raises:
            RuntimeError: if ``is_train`` is False and no encoder has been fitted yet.
        """
        if is_train:
            cities = list(set(self.df.From.unique().tolist()).union(set(self.df.To.unique().tolist())))
            self.city_encoder = LabelEncoder().fit(cities)
        elif not hasattr(self, 'city_encoder'):
            # An unfitted instance has no encoder attribute at all, so check for it
            # explicitly instead of waiting for an unrelated exception.
            raise RuntimeError(self._NOT_FITTED_MSG)

        # classes_ is sorted and unique, so a label's position is its code.
        city2idx = {city: idx for idx, city in enumerate(self.city_encoder.classes_)}

        self.df['From_encoded'] = self.df.From.map(city2idx).fillna(-1).astype(int)
        self.df['To_encoded'] = self.df.To.map(city2idx).fillna(-1).astype(int)
        self.df = self.df.drop(columns=['To', 'From'])

    def encode_categorical_cols(self, is_train):
        """Label-encode the non-numeric columns that end up in ``final_features``.

        Labels not seen while fitting are encoded as -1 (same convention as
        ``encode_cities``).

        Raises:
            RuntimeError: if ``is_train`` is False and no encoders have been fitted yet.
        """
        if is_train:
            # 'number' excludes every numeric dtype regardless of platform integer width.
            non_numeric = self.df.select_dtypes(exclude='number').columns
            # Only columns that survive select_final_features() need an encoder;
            # encoding discarded columns (e.g. BuyerMobile) would only add ways to fail.
            self.cat_cols = [col for col in non_numeric if col in self.final_features]
            self.cat_les = {col: LabelEncoder().fit(self.df[col]) for col in self.cat_cols}
        elif not hasattr(self, 'cat_les'):
            raise RuntimeError(self._NOT_FITTED_MSG)

        for col in self.cat_cols:
            encoder = self.cat_les[col]
            values = self.df[col]
            known = values.isin(encoder.classes_).to_numpy()
            codes = np.full(len(values), -1, dtype=int)
            codes[known] = encoder.transform(values.to_numpy()[known])
            self.df[col] = codes

    def select_final_features(self):
        """Reduce the working frame to ``final_features`` in that order."""
        self.df = self.df[self.final_features]

    def transform(self, dataset: pd.DataFrame, is_train=True):
        """Run all preprocessing steps on a copy of ``dataset`` and return the feature frame.

        Args:
            dataset: raw frame containing at least ``selected_cols``.
            is_train: True to fit encoders/scale on this frame, False to reuse them.

        Returns:
            DataFrame with the columns listed in ``final_features``.
        """
        self.dataset = dataset.copy()

        self.select_useful_cols()
        self.datetime_cols()
        self.ticket_per_order()
        self.is_with_family()
        self.handle_monetary(is_train)
        self.encode_cities(is_train)
        self.encode_categorical_cols(is_train)
        self.select_final_features()

        return self.df


In [139]:
# Separate features from the target.
x = mrbilit.drop(columns='TripReason')
y = mrbilit['TripReason']

# Hold out 1% as the test set, then 5% of the remainder as the validation set.
# Both splits are stratified on the target and seeded so that a fresh
# Restart & Run All reproduces the same partitions.
x_train_val, x_test, y_train_val, y_test = train_test_split(
    x, y, test_size=0.01, shuffle=True, stratify=y, random_state=42)

x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, test_size=0.05, shuffle=True, stratify=y_train_val, random_state=42)

In [None]:
# Fit the preprocessor on the training split only (is_train=True), then reuse the
# same instance with is_train=False for the validation and test splits.
preprocessor = Preprocessor()

X_train = preprocessor.transform(x_train, is_train = True)
X_val = preprocessor.transform(x_val, is_train = False)
X_test = preprocessor.transform(x_test, is_train = False)

In [105]:
# Registry of fitted models; later cells append dicts with 'name', 'f1_score' and 'model'.
models = []

In [None]:
# Randomized hyperparameter search for XGBoost, scored by F1 with 4-fold CV.
# random_state seeds the estimator itself so row/column subsampling is reproducible.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_distributions = {
    'n_estimators': randint(50, 500),
    'learning_rate': uniform(0.01, 0.3),
    'max_depth': randint(3, 10),
    'min_child_weight': uniform(1, 10),
    'gamma': uniform(0, 5),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5)
}

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_distributions,
    n_iter=10,
    scoring='f1',
    cv=4,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)
# best_score_ is the mean CV value of the `scoring` metric, which is F1 here.
print("Best Cross-Validation F1:", random_search.best_score_)

In [None]:
# Score the best XGBoost candidate on the validation split and register it.
model = random_search.best_estimator_
y_pred = model.predict(X_val)
val_f1 = f1_score(y_val, y_pred)

print(classification_report(y_val, y_pred))
print(val_f1)

models.append(dict(name='XGBClassifier', f1_score=val_f1, model=model))

In [None]:
# Randomized hyperparameter search for a random forest, scored by F1 with 4-fold CV.
rf = RandomForestClassifier(random_state=42)

param_dist = {
    'n_estimators': np.arange(50, 200, 10),
    'max_depth': [None] + list(np.arange(10, 50, 5)),
    'min_samples_split': np.arange(2, 20, 2),
    'min_samples_leaf': np.arange(1, 10, 1),
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1',
    cv=4,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# With the default refit=True the search already refits the best candidate on the
# full training set, so no extra fit is needed. The previous trailing
# `model.fit(...)` call refitted whatever `model` was left over from the preceding
# cell — an estimator that is already stored in `models` with its recorded score.
random_search.fit(X_train, y_train)

In [None]:
# Score the best random-forest candidate on the validation split and register it.
model = random_search.best_estimator_
y_pred = model.predict(X_val)
val_f1 = f1_score(y_val, y_pred)

print(classification_report(y_val, y_pred))
print(val_f1)

models.append(dict(name='RandomForest', f1_score=val_f1, model=model))

In [None]:
# Randomized hyperparameter search for gradient boosting, scored by F1 with 4-fold CV.
gb = GradientBoostingClassifier(random_state=42)

# Discrete candidate grids; RandomizedSearchCV samples n_iter combinations from them.
param_dist = {
    'n_estimators': np.arange(50, 300, 10),
    'learning_rate': np.linspace(0.01, 0.2, 20),
    'max_depth': np.arange(3, 15, 1),
    'min_samples_split': np.arange(2, 20, 2),
    'min_samples_leaf': np.arange(1, 10, 1),
    'subsample': np.linspace(0.6, 1.0, 5),
    'max_features': ['sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(
    estimator=gb,
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1',
    cv=4,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# Rebinds `random_search`; the next cell reads best_estimator_ from this search.
random_search.fit(X_train, y_train)


In [None]:
# Score the best gradient-boosting candidate on the validation split and register it.
model = random_search.best_estimator_
y_pred = model.predict(X_val)
val_f1 = f1_score(y_val, y_pred)

print(classification_report(y_val, y_pred))
print(val_f1)

models.append(dict(name='GradientBoosting', f1_score=val_f1, model=model))

In [None]:
# Randomized hyperparameter search for LightGBM, scored by F1 with 4-fold CV.
lgbm  = LGBMClassifier(random_state=42)

# Discrete candidate grids; this search samples 20 combinations (n_iter=20).
param_dist = {
    'num_leaves': np.arange(20, 150, 10),
    'max_depth': [-1] + list(np.arange(3, 15, 1)),
    'learning_rate': np.linspace(0.01, 0.2, 20),
    'n_estimators': np.arange(50, 500, 50),
    'min_child_samples': np.arange(10, 100, 10),
    'subsample': np.linspace(0.6, 1.0, 5),
    'colsample_bytree': np.linspace(0.6, 1.0, 5),
    'reg_alpha': np.linspace(0, 1.0, 10),
    'reg_lambda': np.linspace(0, 1.0, 10),
}

random_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=4,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# Rebinds `random_search`; the next cell reads best_estimator_ from this search.
random_search.fit(X_train, y_train)

In [None]:
# Score the best LightGBM candidate on the validation split and register it.
model = random_search.best_estimator_
y_pred = model.predict(X_val)
val_f1 = f1_score(y_val, y_pred)

print(classification_report(y_val, y_pred))
print(val_f1)

models.append(dict(name='LGBM', f1_score=val_f1, model=model))

In [None]:
# Order the registry in place by validation F1 (lowest first) and list the scores.
models.sort(key=lambda entry: entry['f1_score'])
for model in models:
    name, score = model['name'], model['f1_score']
    print(name, score)

In [None]:
# Final check on the held-out test split: F1 per model, plus feature importances
# (largest first) for estimators that expose them.
for model in models:
    estimator = model['model']
    # f1_score expects (y_true, y_pred), in that order.
    print(model['name'], f1_score(y_test, estimator.predict(X_test)))
    # Capability check instead of a hard-coded list of model names.
    if hasattr(estimator, 'feature_importances_'):
        importances = [
            [feature, round(importance, 3)]
            for feature, importance in zip(X_train.columns, estimator.feature_importances_)
        ]
        print(sorted(importances, key=lambda pair: pair[1], reverse=True))

    print('-----------')