In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from xgboost import XGBClassifier

<center>
<div dir=rtl style="direction: rtl;line-height:200%;font-family:vazir;font-size:medium">
<font face="vazir" size=3>
    
|ستون|توضیحات|
|:------:|:---:|
|Created|زمان ثبت بلیط|
|CancelTime|زمانی که مسافر بلیط را کنسل کرده است|
|DepartureTime|زمان حرکت|
|BillID|شناسه خرید|
|TicketID|شناسه بلیط|
|ReserveStatus|وضعیت پرداخت مشتری|
|UserID|شناسه کاربری|
|Male|آیا بلیط متعلق به زن است یا مرد|
|Price|قیمت بلیط بدون تخفیف|
|CouponDiscount|تخفیفی که شخص روی بلیط اعمال کرده است|
|From|مبدا سفر|
|To|مقصد سفر|
|Domestic|آیا سفر، داخلی است یا خارجی|
|VehicleType|جزئیات وسیله نقلیه را مشخص می‌کند|
|VehicleClass|آیا وسیله نقلیه، فرست‌کلس است یا نه|
|Vehicle|نوع وسیله نقلیه|
|HashPassportNumber_p|هش‌شده شماره پاسپورت|
|HashEmail|هش‌شده ایمیل|
|BuyerMobile|هش‌شده شماره موبایل|
|NationalCode|هش‌شده شماره‌ملی|
|TripReason|دلیل سفر|
|Cancel|آیا بلیط کنسل شده‌است یا نه|
    
</font>
</div>
</center>



In [2]:
# Read the raw training set from disk; the bare name on the last line makes
# the (truncated) frame the cell's rich output.
TRAIN_CSV_PATH = '../data/train_data.csv'
train_data = pd.read_csv(filepath_or_buffer=TRAIN_CSV_PATH)
train_data

Unnamed: 0,Created,CancelTime,DepartureTime,BillID,TicketID,ReserveStatus,UserID,Male,Price,CouponDiscount,...,Domestic,VehicleType,VehicleClass,TripReason,Vehicle,Cancel,HashPassportNumber_p,HashEmail,BuyerMobile,NationalCode
0,2022-07-26 13:33:20.457,,2022-07-26 16:30:00,38428546,7445571.0,3,,True,1180000.0,0.0,...,1,VIPمانیتوردار-شارژراختصاصی تخت شو مارال (جدید)...,True,Work,Bus,0,,,302222356019,330024570
1,2022-10-27 23:07:01.837,2022-10-27 23:26:39.070,2022-10-29 09:45:00,39768762,7762719.0,5,,False,1050000.0,0.0,...,1,classicus 2+2,True,Int,Bus,1,,,900764168521,995520696
2,2022-09-12 11:01:13.607,,2022-10-03 18:35:00,39128001,2327596.0,5,800398.0,False,4674000.0,0.0,...,1,فوکر 100,False,Int,Plane,0,,1c44d7a76b52341fa12dcfa993138576befcc9ebf01d14...,749804783291,979382950
3,2022-08-08 17:43:35.840,,2022-08-08 22:30:00,38606546,7495440.0,3,,True,1200000.0,0.0,...,1,VIPدرسا+مانیتوردار+شارژراختصاصی+پذیرایی,True,Work,Bus,0,,,781396205677,911237229
4,2022-11-01 15:12:56.823,,2022-11-03 11:30:00,39822185,2356902.0,5,,True,6222000.0,0.0,...,1,,False,Work,Plane,0,,bb38b345aec02255e31d178492907175c5984f2a1f5b59...,524576220177,727496008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101012,2022-10-27 21:41:35.803,,2022-11-05 20:15:00,39767774,3082282.0,2,,False,7200000.0,0.0,...,1,5ستاره بيزينس سلامت فدك,,Int,Train,0,,,395081863564,789320493
101013,2022-09-24 15:51:11.993,,2022-10-21 09:15:00,39319207,3026516.0,2,151423.0,False,4292000.0,0.0,...,1,4ستاره4تخته سپهر,,Int,Train,0,,3f28ed65a16d629747e4d27fab100b2b082fcbdf7ec831...,130026405332,866503410
101014,2022-08-12 13:46:20.480,,2022-08-12 23:59:00,38653461,7508988.0,3,,True,1320000.0,0.0,...,1,VIP 2+1 / مانیتوردار / سیستم تهویه مطبوع / تخ...,True,Int,Bus,0,,,784949357488,881677039
101015,2022-06-20 15:15:37.780,,2022-06-25 10:10:00,37880627,1057205.0,5,641744.0,True,13000000.0,0.0,...,1,,False,Work,Plane,0,,750690ca99468f159eff6ad928cec4339b089af8c2e6dc...,766602541733,403505466


In [3]:
# Count the missing values in every column (isnull is an alias of isna).
train_data.isnull().sum(axis=0)

Created                      0
CancelTime               85705
DepartureTime                0
BillID                       0
TicketID                     0
ReserveStatus                0
UserID                   58515
Male                         0
Price                        0
CouponDiscount               0
From                         0
To                           0
Domestic                     0
VehicleType               7678
VehicleClass             38351
TripReason                   0
Vehicle                      0
Cancel                       0
HashPassportNumber_p    100155
HashEmail                58002
BuyerMobile                  0
NationalCode                 0
dtype: int64

In [4]:
# Preprocessing: turn the raw ticket table into a fully numeric feature matrix
# `X` scaled to [0, 1] and a target vector `y` (the `Cancel` column).
#
# NOTE: this cell rebinds `train_data` to a reduced frame (columns dropped,
# `dif_day` added), so re-running it requires re-running the loading cell first.
# NOTE(review): the encoders and the scaler are fit on every row, i.e. before
# the train/test split performed in the modeling cell.

# Drop the columns this model does not use.
unused_cols = ['CancelTime','VehicleType','UserID','HashPassportNumber_p','HashEmail','NationalCode']
train_data = train_data.drop(columns=unused_cols)

# Impute missing VehicleClass with the most frequent value. The result is
# assigned back instead of calling `fillna(..., inplace=True)` on the selected
# column: under pandas Copy-on-Write that chained call only changes a
# temporary object and leaves `train_data` untouched.
train_data['VehicleClass'] = train_data['VehicleClass'].fillna(train_data['VehicleClass'].mode()[0])

# Whole days between booking (`Created`) and departure; the raw timestamps are
# dropped once the difference has been derived.
for time_col in ['Created','DepartureTime']:
    train_data[time_col] = pd.to_datetime(train_data[time_col])
train_data['dif_day'] = (train_data['DepartureTime'] - train_data['Created']).dt.days
train_data = train_data.drop(columns=['Created','DepartureTime'])

X = train_data.drop(columns='Cancel')
y = train_data['Cancel']

# Integer-encode the categorical columns. One fitted encoder is kept per column
# in `encoders` so the same mapping can be applied to new data later; after the
# loop `encoder` still refers to the encoder fitted on the last column.
categorical_cols = ['Male','VehicleClass','TripReason','Vehicle']
encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])
    encoders[col] = encoder

# `From` and `To` share a single vocabulary so that a given city gets the same
# code whether it appears as origin or destination.
cities = np.unique(np.append(X['From'].unique(), X['To'].unique()))
cities_dict = {city: code for code, city in enumerate(cities)}
# Every value is a key of `cities_dict` by construction, so `map` yields the
# same codes as `replace` while avoiding the chained in-place call.
X['To'] = X['To'].map(cities_dict)
X['From'] = X['From'].map(cities_dict)

# Min-max scale every feature to [0, 1]. MinMaxScaler works column by column,
# so a single fit over the whole frame gives the same values as fitting each
# column separately, and the fitted scaler keeps the parameters of all columns.
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [5]:
# Display the encoded, min-max-scaled feature matrix as the cell output.
X

Unnamed: 0,BillID,TicketID,ReserveStatus,Male,Price,CouponDiscount,From,To,Domestic,VehicleClass,TripReason,Vehicle,BuyerMobile,dif_day
0,0.640748,0.950157,0.333333,1.0,0.006992,0.0,0.563291,0.515823,1.0,1.0,1.0,0.000000,0.302219,0.000000
1,0.981829,0.997024,1.000000,0.0,0.006655,0.0,0.658228,0.458861,1.0,1.0,0.0,0.000000,0.900778,0.004695
2,0.818757,0.193845,1.000000,0.0,0.016061,0.0,0.259494,0.069620,1.0,0.0,0.0,0.666667,0.749815,0.098592
3,0.686048,0.957526,0.333333,1.0,0.007044,0.0,0.259494,0.743671,1.0,1.0,1.0,0.000000,0.781407,0.000000
4,0.995425,0.198176,1.000000,1.0,0.020079,0.0,0.895570,0.259494,1.0,0.0,1.0,0.666667,0.524580,0.004695
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101012,0.981577,0.305369,0.000000,0.0,0.022617,0.0,0.731013,0.259494,1.0,1.0,0.0,1.000000,0.395081,0.037559
101013,0.867418,0.297128,0.000000,0.0,0.015070,0.0,0.731013,0.101266,1.0,1.0,0.0,1.000000,0.130018,0.122066
101014,0.697988,0.959528,0.333333,1.0,0.007356,0.0,0.563291,0.101266,1.0,1.0,0.0,0.000000,0.784960,0.000000
101015,0.501304,0.006112,1.000000,1.0,0.037671,0.0,0.259494,0.439873,1.0,0.0,1.0,0.666667,0.766613,0.018779


In [6]:
# Modeling: hold out 20% of the rows for evaluation (fixed seed so the split is
# reproducible) and fit an XGBoost classifier on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=43
)
model = XGBClassifier(max_depth=6, n_estimators=100)
model.fit(X_train, y_train)

# Hard class predictions for the held-out rows; scored in the evaluation cell.
y_predict = model.predict(X_test)

In [7]:
# Score the held-out predictions: F1 for the positive class, followed by the
# per-class precision / recall / F1 table.
holdout_f1 = f1_score(y_test, y_predict)
holdout_report = classification_report(y_test, y_predict)

print('f1_score is:', holdout_f1, '\n')
print('classification_report is:\n', holdout_report)

f1_score is: 0.9479591836734694 

classification_report is:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99     17132
           1       0.99      0.91      0.95      3072

    accuracy                           0.98     20204
   macro avg       0.99      0.95      0.97     20204
weighted avg       0.98      0.98      0.98     20204

