In [None]:
import datetime as dt
from math import radians, cos, sin, asin, sqrt
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier


# Show up to 100 columns when a wide DataFrame is rendered.
pd.set_option("display.max_columns", 100)

# Mount the Google Drive folder that holds the data (Colab runtime only).
from google.colab import drive
drive.mount('/content/gdrive')
data_path = "/content/gdrive/My Drive/FareClassification/"

# Load the train and test sets, using the trip id as the row index.
train_df = pd.read_csv(f"{data_path}train.csv", index_col="tripid")
test_df = pd.read_csv(f"{data_path}test.csv", index_col="tripid")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


1. Preprocessing & Feature Engineering




In [None]:
train_df.isnull().sum() # count the missing values in each column of the raw training set

additional_fare              202
duration                     202
meter_waiting                202
meter_waiting_fare           202
meter_waiting_till_pickup    202
pickup_time                    0
drop_time                      0
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         137
label                          0
dtype: int64

In [None]:
# Recompute the trip length in seconds from the pickup/drop timestamps and use
# it wherever "duration" is missing; the timestamp columns are dropped afterwards.
elapsed_seconds = (
    pd.to_datetime(train_df['drop_time']) - pd.to_datetime(train_df['pickup_time'])
).dt.total_seconds()
train_df['duration'] = train_df['duration'].fillna(elapsed_seconds)
train_df = train_df.drop(columns=['pickup_time', 'drop_time'])
train_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
189123628,10.5,834.0,56.0,0.0,64.0,6.86252,79.8993,6.9033,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0,134.0,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0,61.0,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,6.9257,79.8895,6.92748,79.8971,82.3,correct
189128020,,1020.0,,,,6.87441,79.8615,6.84478,79.929,358.39,correct


In [None]:
# Re-count missing values per column after "duration" has been filled.
train_df.isnull().sum()

additional_fare              202
duration                       0
meter_waiting                202
meter_waiting_fare           202
meter_waiting_till_pickup    202
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         137
label                          0
dtype: int64

In [None]:
# Fill the remaining missing values with the column mean computed separately
# for rows labelled "correct" and for all other rows.
#
# Done with a vectorised groupby so that:
#   * only numeric columns are averaged (the string "label" column is left out),
#   * the original row index and row order are kept intact.
# NOTE(review): the fill value depends on the target label, which does not
# exist for the test set - confirm that this is the intended imputation scheme.
is_correct = train_df['label'] == 'correct'
numeric_columns = train_df.select_dtypes(include='number').columns
class_means = train_df.groupby(is_correct)[numeric_columns].transform('mean')
train_df[numeric_columns] = train_df[numeric_columns].fillna(class_means)

In [None]:
# Confirm that no missing values remain after the mean imputation.
train_df.isnull().sum()

additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
label                        0
dtype: int64

In [None]:
# Encode the target as integers: "correct" -> 1, "incorrect" -> 0.
dataset_map = dict(incorrect=0, correct=1)
train_df = train_df.assign(label=train_df['label'].map(dataset_map))
train_df.head()

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,10.5,834.0,56.0,0.0,64.0,6.86252,79.8993,6.9033,79.8783,270.32,1
1,10.5,791.0,47.0,0.0,134.0,6.88589,79.8984,6.91373,79.8923,197.85,1
2,10.5,1087.0,80.0,0.0,61.0,6.90839,79.8651,6.93669,79.9146,301.64,1
3,10.5,598.0,271.0,15.6638,68.0,6.9257,79.8895,6.92748,79.8971,82.3,1
4,12.44434,1020.0,356.655679,16.909938,107.549152,6.87441,79.8615,6.84478,79.929,358.39,1


In [None]:
def calc_haversine(lon1, lat1, lon2, lat2, radius_km=6371):
    """Great-circle distance between two points on a sphere (haversine formula).

    Note the argument order: longitude comes first, then latitude.

    Parameters
    ----------
    lon1, lat1 : float or array-like
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float or array-like
        Longitude and latitude of the second point, in decimal degrees.
    radius_km : float, optional
        Radius of the sphere; the result is expressed in the same unit.
        Defaults to 6371.

    Returns
    -------
    float or array-like
        Distance along the sphere's surface, element-wise for array inputs.
    """
    # Convert decimal degrees to radians.
    lon1 = np.radians(lon1)
    lat1 = np.radians(lat1)
    lon2 = np.radians(lon2)
    lat2 = np.radians(lat2)

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make arcsin return NaN; clamp it to the valid domain.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius_km

# Add a "distance" column derived from the pickup/drop coordinates, then
# discard the raw coordinate columns.
# NOTE(review): calc_haversine is declared as (lon1, lat1, lon2, lat2) but is
# called here with (lat, lon, lat, lon), so the stored values are not true
# great-circle distances. The test-set call uses the same order; if this is
# corrected, correct both calls together so the two feature columns stay
# comparable.
train_df['distance'] = calc_haversine(
    train_df['pick_lat'],
    train_df['pick_lon'],
    train_df['drop_lat'],
    train_df['drop_lon'],
)
train_df = train_df.drop(columns=['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon'])
train_df.head()

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,label,distance
0,10.5,834.0,56.0,0.0,64.0,270.32,1,2.467063
1,10.5,791.0,47.0,0.0,134.0,197.85,1,0.868942
2,10.5,1087.0,80.0,0.0,61.0,301.64,1,5.531798
3,10.5,598.0,271.0,15.6638,68.0,82.3,1,0.845795
4,12.44434,1020.0,356.655679,16.909938,107.549152,358.39,1,7.527884


In [None]:
# Remove the "meter_waiting_till_pickup" column from the training features.
train_df = train_df.drop(columns=['meter_waiting_till_pickup'])

In [None]:
# Separate the target vector from the feature matrix.
Y_train = train_df['label']
X_train = train_df.drop(columns=['label'])
Y_train.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [None]:
# Preview the first three rows of the engineered training frame.
train_df.head(3)

Unnamed: 0,additional_fare,duration,meter_waiting,meter_waiting_fare,fare,label,distance
0,10.5,834.0,56.0,0.0,270.32,1,2.467063
1,10.5,791.0,47.0,0.0,197.85,1,0.868942
2,10.5,1087.0,80.0,0.0,301.64,1,5.531798


2. Model Creation & Training

In [None]:
# Fixed seed so that the randomized searches give the same result on every run.
SEARCH_SEED = 42

# Search space for the random forest.
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt', and recent scikit-learn releases reject it.
random_grid = {'n_estimators': range(100, 500, 100),
               'max_depth': range(5, 20, 1),
               'min_samples_leaf': range(2, 5, 1),
               'max_features': ['sqrt', 'log2'],
               'bootstrap': [True, False],
               'min_samples_split': range(2, 5, 1)}

# Search space for XGBoost. min_samples_leaf, max_features, bootstrap and
# min_samples_split are random-forest parameters that XGBClassifier does not
# define, so only the keys shared by both models are searched.
xgb_grid = {'n_estimators': random_grid['n_estimators'],
            'max_depth': random_grid['max_depth']}

rforest = RandomForestClassifier(random_state=SEARCH_SEED)
xgb_ = XGBClassifier(random_state=SEARCH_SEED)

# 5-fold cross-validated randomized search for each model.
rfcv = RandomizedSearchCV(rforest, random_grid, cv=5, random_state=SEARCH_SEED)
xgbcv = RandomizedSearchCV(xgb_, xgb_grid, cv=5, random_state=SEARCH_SEED)

rfcv.fit(X_train, Y_train)
xgbcv.fit(X_train, Y_train)

# Print the tuned parameters and score
print("Tuned Parameters for random forest: {}".format(rfcv.best_params_))
print("Best score for random forest is {}".format(rfcv.best_score_))
print("Tuned Parameters for xgboost: {}".format(xgbcv.best_params_))
print("Best score for xgboost is {}".format(xgbcv.best_score_))


Tuned Parameters for random forest: {'n_estimators': 100, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 18, 'bootstrap': False}
Best score for random forest is 0.9441659253083626
Tuned Parameters for xgboost: {'n_estimators': 200, 'min_samples_split': 4, 'min_samples_leaf': 4, 'max_features': 'auto', 'max_depth': 13, 'bootstrap': False}
Best score for xgboost is 0.9450975119168052


In [None]:
# Numeric columns are mean-imputed and then standardised.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Every int64/float64 column except the target is treated as a numeric feature.
numeric_features = (
    train_df
    .select_dtypes(include=['int64', 'float64'])
    .drop(columns=['label'])
    .columns
)

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Base (level-0) learners of the stack.
level_zero = [
    ('xg', XGBClassifier(n_estimators=550, subsample=0.14)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(50, 100, 50), max_iter=1000)),
    ('dt', RandomForestClassifier(n_estimators=100, max_features='log2')),
]
# Meta (level-1) learner that combines the base learners' predictions.
level_one = LogisticRegression(penalty="l2", C=3)

estimator = StackingClassifier(estimators=level_zero, final_estimator=level_one, cv=10)

# Full pipeline: preprocessing followed by the stacked classifier.
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', estimator),
])

In [None]:
x_train, x_eval, y_eval, y_eval = train_test_split(X_train, Y_train, test_size=0.25,stratify=Y_train)

In [None]:
pipe.fit(x_eval, y_eval)
y_pred = pipe.predict(x_eval)

In [None]:
# Weighted F1 of the predictions against the hold-out labels.
f1_score(y_true=y_eval, y_pred=y_pred, average='weighted')

0.9665168822805305

In [None]:
# Refit the pipeline on the complete labelled training set.
pipe.fit(X_train, y=Y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               

3. Test Data Preprocessing

In [None]:
# Preview the raw test set before any feature engineering.
test_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [None]:
# Add the same "distance" feature that was built for the training set.
# NOTE(review): calc_haversine is declared as (lon1, lat1, lon2, lat2) but is
# called here with (lat, lon, lat, lon), matching the training-set call. If the
# order is corrected, correct both calls together so the feature stays
# consistent between train and test.
test_df['distance'] = calc_haversine(
    test_df['pick_lat'],
    test_df['pick_lon'],
    test_df['drop_lat'],
    test_df['drop_lon'],
)

In [None]:
# Drop the raw coordinates, the timestamps and "meter_waiting_till_pickup" in a
# single pass.
test_df = test_df.drop(columns=[
    'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon',
    'pickup_time', 'drop_time', 'meter_waiting_till_pickup',
])

In [None]:
# Preview the test set after the distance feature was added and the unused columns dropped.
test_df.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
213284604,10.5,924,42,2.4486,289.27,1.536052
213286352,10.5,4249,20,0.0,1912.7,12.99064
213293973,10.5,1552,255,2.6588,394.0,5.690729
213294622,10.5,462,16,0.0,154.32,0.585046
213298687,10.5,814,392,12.3692,147.47,2.408311


In [None]:
# (rows, columns) of the prepared test set.
test_df.shape

(8576, 6)

4. Predicting Results



In [None]:
# Predict a label for every test trip and attach it as a "prediction" column.
result = pipe.predict(test_df)
test_df = test_df.assign(prediction=result)

In [None]:
# Keep only the prediction column (indexed by tripid). Selecting it directly,
# rather than dropping an enumerated list of feature columns, guarantees that no
# other column can end up in the submission.
submission_df = test_df[['prediction']]

In [None]:
# Preview the submission frame.
submission_df.head()

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,1
213294622,1
213298687,1


5. Write the Generated CSV File to the Drive

In [None]:
# Save the predictions, with the tripid index, next to the input data.
submission_df.to_csv(f"{data_path}submission.csv", index=True)