In [43]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [44]:
# Loading data
data=pd.read_csv("uomds20/train.csv")

In [45]:
# Below print statements used to identify missing values
print(data.isnull().sum())

tripid                         0
additional_fare              202
duration                     202
meter_waiting                202
meter_waiting_fare           202
meter_waiting_till_pickup    202
pickup_time                    0
drop_time                      0
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         137
label                          0
dtype: int64


In [47]:
# Drop the rows which are included null values
data = data.dropna()

In [46]:
# Lable value map to 0 and 1
data['label'] = data['label'].map({"correct": 1, "incorrect": 0})

In [48]:
# Change data types of columns
data['pickup_time']= pd.to_datetime(data['pickup_time']) 
data['drop_time']= pd.to_datetime(data['drop_time']) 
data = data.astype({'tripid': 'object'})

In [34]:
data.dtypes

tripid                               object
additional_fare                     float64
duration                            float64
meter_waiting                       float64
meter_waiting_fare                  float64
meter_waiting_till_pickup           float64
pickup_time                  datetime64[ns]
drop_time                    datetime64[ns]
pick_lat                            float64
pick_lon                            float64
drop_lat                            float64
drop_lon                            float64
fare                                float64
label                                 int64
dtype: object

In [49]:
data.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834.0,56.0,0.0,64.0,2019-11-01 00:20:00,2019-11-01 00:34:00,6.86252,79.8993,6.9033,79.8783,270.32,1
1,189125358,10.5,791.0,47.0,0.0,134.0,2019-11-01 00:56:00,2019-11-01 01:09:00,6.88589,79.8984,6.91373,79.8923,197.85,1
2,189125719,10.5,1087.0,80.0,0.0,61.0,2019-11-01 01:08:00,2019-11-01 01:26:00,6.90839,79.8651,6.93669,79.9146,301.64,1
3,189127273,10.5,598.0,271.0,15.6638,68.0,2019-11-01 02:27:00,2019-11-01 02:37:00,6.9257,79.8895,6.92748,79.8971,82.3,1
5,189129552,10.5,3407.0,182.0,0.0,112.0,2019-11-01 05:38:00,2019-11-01 06:35:00,7.13402,79.8969,6.91865,79.8649,1065.02,1


In [50]:
# Devide dataset to feature and label set
X,y =data.iloc[:,~data.columns.isin(['tripid','pickup_time', 'drop_time','label'])],data['label']

In [51]:
#split dataset to train set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=562)

In [52]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [53]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 95.23%


In [54]:
testData = pd.read_csv("uomds20/test.csv")

In [55]:
finalDf = testData[['tripid']]
test_x =testData.iloc[:,~testData.columns.isin(['tripid','pickup_time', 'drop_time'])]
pred = model.predict(test_x)
finalDf["prediction"] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [56]:
finalDf.head(10)

Unnamed: 0,tripid,prediction
0,213284604,1
1,213286352,1
2,213293973,1
3,213294622,1
4,213298687,1
5,213299545,1
6,213302332,1
7,213302671,1
8,213305594,1
9,213305134,1


In [57]:
finalDf.to_csv("submission.csv", index= False)