In [1]:
# Importing libraries
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import svm
import pandas as pd
from sklearn import metrics

In [2]:
# Loading breast-cancer-wisconsin data 
data=pd.read_csv("uomds20/train.csv")

In [3]:
# Below print statements used to identify missing values
print(data.isnull().sum())

tripid                         0
additional_fare              202
duration                     202
meter_waiting                202
meter_waiting_fare           202
meter_waiting_till_pickup    202
pickup_time                    0
drop_time                      0
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         137
label                          0
dtype: int64


In [4]:
# Drop the rows which are included null values
data = data.dropna()

In [5]:
# Below print statements used to identify missing values
print(data.isnull().sum())

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
label                        0
dtype: int64


In [6]:
# Check data types of the columns
print("Columns and data types")
print(pd.DataFrame(data.dtypes).rename(columns = {0:'dtype'}))

Columns and data types
                             dtype
tripid                       int64
additional_fare            float64
duration                   float64
meter_waiting              float64
meter_waiting_fare         float64
meter_waiting_till_pickup  float64
pickup_time                 object
drop_time                   object
pick_lat                   float64
pick_lon                   float64
drop_lat                   float64
drop_lon                   float64
fare                       float64
label                       object


In [7]:
print("Dataset size")
print("Rows {} Columns {}".format(data.shape[0], data.shape[1]))

Dataset size
Rows 16968 Columns 14


In [8]:
# Lable value map to 0 and 1
data['label'] = data['label'].map({"correct": 1, "incorrect": 0})

In [9]:
# Devide dataset to feature and label set
X,y =data.iloc[:,~data.columns.isin(['tripid','pickup_time', 'drop_time','label'])],data['label']

In [10]:
#split dataset to train set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=562)

In [11]:
#Create a svm Classifier
clf = svm.SVC(kernel='rbf')

In [12]:
#Train the model using the training sets
clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [13]:
#Predict the response for test dataset
predictions = clf.predict(X_test)

In [14]:
# Check the accuracy
print("Accuracy:",metrics.accuracy_score(y_test, predictions)*100)


Accuracy: 92.01532115497938


In [15]:
testData = pd.read_csv("uomds20/test.csv")

In [16]:
finalDf = testData[['tripid']]
test_x =testData.iloc[:,~testData.columns.isin(['tripid','pickup_time', 'drop_time'])]
pred = clf.predict(test_x)
finalDf["prediction"] = pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [17]:
finalDf.head(10)

Unnamed: 0,tripid,prediction
0,213284604,1
1,213286352,1
2,213293973,1
3,213294622,1
4,213298687,1
5,213299545,1
6,213302332,1
7,213302671,1
8,213305594,1
9,213305134,1


In [18]:
finalDf.to_csv("submission.csv", index= False)