In [95]:
import pandas as pd
import numpy as np
import csv
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
# import seaborn as sns
# import matplotlib.pyplot as plt
from scipy import stats
import sys
from sklearn.impute import SimpleImputer
import random
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC

In [96]:
def read_train():
    train = pd.read_csv("train.csv")

    train = train[~(train['num_of_cancelled_trips'] >= 6)]
    train = train[~(train['anon_var_1'] >=115)]
    train = train[~(train['anon_var_2'] >= 70)]
    train = train[~(train['anon_var_3'] >= 110 )]
    train = train[~(2>train['customer_score'])]
    train = train[~(train['customer_score']>=3.7 )]
#     print(train.columns)
    return train

def hotenc(train):
    mf1 = pd.get_dummies(train['taxi_type'])
    train = train.drop('taxi_type',axis = 1)
    train = train.join(mf1,rsuffix='_Taxitype')

    mf2 = pd.get_dummies(train['customer_score_confidence'])
    train = train.drop('customer_score_confidence',axis = 1)
    train = train.join(mf2,rsuffix='_Customerscore_confidence')

    mf = pd.get_dummies(train['drop_location_type'])
    train = train.drop('drop_location_type',axis = 1)
    train = train.join(mf,rsuffix='_drop_location_type')

    mf = pd.get_dummies(train['sex'])
    train = train.drop('sex',axis = 1)
    train = train.join(mf,rsuffix='sex')
    mf = train['anon_var_1'].isnull()
    mf =mf.astype(int)
    print (mf)
    train = train.join(mf,rsuffix= 'anon_var_1_presence')
    return train

def imputer(data,test):
    
    imp_mean1 = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_mean2 = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

    
    dat = np.array(data["customer_score"])
    imp_mean1.fit(np.resize(dat,(len(dat),1)))
    dat = imp_mean1.transform(np.resize(dat,(len(dat),1)))
    data["customer_score"] = dat
#     print(data.isna().sum())

    
    dat = np.array(test["customer_score"])
    dat = imp_mean1.transform(np.resize(dat,(len(dat),1)))
    test["customer_score"] = dat
#     print(test.isna().sum())

    dat = np.array(data["anon_var_1"])
    imp_mean2.fit(np.resize(dat,(len(dat),1)))
    dat = imp_mean2.transform(np.resize(dat,(len(dat),1)))
    data["anon_var_1"] = dat
#     print(data.isna().sum())
    
    
    
    dat = np.array(test["anon_var_1"])
    dat = imp_mean1.transform(np.resize(dat,(len(dat),1)))
    test["anon_var_1"] = dat
    
    
    dat = np.array(data["months_of_activity"])
    imp_median.fit(np.resize(dat,(len(dat),1)))
    dat = imp_median.transform(np.resize(dat,(len(dat),1)))
    data["months_of_activity"] = dat
#     print(data.isna().sum())
    
    
    
    dat = np.array(test["months_of_activity"])
    dat = imp_median.transform(np.resize(dat,(len(dat),1)))
    test["months_of_activity"] = dat

    
#     print(test)
    return data, test

def boxCoxTransform(train, test):
  
    xt, _ = stats.boxcox(train["anon_var_2"])
    xte = stats.boxcox(test["anon_var_2"],lmbda = _)
    train["anon_var_2"] = (xt-xt.mean())/xt.std()
    test["anon_var_2"] = (xte-xt.mean())/xt.std()
    
    xt, _ = stats.boxcox(train["distance"])
    xte = stats.boxcox(test["distance"],lmbda = _)
    train["distance"] = (xt-xt.mean())/xt.std()
    test["distance"] = (xte-xt.mean())/xt.std()
    
    xt, _ = stats.boxcox(train["customer_score"])
    xte = stats.boxcox(test["customer_score"],lmbda = _)
    train["customer_score"] = (xt-xt.mean())/xt.std()
    test["customer_score"] = (xte-xt.mean())/xt.std()

    xt, _ = stats.boxcox(train["ratings_given_by_cust"])
    xte = stats.boxcox(test["ratings_given_by_cust"],lmbda = _)
    train["ratings_given_by_cust"] = (xt-xt.mean())/xt.std()
    test["ratings_given_by_cust"] = (xte-xt.mean())/xt.std()
    
    xt, _ = stats.boxcox(train["anon_var_1"])
    xte = stats.boxcox(test["anon_var_1"],lmbda = _)
    train["anon_var_1"] = (xt-xt.mean())/xt.std()
    test["anon_var_1"] = (xte-xt.mean())/xt.std()

    
    xt, _ = stats.boxcox(train["anon_var_3"])
    xte = stats.boxcox(test["anon_var_3"],lmbda = _)
    train["anon_var_3"] = (xt-xt.mean())/xt.std()
    test["anon_var_3"] = (xte-xt.mean())/xt.std()

    return train, test

In [97]:
train_data = read_train()
train_data = hotenc(train_data)



0        1
1        0
2        0
3        0
4        1
5        1
6        1
7        1
8        0
9        1
10       0
11       1
12       0
13       0
14       1
15       0
16       0
17       0
18       0
19       1
20       1
21       1
22       1
23       1
24       0
25       0
26       1
27       0
28       1
29       1
        ..
78967    0
78968    0
78969    1
78970    0
78971    0
78972    1
78973    0
78974    1
78975    1
78976    1
78977    1
78978    1
78979    0
78980    0
78981    1
78982    1
78983    1
78984    1
78985    1
78986    1
78987    1
78988    0
78989    0
78990    0
78991    1
78992    0
78993    1
78994    1
78995    1
78996    0
Name: anon_var_1, Length: 77673, dtype: int64


In [98]:
test_data = pd.read_csv("test.csv")
test_data = hotenc(test_data)
# train_data, test_data = imputer(train_data,test_data)
# train_data, test_data = boxCoxTransform(train_data,test_data)

0        1
1        1
2        1
3        0
4        0
5        0
6        1
7        0
8        1
9        1
10       0
11       0
12       1
13       1
14       1
15       1
16       1
17       1
18       0
19       1
20       0
21       1
22       0
23       1
24       1
25       1
26       1
27       0
28       0
29       1
        ..
52635    1
52636    1
52637    1
52638    0
52639    0
52640    1
52641    1
52642    0
52643    1
52644    0
52645    1
52646    1
52647    1
52648    0
52649    0
52650    1
52651    1
52652    0
52653    1
52654    1
52655    1
52656    1
52657    0
52658    0
52659    0
52660    0
52661    0
52662    1
52663    1
52664    1
Name: anon_var_1, Length: 52665, dtype: int64


In [100]:
data_train,data_test = train_test_split(train_data, test_size=0, shuffle = True)

# data_train, data_test = imputer(data_train,data_test)
# data_train, data_test = boxCoxTransform(data_train,data_test)

In [101]:
print(data_test.shape)
print(data_train.columns)

(0, 35)
Index(['id', 'distance', 'months_of_activity', 'customer_score',
       'ratings_given_by_cust', 'num_of_cancelled_trips', 'anon_var_1',
       'anon_var_2', 'anon_var_3', 'pricing_category', 'A', 'B', 'C', 'D', 'E',
       'A_Customerscore_confidence', 'B_Customerscore_confidence',
       'C_Customerscore_confidence', 'A_drop_location_type',
       'B_drop_location_type', 'C_drop_location_type', 'D_drop_location_type',
       'E_drop_location_type', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'Female', 'Male', 'anon_var_1anon_var_1_presence'],
      dtype='object')


In [93]:
def svm(training_data,testing_data):
    ytrain = training_data["pricing_category"]
    xtrain = training_data.drop("pricing_category",axis = 1)
    xtrain = xtrain.drop("id",axis = 1)
    
    ytest = testing_data["pricing_category"]
    xtest = testing_data.drop("pricing_category",axis = 1)
    xtest = xtest.drop("id",axis = 1)
    
    
#     clf = LinearSVC(random_state=0, tol=1e-5 ,max_iter = 10000)
#     clf.fit(xtrain, ytrain)

    polynomial_svm = SVC( kernel = "poly",degree = 2,coef0 = 1, gamma = 1)

    polynomial_svm.fit(xtrain,ytrain)

    outcome = clf.predict(xtest)
    
    

    accuracy = accuracy_score(ytest,outcome)
    print(accuracy)

In [None]:
svm(data_train,data_test)

In [None]:
def mlp(training_data,testing_data):
    ytrain = training_data["pricing_category"]
    xtrain = training_data.drop("pricing_category",axis = 1)
    xtrain = xtrain.drop("id",axis = 1)
    
    ytest = testing_data["pricing_category"]
    xtest = testing_data.drop("pricing_category",axis = 1)
    xtest = xtest.drop("id",axis = 1)
    
    
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(10, 3), random_state=1)
    clf.fit(xtrain, ytrain)

    #     polynomial_svm = SVC(C = c, kernel = "poly",degree = deg,coef0 = 1, gamma = 1)

    #     polynomial_svm.fit(x,y)
    
    outcome = clf.predict(xtest)
    
    

    accuracy = accuracy_score(ytest,outcome)
    print(accuracy)

In [None]:
mlp(data_train,data_test)

In [93]:
def xg(training_data,testing_data):
    ytrain = training_data["pricing_category"]
    xtrain = training_data.drop("pricing_category",axis = 1)
    xtrain = xtrain.drop("id",axis = 1)
    
    ytest = testing_data["pricing_category"]
    xtest = testing_data.drop("pricing_category",axis = 1)
    xtest = xtest.drop("id",axis = 1)
    clf = XGBClassifier(nthread=-1,max_depth = 4, learning_rate=0.01,n_estimators=500,subsample=0.77,colsample_bynode=0.75,verbosity = 0)
    
    clf.fit(xtrain, ytrain)

    #     polynomial_svm = SVC(C = c, kernel = "poly",degree = deg,coef0 = 1, gamma = 1)

    #     polynomial_svm.fit(x,y)
    
    outcome = clf.predict(xtest)
    
    

    accuracy = accuracy_score(ytest,outcome)
    print(accuracy)

In [94]:
xg(data_train,data_test)

0.699130994528484
