In [30]:
import pandas as pd
import numpy as np
import csv
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
# import seaborn as sns
# import matplotlib.pyplot as plt
from scipy import stats
import sys
from sklearn.impute import SimpleImputer
import random
from sklearn.svm import LinearSVC

In [31]:
def read_train():
    train = pd.read_csv("train.csv")

    train = train[~(train['num_of_cancelled_trips'] >= 6)]
    train = train[~(train['anon_var_1'] >=115)]
    train = train[~(train['anon_var_2'] >= 70)]
    train = train[~(train['anon_var_3'] >= 110 )]
    train = train[~(2>train['customer_score'])]
    train = train[~(train['customer_score']>=3.7 )]
#     print(train.columns)
    return train

def hotenc(train):
    mf1 = pd.get_dummies(train['taxi_type'])
    train = train.drop('taxi_type',axis = 1)
    train = train.join(mf1,rsuffix='_Taxitype')

    mf2 = pd.get_dummies(train['customer_score_confidence'])
    train = train.drop('customer_score_confidence',axis = 1)
    train = train.join(mf2,rsuffix='_Customerscore_confidence')

    mf = pd.get_dummies(train['drop_location_type'])
    train = train.drop('drop_location_type',axis = 1)
    train = train.join(mf,rsuffix='_drop_location_type')

    mf = pd.get_dummies(train['sex'])
    train = train.drop('sex',axis = 1)
    train = train.join(mf,rsuffix='sex')

    return train

def imputer(data,test):
    
    imp_mean1 = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_mean2 = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

    
    dat = np.array(data["customer_score"])
    imp_mean1.fit(np.resize(dat,(len(dat),1)))
    dat = imp_mean1.transform(np.resize(dat,(len(dat),1)))
    data["customer_score"] = dat
#     print(data.isna().sum())

    
    dat = np.array(test["customer_score"])
    dat = imp_mean1.transform(np.resize(dat,(len(dat),1)))
    test["customer_score"] = dat
#     print(test.isna().sum())

    dat = np.array(data["anon_var_1"])
    imp_mean2.fit(np.resize(dat,(len(dat),1)))
    dat = imp_mean2.transform(np.resize(dat,(len(dat),1)))
    data["anon_var_1"] = dat
#     print(data.isna().sum())
    
    
    
    dat = np.array(test["anon_var_1"])
    dat = imp_mean1.transform(np.resize(dat,(len(dat),1)))
    test["anon_var_1"] = dat
    
    
    dat = np.array(data["months_of_activity"])
    imp_median.fit(np.resize(dat,(len(dat),1)))
    dat = imp_median.transform(np.resize(dat,(len(dat),1)))
    data["months_of_activity"] = dat
#     print(data.isna().sum())
    
    
    
    dat = np.array(test["months_of_activity"])
    dat = imp_median.transform(np.resize(dat,(len(dat),1)))
    test["months_of_activity"] = dat

    
#     print(test)
    return data, test

def boxCoxTransform(train, test):
  
    xt, _ = stats.boxcox(train["anon_var_2"])
    xte = stats.boxcox(test["anon_var_2"],lmbda = _)
    train["anon_var_2"] = (xt-xt.mean())/xt.std()
    test["anon_var_2"] = (xte-xt.mean())/xt.std()
    
    xt, _ = stats.boxcox(train["distance"])
    xte = stats.boxcox(test["distance"],lmbda = _)
    train["distance"] = (xt-xt.mean())/xt.std()
    test["distance"] = (xte-xt.mean())/xt.std()
    
    xt, _ = stats.boxcox(train["customer_score"])
    xte = stats.boxcox(test["customer_score"],lmbda = _)
    train["customer_score"] = (xt-xt.mean())/xt.std()
    test["customer_score"] = (xte-xt.mean())/xt.std()

    xt, _ = stats.boxcox(train["ratings_given_by_cust"])
    xte = stats.boxcox(test["ratings_given_by_cust"],lmbda = _)
    train["ratings_given_by_cust"] = (xt-xt.mean())/xt.std()
    test["ratings_given_by_cust"] = (xte-xt.mean())/xt.std()
    
    xt, _ = stats.boxcox(train["anon_var_1"])
    xte = stats.boxcox(test["anon_var_1"],lmbda = _)
    train["anon_var_1"] = (xt-xt.mean())/xt.std()
    test["anon_var_1"] = (xte-xt.mean())/xt.std()

    
    xt, _ = stats.boxcox(train["anon_var_3"])
    xte = stats.boxcox(test["anon_var_3"],lmbda = _)
    train["anon_var_3"] = (xt-xt.mean())/xt.std()
    test["anon_var_3"] = (xte-xt.mean())/xt.std()

    return train, test

In [32]:
train_data = read_train()
train_data = hotenc(train_data)
train_data = train_data.drop('id',axis = 1)


In [20]:
# test_data = pd.read_csv("test.csv")
# test_data = hotenc(test_data)
# train_data, test_data = imputer(train_data,test_data)
# train_data, test_data = boxCoxTransform(train_data,test_data)

In [33]:
data_train,data_test = train_test_split(train_data, test_size=0.2, shuffle = True)

data_train, data_test = imputer(data_train,data_test)
# data_train, data_test = boxCoxTransform(data_train,data_test)
data_train_y = np.array(data_train['pricing_category'])
data_train_x = np.array(data_train.drop('pricing_category',axis = 1))
data_test_y = np.array(data_test['pricing_category'])
data_test_x = np.array(data_test.drop('pricing_category',axis = 1))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [34]:
print(data_test_y)

[3 2 2 ... 3 2 3]


In [35]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier()
gpc.fit(data_train_x, data_train_y)

MemoryError: 