In [257]:
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px

plt.style.use('seaborn')
plt.rcParams['figure.figsize'] = [8, 4.5]
plt.rcParams['figure.dpi'] = 300
warnings.simplefilter(action = 'ignore', category = FutureWarning)

#import graphing

# import mglearn

In [258]:
Test = pd.read_csv('Test.csv')
Train =pd.read_csv('Train.csv')
Train.shape

(18506, 21)

In [259]:
Train.isnull().sum()

Tour_ID                     0
country                     0
age_group                   0
travel_with              1075
total_female                2
total_male                  6
purpose                     0
main_activity               0
info_source                 0
tour_arrangement            0
package_transport_int       0
package_accomodation        0
package_food                0
package_transport_tz        0
package_sightseeing         0
package_guided_tour         0
package_insurance           0
night_mainland              0
night_zanzibar              0
first_trip_tz               0
cost_category               0
dtype: int64

In [260]:
#List the columns for different datatypes:
print('List of all Columns: ')
print(Train.columns)
print('\n')
print('Integer Type: ')
Col_int = Train.select_dtypes(np.int64).columns
print(Col_int)
print('\n')
print('Float Type: ')
Col_float = Train.select_dtypes(np.float64).columns
print(Col_float)
print('\n')
print('Object Type: ')
Col_cat = Train.select_dtypes(object).columns
print(Col_cat)
print('\n')
print('Count:')
print(Train.dtypes.value_counts())

List of all Columns: 
Index(['Tour_ID', 'country', 'age_group', 'travel_with', 'total_female',
       'total_male', 'purpose', 'main_activity', 'info_source',
       'tour_arrangement', 'package_transport_int', 'package_accomodation',
       'package_food', 'package_transport_tz', 'package_sightseeing',
       'package_guided_tour', 'package_insurance', 'night_mainland',
       'night_zanzibar', 'first_trip_tz', 'cost_category'],
      dtype='object')


Integer Type: 
Index(['night_mainland', 'night_zanzibar'], dtype='object')


Float Type: 
Index(['total_female', 'total_male'], dtype='object')


Object Type: 
Index(['Tour_ID', 'country', 'age_group', 'travel_with', 'purpose',
       'main_activity', 'info_source', 'tour_arrangement',
       'package_transport_int', 'package_accomodation', 'package_food',
       'package_transport_tz', 'package_sightseeing', 'package_guided_tour',
       'package_insurance', 'first_trip_tz', 'cost_category'],
      dtype='object')


Count:
object     1

In [261]:
# Founction
def Clean(cols):
    travel_with =cols[0]
    total_female =cols[1]
    total_male =cols[2]
    if pd.isnull(travel_with):
        
        if total_female+total_male == 1:
            return "Alone"
        elif total_female+total_male <= 5:
            return "With Spouse and Children"
        elif total_female+total_male == 2:
            return "With Spouse "
        else:
            return "With Other Friend/Relative"
    else:
        return travel_with


Train['travel_with']=Train[['travel_with','total_female','total_male']].apply(Clean,axis=1)
Test['travel_with']=Test[['travel_with','total_female','total_male']].apply(Clean,axis=1)
    

In [263]:
Train.isnull().sum()

Tour_ID                  0
country                  0
age_group                0
travel_with              0
total_female             2
total_male               6
purpose                  0
main_activity            0
info_source              0
tour_arrangement         0
package_transport_int    0
package_accomodation     0
package_food             0
package_transport_tz     0
package_sightseeing      0
package_guided_tour      0
package_insurance        0
night_mainland           0
night_zanzibar           0
first_trip_tz            0
cost_category            0
dtype: int64

In [264]:
Train['total'] = Train['total_female'] + Train['total_male']
Test['total'] = Test['total_female'] + Test['total_male']
 

In [265]:
grouped = Train.groupby("total")
total =grouped.get_group(0)
total.shape

(94, 22)

In [266]:
g = Test.groupby("total")
t =g.get_group(0)
t.shape

(37, 21)

In [267]:
Train.drop(Train[Train['total'] ==0].index,inplace = True)
Test.drop(Test[Test['total'] ==0].index,inplace = True)
Test.shape

(6132, 21)

In [268]:
Train.describe(include='object').transpose()

Unnamed: 0,count,unique,top,freq
Tour_ID,18412,18412,tour_id1hffseyw,1
country,18412,131,UNITED STATES OF AMERICA,2825
age_group,18412,5,25-44,9002
travel_with,18412,6,Alone,8595
purpose,18412,8,Leisure and Holidays,11690
main_activity,18412,10,Widlife Tourism,5944
info_source,18412,8,"Travel agent, tour operator",7970
tour_arrangement,18412,2,Package Tour,9251
package_transport_int,18412,2,No,12170
package_accomodation,18412,2,No,9490


In [269]:
Train['first_trip_tz'] = Train['first_trip_tz'].map({'No':0,'Yes':1})
Train['package_insurance'] = Train['package_insurance'].map({'No':0,'Yes':1})
Train['package_guided_tour'] = Train['package_guided_tour'].map({'No':0,'Yes':1})
Train['package_sightseeing'] = Train['package_sightseeing'].map({'No':0,'Yes':1})
Train['package_transport_tz'] = Train['package_transport_tz'].map({'No':0,'Yes':1})
Train['package_food'] = Train['package_food'].map({'No':0,'Yes':1})
Train['package_accomodation'] = Train['package_accomodation'].map({'No':0,'Yes':1})
Train['package_transport_int'] = Train['package_transport_int'].map({'No':0,'Yes':1})
Train['tour_arrangement'] = Train['tour_arrangement'].map({'Package Tour':0,'Independent':1})

Test['first_trip_tz'] = Test['first_trip_tz'].map({'No':0,'Yes':1})
Test['package_insurance'] = Test['package_insurance'].map({'No':0,'Yes':1})
Test['package_guided_tour'] = Test['package_guided_tour'].map({'No':0,'Yes':1})
Test['package_sightseeing'] = Test['package_sightseeing'].map({'No':0,'Yes':1})
Test['package_transport_tz'] = Test['package_transport_tz'].map({'No':0,'Yes':1})
Test['package_food'] = Test['package_food'].map({'No':0,'Yes':1})
Test['package_accomodation'] = Test['package_accomodation'].map({'No':0,'Yes':1})
Test['package_transport_int'] = Test['package_transport_int'].map({'No':0,'Yes':1})
Test['tour_arrangement'] = Test['tour_arrangement'].map({'Package Tour':0,'Independent':1})


In [271]:
XTrain =  Train.drop(['cost_category','Tour_ID','total'], axis=1)
yTrain =  Train['cost_category']
XTest  =  Test.drop(['Tour_ID','total'], axis=1)

In [278]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
#from lightgbm import LGBMClassifier

'''
    Select the best model for the given dataset
    :param X: features
    :param Y: labels
    :return: the name and the accuracy of the model for the given dataset
    '''
    

models = [RandomForestClassifier(), KNeighborsClassifier(), SVC(), LogisticRegression(),xgb.XGBClassifier(),GaussianNB(),
              SGDClassifier(), DecisionTreeClassifier(),MLPClassifier()]
scores = dict()

for model in models:
    model.fit(X_Train,yTrain)
    predictions = model.predict(X_Test)
    predictions

    print(f'model: {str(model)}')
    print(f'Accuracy_score: {accuracy_score(predictions)}')

ValueError: X has 161 features, but DecisionTreeClassifier is expecting 175 features as input.

In [None]:
# output = pd.DataFrame({'PassengerId':test.PassengerId,'Survived':predictions})
# print(output)

In [None]:
# output.to_csv('Submission.csv',index=False)
# print('Your Submission was successfully saved!')