In [168]:
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px

plt.style.use('seaborn')
plt.rcParams['figure.figsize'] = [8, 4.5]
plt.rcParams['figure.dpi'] = 300
warnings.simplefilter(action = 'ignore', category = FutureWarning)

#import graphing

# import mglearn

In [169]:
Test = pd.read_csv('Test.csv')
Train =pd.read_csv('Train.csv')
Train.shape

(18506, 21)

In [171]:
#List the columns for different datatypes:
print('List of all Columns: ')
print(Train.columns)
print('\n')
print('Integer Type: ')
Col_int = Train.select_dtypes(np.int64).columns
print(Col_int)
print('\n')
print('Float Type: ')
Col_float = Train.select_dtypes(np.float64).columns
print(Col_float)
print('\n')
print('Object Type: ')
Col_cat = Train.select_dtypes(object).columns
print(Col_cat)
print('\n')
print('Count:')
print(Train.dtypes.value_counts())

List of all Columns: 
Index(['Tour_ID', 'country', 'age_group', 'travel_with', 'total_female',
       'total_male', 'purpose', 'main_activity', 'info_source',
       'tour_arrangement', 'package_transport_int', 'package_accomodation',
       'package_food', 'package_transport_tz', 'package_sightseeing',
       'package_guided_tour', 'package_insurance', 'night_mainland',
       'night_zanzibar', 'first_trip_tz', 'cost_category'],
      dtype='object')


Integer Type: 
Index(['night_mainland', 'night_zanzibar'], dtype='object')


Float Type: 
Index(['total_female', 'total_male'], dtype='object')


Object Type: 
Index(['Tour_ID', 'country', 'age_group', 'travel_with', 'purpose',
       'main_activity', 'info_source', 'tour_arrangement',
       'package_transport_int', 'package_accomodation', 'package_food',
       'package_transport_tz', 'package_sightseeing', 'package_guided_tour',
       'package_insurance', 'first_trip_tz', 'cost_category'],
      dtype='object')


Count:
object     1

# Get summary statistics for categorical variables:


In [None]:
Train.describe().transpose()

In [None]:
# Let take a look at the Object Type

Train.describe(include='object').transpose()

# Note

---

The majority of the visitors under the age group of 25-44 came for business (18.5%), or leisure and holidays (53.2%), which is consistent with the fact that they are economically more productive. Those at the age group of 45-64 were more prominent in holiday making and visiting friends and relatives. The results further reveal that most visitors belonging to the age group of 18-24 came for leisure and holidays (55.3%) as well as volunteering (13.7%). The majority of senior citizens (65 and above) came for leisure and holidays (80.9%) and visiting friends and relatives (9.5%).

# Let Varify that

---


In [None]:
plot_ = sns.countplot(x=Train.age_group.dropna(), color='blue')

In [172]:

def Clean(cols):
    travel_with =cols[0]
    total_female =cols[1]
    total_male =cols[2]
    if pd.isnull(travel_with):
        
        if total_female+total_male == 1:
            return "Alone"
        elif total_female+total_male <= 5:
            return "With Spouse and Children"
        elif total_female+total_male == 2:
            return "With Spouse "
        else:
            return "With Other Friend/Relative"
    else:
        return travel_with


Train['travel_with']=Train[['travel_with','total_female','total_male']].apply(Clean,axis=1)
Test['travel_with']=Test[['travel_with','total_female','total_male']].apply(Clean,axis=1)


    

In [173]:
Train['total'] = Train['total_female'] + Train['total_male']
Test['total'] = Test['total_female'] + Test['total_male']
 

In [174]:
grouped = Train.groupby("total")
total =grouped.get_group(0)
total.shape

(94, 22)

In [175]:
g = Test.groupby("total")
t =g.get_group(0)
t.shape

(37, 21)

In [176]:
Train.drop(Train[Train['total'] ==0].index,inplace = True)
Test.drop(Test[Test['total'] ==0].index,inplace = True)

Train['total'].fillna(Train['total'].mode()[0],inplace=True)
Test['total'].fillna(Test['total'].mode()[0],inplace=True)

Test.shape

(6132, 21)

In [177]:
Train = Train.drop(['Tour_ID','total_female','total_male'], axis=1)
Test = Test.drop(['Tour_ID','total_female','total_male'], axis=1)


In [178]:
Train.describe(include='object').transpose()

Unnamed: 0,count,unique,top,freq
country,18412,131,UNITED STATES OF AMERICA,2825
age_group,18412,5,25-44,9002
travel_with,17339,5,Alone,7550
purpose,18412,8,Leisure and Holidays,11690
main_activity,18412,10,Widlife Tourism,5944
info_source,18412,8,"Travel agent, tour operator",7970
tour_arrangement,18412,2,Package Tour,9251
package_transport_int,18412,2,No,12170
package_accomodation,18412,2,No,9490
package_food,18412,2,No,10090


In [179]:
Test.drop(Test[Test['country'] =='IRAQ'].index,inplace = True)
Test.drop(Test[Test['country'] =='CHAD'].index,inplace = True)
Test.drop(Test[Test['country'] =='HAITI'].index,inplace = True)
Test.drop(Test[Test['country'] =='GUINEA'].index,inplace = True)
Test.drop(Test[Test['country'] =='BOLIVIA'].index,inplace = True)
Test.drop(Test[Test['country'] =='MYANMAR'].index,inplace = True)
Test.drop(Test[Test['country'] =='KAZAKHSTAN'].index,inplace = True)
Test.drop(Test[Test['country'] =='BENIN'].index,inplace = True)
Test.drop(Test[Test['country'] =='EL SALVADOR'].index,inplace = True)
Test.drop(Test[Test['country'] =='BELARUS'].index,inplace = True)
Test.country.shape

(6121,)

In [180]:
X = Train.drop('cost_category', axis=1)  #independent columns without the index column
y = Train.cost_category


In [181]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

LE = LabelEncoder()

y=y.astype('str')

y= LE.fit_transform(y)


In [182]:
from sklearn.impute import SimpleImputer
for col in X.select_dtypes('object'):
    cat_imputer = SimpleImputer(strategy='most_frequent')
    cat_imputer.fit(X[[col]])
    X.loc[:, col] = cat_imputer.transform(X[[col]])
    Test.loc[:, col] = cat_imputer.transform(Test[[col]])

In [183]:
CAT_FEATURES = ['country','age_group', 'travel_with', 'purpose',
       'main_activity', 'info_source', 'tour_arrangement',
       'package_transport_int', 'package_accomodation', 'package_food',
       'package_transport_tz', 'package_sightseeing', 'package_guided_tour',
       'package_insurance', 'first_trip_tz']
       
# Instantiate the One-Hot Encoder object:
one_hot_encoder = OneHotEncoder(sparse=False, 
                               handle_unknown='error', 
                               drop='first')  
# Create the column transformer using the one-hot encoder:
one_hot_transformer = ColumnTransformer(
    [("one_hot", one_hot_encoder, CAT_FEATURES)]
    #,remainder='passthrough'
)    
# Fit the transformer:
one_hot_transformer.fit(X)

# Apply the transformations to both training and test sets:
col_names = one_hot_transformer.get_feature_names()

X_train_cat = pd.DataFrame(one_hot_transformer.transform(X),
                          columns=col_names, 
                          index=X.index)

X = pd.concat([X, X_train_cat], axis=1) \
                .drop(CAT_FEATURES, axis=1)

X_test_cat = pd.DataFrame(one_hot_transformer.transform(Test), 
                         columns=col_names, 
                         index=Test.index)
                         
Test = pd.concat([Test, X_test_cat], axis=1) \
               .drop(CAT_FEATURES, axis=1)

In [184]:
## Splitting the data into training and test sets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(X)

X_train, X_cv, y_train, y_cv = train_test_split(X, y ,test_size = 0.15, random_state = 42,shuffle= True)


In [185]:
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import  KFold, GridSearchCV
from sklearn.metrics import precision_score,recall_score
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.neural_network import MLPClassifier
#from lightgbm import LGBMClassifier

'''
    Select the best model for the given dataset
    :param X: features
    :param Y: labels
    :return: the name and the accuracy of the model for the given dataset
    '''
    #,xgb.XGBClassifier()
models = [RandomForestClassifier(), KNeighborsClassifier(), SVC(), LogisticRegression(max_iter=1000),GaussianNB(),
              SGDClassifier(), DecisionTreeClassifier(),MLPClassifier()]
scores = dict()

for m in models:
    m.fit(X_train, y_train)
    ypred = m.predict(X_cv)

    print(f'model: {str(m)}')
    print(f'Accuracy_score: {accuracy_score(y_cv,ypred)}')

model: RandomForestClassifier()
Accuracy_score: 0.556118754525706
model: KNeighborsClassifier()
Accuracy_score: 0.4885952208544533
model: SVC()
Accuracy_score: 0.5499637943519189



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



model: LogisticRegression()
Accuracy_score: 0.5555756698044895
model: GaussianNB()
Accuracy_score: 0.10300506879073136
model: SGDClassifier()
Accuracy_score: 0.5
model: DecisionTreeClassifier()
Accuracy_score: 0.46469949312092684
model: MLPClassifier()
Accuracy_score: 0.5048877624909486



Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.



In [186]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import accuracy_score

# X = StandardScaler().fit_transform(X)

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)
# y_pred = rf.predict(X_cv)
# print(f'Accuracy_score: {accuracy_score(y_cv,y_pred)}')

In [187]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 600, num = 50)] # Number of estimators
max_features = ['auto' , 'sqrt'] # Number of feature to consider at every split
max_depth = [5,10,15,20,25] # Maximum number of level in tree
min_sample_split = [2 , 5] # Minimum number of samples required to split a node
min_sample_leaf = [1 , 2] # Minimum number of sample required at each leaf node
bootstrap = [True , False] # Method of selecting sample for training each tree

In [188]:
Param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_sample_split,
    'min_samples_leaf': min_sample_leaf,
    'bootstrap': bootstrap
}

In [189]:
Grid_rf = RandomForestClassifier(random_state=45)

In [190]:
from sklearn.model_selection import GridSearchCV

Grid = GridSearchCV(
    estimator= Grid_rf,
    param_grid =Param_grid,
    cv = 3,
    verbose= 2,
    n_jobs = -1
)

In [191]:
Grid.fit(X_train, y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100]},
             verbose=2)

In [192]:
Grid.best_params_

{'bootstrap': True,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 20}

In [193]:
print(f'Train Accuracy -: {Grid.score(X_train, y_train):.3f}')
print(f'CV Accuracy -: {Grid.score(X_cv, y_cv):.3f}')

Train Accuracy -: 0.488
CV Accuracy -: 0.485


In [None]:
# preds0 = preds_test[:, 0]
# preds1 = preds_test[:, 1]
# preds2 = preds_test[:, 2]
# preds3 = preds_test[:, 3]
# preds4 = preds_test[:, 4]
# preds5 = preds_test[:, 5]

In [None]:
# SampleSubmission['Normal Cost']  = preds0
# SampleSubmission['Higher Cost']  = preds1
# SampleSubmission['High Cost']    = preds2
# SampleSubmission['Lower Cost']   = preds3
# SampleSubmission['Low Cost']     = preds4
# SampleSubmission['Highest Cost']= preds5
# SampleSubmission.to_csv('sub.csv', index=False)

# SampleSubmission