In [None]:
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer accept the bare "seaborn" name, so use whichever
# name this installation actually ships.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
plt.rcParams['figure.figsize'] = [8, 4.5]
plt.rcParams['figure.dpi'] = 300
# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

#import graphing

# import mglearn

In [None]:
# Read both splits from the working directory, then show the size of the
# training frame as the cell output.
Train = pd.read_csv('Train.csv')
Test = pd.read_csv('Test.csv')
Train.shape

(18506, 21)

In [None]:
# Missing-value count per training column (isna is the same check as isnull).
Train.isna().sum()

Tour_ID                     0
country                     0
age_group                   0
travel_with              1075
total_female                2
total_male                  6
purpose                     0
main_activity               0
info_source                 0
tour_arrangement            0
package_transport_int       0
package_accomodation        0
package_food                0
package_transport_tz        0
package_sightseeing         0
package_guided_tour         0
package_insurance           0
night_mainland              0
night_zanzibar              0
first_trip_tz               0
cost_category               0
dtype: int64

In [None]:
# Report the training columns grouped by dtype. Each report is emitted with a
# single print call; sep='\n' puts every argument on its own line.
print('List of all Columns: ', Train.columns, '\n', sep='\n')

Col_int = Train.select_dtypes(np.int64).columns
print('Integer Type: ', Col_int, '\n', sep='\n')

Col_float = Train.select_dtypes(np.float64).columns
print('Float Type: ', Col_float, '\n', sep='\n')

Col_cat = Train.select_dtypes(object).columns
print('Object Type: ', Col_cat, '\n', sep='\n')

print('Count:', Train.dtypes.value_counts(), sep='\n')

List of all Columns: 
Index(['Tour_ID', 'country', 'age_group', 'travel_with', 'total_female',
       'total_male', 'purpose', 'main_activity', 'info_source',
       'tour_arrangement', 'package_transport_int', 'package_accomodation',
       'package_food', 'package_transport_tz', 'package_sightseeing',
       'package_guided_tour', 'package_insurance', 'night_mainland',
       'night_zanzibar', 'first_trip_tz', 'cost_category'],
      dtype='object')


Integer Type: 
Index(['night_mainland', 'night_zanzibar'], dtype='object')


Float Type: 
Index(['total_female', 'total_male'], dtype='object')


Object Type: 
Index(['Tour_ID', 'country', 'age_group', 'travel_with', 'purpose',
       'main_activity', 'info_source', 'tour_arrangement',
       'package_transport_int', 'package_accomodation', 'package_food',
       'package_transport_tz', 'package_sightseeing', 'package_guided_tour',
       'package_insurance', 'first_trip_tz', 'cost_category'],
      dtype='object')


Count:
object     1

In [None]:

def Clean(cols):
    """Fill a missing ``travel_with`` value from the size of the travelling party.

    Parameters
    ----------
    cols : sequence
        Its first three items are ``travel_with``, ``total_female`` and
        ``total_male``, in that order (for example one row handed over by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    The original ``travel_with`` when it is present. Otherwise a label chosen
    from the party size (``total_female + total_male``):

    * 1      -> "Alone"
    * 2      -> "With Spouse"
    * <= 5   -> "With Spouse and Children"
    * other  -> "With Other Friend/Relative"
    """
    # Unpack by position instead of cols[0]/cols[1]/cols[2]: integer keys on a
    # label-indexed Series rely on a deprecated positional fallback.
    travel_with, total_female, total_male = list(cols)[:3]

    if not pd.isnull(travel_with):
        return travel_with

    party_size = total_female + total_male
    if party_size == 1:
        return "Alone"
    # The pair case has to be tested before the "<= 5" range; in the opposite
    # order it can never be reached.
    if party_size == 2:
        # NOTE(review): the label used to carry a trailing space, which would
        # create a category of its own. Confirm the spellings of all labels
        # against the values already present in travel_with.
        return "With Spouse"
    if party_size <= 5:
        return "With Spouse and Children"
    # Larger parties land here, and so do rows where a head count is missing
    # (comparisons against NaN are all False).
    return "With Other Friend/Relative"


# Run Clean over every row of both splits so empty travel_with entries are
# filled from the female/male head counts.
Train['travel_with'] = Train[['travel_with', 'total_female', 'total_male']].apply(Clean, axis='columns')
Test['travel_with'] = Test[['travel_with', 'total_female', 'total_male']].apply(Clean, axis='columns')


    

In [None]:
# Party size = women + men. A missing count on either side leaves the sum
# missing as well (Series.add without fill_value behaves like "+").
Train['total'] = Train['total_female'].add(Train['total_male'])
Test['total'] = Test['total_female'].add(Test['total_male'])
 

In [None]:
# Pull out the training rows whose party size is exactly zero and show how
# many there are.
grouped = Train.groupby(by="total")
total = grouped.get_group(name=0)
total.shape

(94, 22)

In [None]:
# Same check on the test split: rows whose party size is exactly zero.
g = Test.groupby(by="total")
t = g.get_group(name=0)
t.shape

(37, 21)

In [None]:
# Remove the rows that report a party of zero travellers.
Train.drop(Train[Train['total'] == 0].index, inplace=True)
Test.drop(Test[Test['total'] == 0].index, inplace=True)

# Fill the remaining gaps in `total` with each split's most common value.
# The result is assigned back to the column: fillna(inplace=True) on a selected
# column is chained assignment, which pandas warns about (a FutureWarning this
# notebook silences) and which stops updating the frame under copy-on-write.
Train['total'] = Train['total'].fillna(Train['total'].mode()[0])
Test['total'] = Test['total'].fillna(Test['total'].mode()[0])

Test.shape

(6132, 21)

In [None]:
# The identifier and the two per-sex counts are no longer needed now that
# `total` exists, so remove them from both splits.
Train = Train.drop(columns=['Tour_ID', 'total_female', 'total_male'])
Test = Test.drop(columns=['Tour_ID', 'total_female', 'total_male'])


In [None]:
# Summary (count / unique / top / freq) of the text columns, one row per column.
Train.describe(include='object').T

Unnamed: 0,count,unique,top,freq
country,18412,131,UNITED STATES OF AMERICA,2825
age_group,18412,5,25-44,9002
travel_with,17339,5,Alone,7550
purpose,18412,8,Leisure and Holidays,11690
main_activity,18412,10,Widlife Tourism,5944
info_source,18412,8,"Travel agent, tour operator",7970
tour_arrangement,18412,2,Package Tour,9251
package_transport_int,18412,2,No,12170
package_accomodation,18412,2,No,9490
package_food,18412,2,No,10090


In [None]:
# Remove the test rows for a fixed list of countries in one pass.
# NOTE(review): presumably these countries never occur in Train — confirm.
Test.drop(
    Test[
        Test['country'].isin([
            'IRAQ',
            'CHAD',
            'HAITI',
            'GUINEA',
            'BOLIVIA',
            'MYANMAR',
            'KAZAKHSTAN',
            'BENIN',
            'EL SALVADOR',
            'BELARUS',
        ])
    ].index,
    inplace=True,
)
Test.country.shape

(6121,)

In [None]:
# Predictors: every training column except the target.
X = Train.drop(columns='cost_category')
# Target: the cost category to be predicted.
y = Train['cost_category']


In [None]:
from sklearn.impute import SimpleImputer
# For each text column, learn the most frequent value from X and use it to
# fill the gaps in X and in the same column of Test.
for col in X.select_dtypes(include='object').columns:
    cat_imputer = SimpleImputer(strategy='most_frequent').fit(X[[col]])
    X.loc[:, col] = cat_imputer.transform(X[[col]])
    Test.loc[:, col] = cat_imputer.transform(Test[[col]])

In [None]:
## Splitting the data into training and test sets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# X = StandardScaler().fit_transform(X)

# Hold out 30% of the rows for validation. The seed is fixed so the split, and
# every score computed from it, is the same on each Restart & Run All.
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.3, random_state=42)


In [None]:

from sklearn.metrics import accuracy_score
from sklearn.model_selection import  GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb




# Gradient-boosted baseline: fit on the training split, predict the held-out
# validation split and report the share of correct predictions.
# NOTE(review): enable_categorical expects pandas "category" dtype columns and
# recent xgboost releases expect integer-encoded class labels — confirm that
# X_train / y_train are prepared accordingly before this cell runs.
models = xgb.XGBClassifier(
    tree_method="gpu_hist", enable_categorical=True, use_label_encoder=False
)

models.fit(X_train, y_train)
ypred = models.predict(X_cv)

# The reported figure is labelled as an accuracy, so compute it with
# accuracy_score (imported at the top of this cell) rather than a squared error.
print(f'Accuracy_score: {accuracy_score(y_cv, ypred)}')

model: RandomForestClassifier()
Accuracy_score: 0.556118754525706
model: KNeighborsClassifier()
Accuracy_score: 0.4885952208544533
model: SVC()
Accuracy_score: 0.5499637943519189



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



model: LogisticRegression()
Accuracy_score: 0.5555756698044895
model: GaussianNB()
Accuracy_score: 0.10300506879073136
model: SGDClassifier()
Accuracy_score: 0.5
model: DecisionTreeClassifier()
Accuracy_score: 0.46469949312092684
model: MLPClassifier()
Accuracy_score: 0.5048877624909486



Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.



In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import accuracy_score

# X = StandardScaler().fit_transform(X)

# rf = RandomForestClassifier()
# rf.fit(X_train, y_train)
# y_pred = rf.predict(X_cv)
# print(f'Accuracy_score: {accuracy_score(y_cv,y_pred)}')

In [None]:
# Candidate values for the random-forest grid search.
n_estimators = list(range(10, 101, 10))  # Number of trees: 10, 20, ..., 100
# 'auto' was only an alias of 'sqrt' for classifiers (a duplicated half of the
# grid) and recent scikit-learn releases reject it, so search 'log2' instead.
max_features = ['sqrt', 'log2']  # Number of features to consider at every split
max_depth = [2, 4]  # Maximum number of levels in a tree
min_sample_split = [2, 5]  # Minimum number of samples required to split a node
min_sample_leaf = [1, 2]  # Minimum number of samples required at each leaf node
bootstrap = [True, False]  # Whether each tree is trained on a bootstrap sample

In [None]:
# Search space handed to GridSearchCV: estimator keyword -> candidate values.
Param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_sample_split,
    min_samples_leaf=min_sample_leaf,
    bootstrap=bootstrap,
)

In [None]:
# Imported here because no other live cell in this notebook brings the class
# into scope; without it this cell fails on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier

# Base estimator whose hyperparameters the grid search will tune.
Grid_rf = RandomForestClassifier()

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over Param_grid with 3-fold cross-validation, running four
# jobs in parallel and logging each fit.
Grid = GridSearchCV(Grid_rf, Param_grid, cv=3, verbose=2, n_jobs=4)

In [None]:
# Run the search on the training split only; the validation split stays unseen.
Grid.fit(X_train, y=y_train)

Fitting 3 folds for each of 320 candidates, totalling 960 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=4,
             param_grid={'bootstrap': [True, False], 'max_depth': [2, 4],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90,
                                          100]},
             verbose=2)

In [None]:
# Parameter combination the fitted search selected as best.
Grid.best_params_

{'bootstrap': True,
 'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': 20}

In [None]:
# Score the fitted search on the data it was fitted on and on the held-out
# split, each rounded to three decimals.
print('Train Accuracy -: {:.3f}'.format(Grid.score(X_train, y_train)))
print('CV Accuracy -: {:.3f}'.format(Grid.score(X_cv, y_cv)))

Train Accuracy -: 0.488
CV Accuracy -: 0.485
