## Loading Essentials and Helper Functions 

In [1]:
# fix for windows memory leak with MKL
import os
import platform

if platform.system() == "Windows":
    os.environ["OMP_NUM_THREADS"] = "2"

In [2]:
# import libraries
import time
import random
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt  # this is used for the plot the graph

# Sklearn classes
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    GridSearchCV,
    KFold,
)
from sklearn import metrics
from sklearn.metrics import confusion_matrix, silhouette_score
import sklearn.metrics.cluster as smc
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    LabelEncoder,
    MinMaxScaler,
)
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn import tree
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_blobs

from helper import (
    draw_confusion_matrix,
    heatmap,
    make_meshgrid,
    plot_contours,
    draw_contour,
)

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Sets random seed for reproducibility
SEED = 42
random.seed(SEED)

In [3]:
# TODO
data = pd.read_csv("datasets/hotel_booking.csv")
data.head()
# # Phone number, email, name, and hotel are not factors of indication of whether a person cancelled or not
# data = data.drop(columns=["phone-number","email","name","hotel"],axis=1)
# data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,name,email,phone-number
0,City Hotel,1,157,May,1,3,2,0.0,0,BB,...,0,Non Refund,0,Transient,130.0,0,0,Taylor Juarez,Juarez.Taylor44@zoho.com,634-458-8010
1,Resort Hotel,0,167,September,2,8,2,0.0,0,BB,...,0,No Deposit,0,Contract,62.48,0,2,Yolanda Taylor,Taylor.Yolanda35@xfinity.com,571-733-2380
2,City Hotel,0,124,April,1,1,2,0.0,0,SC,...,0,No Deposit,0,Transient,99.0,0,1,Angie Dixon,Angie_Dixon@hotmail.com,818-661-8987
3,Resort Hotel,0,8,July,2,4,2,1.0,0,BB,...,0,No Deposit,0,Transient,169.0,1,2,Jennifer Higgins,Higgins.Jennifer@yandex.com,669-803-3888
4,City Hotel,0,43,July,0,2,2,0.0,0,HB,...,1,No Deposit,0,Transient-Party,43.0,0,0,Jeremy Wilcox,Jeremy_Wilcox@hotmail.com,100-100-0744


In [4]:
data = data.drop(columns=["phone-number","email","name"],axis=1)
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,City Hotel,1,157,May,1,3,2,0.0,0,BB,...,0,0,A,0,Non Refund,0,Transient,130.0,0,0
1,Resort Hotel,0,167,September,2,8,2,0.0,0,BB,...,0,0,D,0,No Deposit,0,Contract,62.48,0,2
2,City Hotel,0,124,April,1,1,2,0.0,0,SC,...,0,0,A,0,No Deposit,0,Transient,99.0,0,1
3,Resort Hotel,0,8,July,2,4,2,1.0,0,BB,...,0,0,A,0,No Deposit,0,Transient,169.0,1,2
4,City Hotel,0,43,July,0,2,2,0.0,0,HB,...,0,0,A,1,No Deposit,0,Transient-Party,43.0,0,0


I dropped phone-number, email, and name because they will not be useful when using the models. In addition, these features won't provide any important information whether a reservation will be canceled or not. And removing them will reduce noise and improve computational efficiency. 

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69591 entries, 0 to 69590
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           69591 non-null  object 
 1   is_canceled                     69591 non-null  int64  
 2   lead_time                       69591 non-null  int64  
 3   arrival_date_month              69591 non-null  object 
 4   stays_in_weekend_nights         69591 non-null  int64  
 5   stays_in_week_nights            69591 non-null  int64  
 6   adults                          69591 non-null  int64  
 7   children                        69588 non-null  float64
 8   babies                          69591 non-null  int64  
 9   meal                            69591 non-null  object 
 10  country                         69591 non-null  object 
 11  previous_cancellations          69591 non-null  int64  
 12  previous_bookings_not_canceled  

In [6]:
data.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_month                0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          3
babies                            0
meal                              0
country                           0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
dtype: int64

In [7]:
newdata = data.fillna(0)
newdata.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_month                0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
dtype: int64

By replace the rest of the children errors (3) with a 0 value, it won't make any difference when analyzing the data because 
the dataset is so huge, roughly 78k rows for childen.

In [9]:
newdata.describe(include="object")

Unnamed: 0,hotel,arrival_date_month,meal,country,reserved_room_type,deposit_type,customer_type
count,69591,69591,69591,69591,69591,69591,69591
unique,2,12,5,5,10,3,4
top,City Hotel,August,BB,PRT,A,No Deposit,Transient
freq,44561,8218,53846,38919,52137,57909,50903


In [10]:
newdata['total_guests'] = newdata['children'] + newdata['babies'] + newdata['adults']
newdata['total_night'] = newdata['stays_in_week_nights'] + newdata['stays_in_weekend_nights']
newdata['total_book'] = newdata['previous_bookings_not_canceled'] + newdata['previous_cancellations']
newdata = newdata.drop(columns=['adults','children','babies','stays_in_week_nights','stays_in_weekend_nights', 'previous_bookings_not_canceled','previous_cancellations'],axis=1)
newdata.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_month,meal,country,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,total_guests,total_night,total_book
0,City Hotel,1,157,May,BB,PRT,A,0,Non Refund,0,Transient,130.0,0,0,2.0,4,0
1,Resort Hotel,0,167,September,BB,GBR,D,0,No Deposit,0,Contract,62.48,0,2,2.0,10,0
2,City Hotel,0,124,April,SC,GBR,A,0,No Deposit,0,Transient,99.0,0,1,2.0,2,0
3,Resort Hotel,0,8,July,BB,PRT,A,0,No Deposit,0,Transient,169.0,1,2,3.0,6,0
4,City Hotel,0,43,July,HB,PRT,A,1,No Deposit,0,Transient-Party,43.0,0,0,2.0,2,0


I added a feature by augmenting the adults, children, and babies and added another feature by augmenting both week and weekend nights. Lastly, I added a last feature by augmenting the previous bookings not getting canceled and otherwise.

Then I dropped the original features so that the model doesn't learn "twice" and overpredict the target data. 

I did these so that the total number of guests, nights, and books might have an impact on the reservation cancellation. So by creating 3 of these augmented features, we can ensure that the model will give up accurate predictions with these useful information and won't just "repeat" data. 

In [11]:
feature_data = newdata['is_canceled']
target_data = newdata.drop(['is_canceled'], axis=1)

train_raw, test_raw, target, target_test = train_test_split(target_data, feature_data, 
                                                            test_size=0.2, 
                                                            stratify= feature_data, 
                                                            random_state=0)


Performed 20% on test data, while 80% implicitly on the training data. It ensures that the train and test data are balanced in terms if the reservation will be canceled or not. Train-test is important when evaluating the model's performance on unseen data and to avoid overfitting. By balancing the classes in the testing and training datasets, we can ensure that the model's performance has an equal representation of the reservations predictions which can help the model's predictive ability.

In [12]:
categorical_features = ['hotel','arrival_date_month', 'meal', 'country', 'reserved_room_type', 'deposit_type', 'customer_type']
numerical_features = ['lead_time', 'booking_changes','days_in_waiting_list','required_car_parking_spaces',
                      'total_of_special_requests',"adr" ,"total_guests","total_book","total_night"]

numerical_pipeline = Pipeline([
    ('scalar', MinMaxScaler()),
    ])

preprocess_data = ColumnTransformer([
    ('numerical', numerical_pipeline, numerical_features),
    ('categorical', OneHotEncoder(categories="auto"), categorical_features)
])

train = preprocess_data.fit_transform(train_raw)
test = preprocess_data.transform(test_raw)


For one-hot encoding, I applied all categorical features in the dataset because it is necessary to convert all of them into numerical representations that ML algorithms could understand. In addition, it creates binary features for each category; so that the machine model can understand the relationships between the different categories.

For rescaling real values features, I have applied Min-Max scaling to rescale the features between 0 and 1. I did this because it is important to bring the real values features to a similar scale and it will make for the model easier to understand. It also prevents features from having larger magnitudes that dominates other data in the dataset, thus reducing the learning process. It also ensures that all the real values features are normalized within the expected range, which can improve the ML performance. 



Now, let' try out a few models
Now that I have pre-processed my data, I'm ready to try out different models. 



I performed classification the 3 of the following models:
- Logistic Regression
- K-nearest neighbors
- Decision Tree

Due to the size of the dataset, I did not use SVM or MLP because it's too large to train.


In [135]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from joblib import parallel_backend


#KNN Model creation and fitting

# k=3
# kf = KFold(n_splits=k,random_state=None)
with parallel_backend('threading', n_jobs=2):
    knn = KNeighborsClassifier()
    
params_knn = {
    'n_neighbors' : [1,3,5,7],
    'metric' : ["euclidean", "manhattan"],
}

with parallel_backend('threading', n_jobs=2):
    knn_grid = RandomizedSearchCV(knn,params_knn,cv=3)
    knn_grid.fit(train,target)

#Best parameters and performance
print("Best Parameters: ", knn_grid.best_params_)
print("Best Score: " , knn_grid.best_score_)
# knn_y_pred = knn_grid.predict(train)
# knn_accuracy = metrics.accuracy_score(target,knn_y_pred)
# print("%-12s %f" % ("Accuracy: ", knn_accuracy))

# draw_confusion_matrix(target, knn_y_pred, ['Canceled', 'Not Canceled'])



Best Parameters:  {'n_neighbors': 7, 'metric': 'manhattan'}
Best Score:  0.8268968141428107


For KNNs, the best model that were chosen is manhattan for metric and k = 7 neighbors. I only optimized the metric and neighbors using GridSearchCV with cross validation of 3. 

In [181]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from joblib import parallel_backend


# Define the decision tree classifier
with parallel_backend('threading', n_jobs=3):
    dtc = DecisionTreeClassifier()


param_grid = {
'max_depth': [None, 4,3,2,1,5, 10, 15, 20, 30, 50, 60, 120],
'min_samples_split': [2,4,8,16,32,64],
'criterion': ["gini", "entropy", "log_loss"],
'splitter': ['best','random'],
}



with parallel_backend('threading', n_jobs=3):   
    dt_grid = RandomizedSearchCV(dtc, param_grid, cv=5,n_iter=20)
    dt_grid.fit(train, target)      

print("Best score:", dt_grid.best_score_)
print("Best parameters:", dt_grid.best_params_)


Best score: 0.8383388705122858
Best parameters: {'splitter': 'best', 'min_samples_split': 64, 'max_depth': 120, 'criterion': 'gini'}


For Decision Trees, the best parameters I choose are splitter (best), 64 for minimum sample split, 120 for max depth, and gini for criterion. These are also the parameters I optimized.

In [180]:
from sklearn.linear_model import LogisticRegression
# Logistic Regression
with parallel_backend('threading', n_jobs=3):
    log_reg = LogisticRegression()
    
params_log_reg = {
    "penalty": ["l1", 'l2'],
    "C": [0.001, 0.01, 1, 100],
    "solver": ["liblinear", "saga"],
}
with parallel_backend('threading', n_jobs=3):
    grid_log_reg = RandomizedSearchCV(log_reg, params_log_reg, cv=5)
    grid_log_reg.fit(train,target)

print("Best Parameters: ", grid_log_reg.best_estimator_)
print("Best Score: ", grid_log_reg.best_score_)
# log_reg_y_pred = grid_log_reg.predict(train)
# log_reg_accuracy = metrics.accuracy_score(target, log_reg_y_pred)
# print("%-12s %f" % ("Accuracy: ", log_reg_accuracy))
# draw_confusion_matrix(target, log_reg_y_pred, ['Canceled', 'Not Canceled'])



Best Parameters:  LogisticRegression(C=100, solver='liblinear')
Best Score:  0.799594075002083


For Logistic Regression Model, the best model that were chosen is 100 for C and liblinear. I only optimized C, penalty, and solver using randomSearchCV with cross validation of 5. 

In [None]:
# # from sklearn.neural_network import MLPClassifier

# # Fitting and creating the MLP Model
# mlp = MLPClassifier()
# params_mlp = {
#     'hidden_layer_sizes' : [(100,), (50,50),(25,25)],
#     'alpha' : [0.0001,0.001,0.01,0.1],
#     "activation" : ["relu","identity","tanh","logistic"],
#     "max_iter" : [1000]
#     }

# grid_mlp = GridSearchCV(mlp, param_grid=params_mlp, cv=3)
# grid_mlp.fit(train,target)
# print("Best Parameters: ", grid_mlp.best_estimator_)
# print("Best Scores: ", grid_mlp.best_score_)

# # pred_y_mlp = grid_mlp.predict(train)
# # accuracy_mlp = metrics.accuracy_score(target, pred_y_mlp)
# # print("%-12s %f" % ("Accuracy: ", accuracy_mlp))
# # draw_confusion_matrix(target, pred_y_mlp, ['Canceled', 'Not Canceled'])



In [None]:
# from sklearn.svm import SVC
# # SVC

# svm = SVC()
# params_svm = {
#     "C": [0.001, 0.01, 1],
#     "kernel": ["linear", "poly","rbf","sigmoid","precomputed"],
# }

# grid_svm = GridSearchCV(svm, param_grid=params_svm, cv=3)
# grid_svm.fit(train,target)

# print("Best Parameters: ", grid_svm.best_estimator_)
# svm_y_pred = grid_svm.predict(train)
# svm_accuracy = metrics.accuracy_score(target, svm_y_pred)
# print("%-12s %f" % ("Accuracy: ", svm_accuracy))
# draw_confusion_matrix(target, svm_y_pred, ['Canceled', 'Not Canceled'])

Training the model on the data using SVC and MLP takes forever to run because of the large size of the dataset

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from joblib import parallel_backend


# Define the randomForest classifier
with parallel_backend('threading', n_jobs=3):
    rfc = RandomForestClassifier()

# Best params: n_estimators = 190, max_depth = 25, and min_samples_split = 2
param_grid = {
# 'n_estimators': [100, 200, 500],
# 'max_depth': [None, 5, 10, 15, 20, 30, 40, 50],
# 'min_samples_split': [2, 5, 10, 20, 50, 100],
# 'min_samples_leaf': [1, 5, 10, 20, 50, 100],
# 'criterion': ['gini', 'entropy']
'n_estimators': [180],
'max_depth': [25],
'min_samples_split': [2],
}

# best cv = 120
with parallel_backend('threading', n_jobs=3):   
    rfc_randomSearch = RandomizedSearchCV(rfc, param_grid, cv=60,n_iter=1)
    rfc_randomSearch.fit(train, target)      

print("Best score:", rfc_randomSearch.best_score_)
print("Best parameters:", rfc_randomSearch.best_params_)


Best score: 0.8783775992262768
Best parameters: {'n_estimators': 180, 'min_samples_split': 2, 'max_depth': 25}


For this part, I used RFCs as the best model for this large dataset. I got an accuracy of 87% when put into kaggle, so not too bad :D. 

Firstly, I did not made any changes to the earlier steps. Then, I used this model called Random Forest Classifier, with parameters 180 for n_estimators, 25 for max_depth, and 2 for min_samples_split, alongside of using randomizedSearchCV. 

Random Forest Classifier is a model that fits a decision tree classifiers on various sub-samples of the dataset, then averaging to improve its predictive accuracy and control over-fitting. It is an ensemble learning technique that constructs multiple decision trees at training time and outputs class that is mode of the classes or mean prediction. 

Using Random Forest Classifier is beneficial to the large dataset of this project because RFC perform well for a wide range of data. It reduces the risk of overfitting while maintaining high capacity to model complex relationships in the data. Two, it can handle non-linear data effectively and efficiently. Thirdly, it provides important insights into the importance of each feature in making predictions, which can be invaluable for feature selection and understanding my model.

As for Random Search CV, it randomly selects combinations, which reduces the computational cost, unlike GridSearchCV which use a lot of computational power and exhaust every possible combinations. I also experimented with different number of cross-validations ranging from 5 to 120. If we set cv = 5, the computational power is shortened and you will get a best score of roughly 0.872. If we set cv = 60, computational power is used quite a bit and it takes 8-12 minutes to fully run and you will get best score of 0.875. Finally, if we set cv = 180, it takes roughly 30 -50 mins to run, but you get best score of roughly 0.8789.

As for the parallel processing using parallel_backend, I used n_jobs = 3 as to not kill my computer. The advantages of this is to distribute tasks across different processors and decrease training time, especially for large datasets. It also allows for better resource utilization, optimizing the performance without overwhelming a single processor or thread. If I set n_jobs = -1, it will brick my computer because it will use all of my processors 



In [14]:
test_data = pd.read_csv('datasets/hotel_booking_test.csv')
test_data.head()

Unnamed: 0,hotel,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,...,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,name,email,phone-number
0,City Hotel,107,June,0,2,2,0.0,0,BB,PRT,...,0,No Deposit,0,Transient-Party,130.0,0,1,Dustin Marshall,Dustin.Marshall@xfinity.com,833-801-0855
1,Resort Hotel,20,May,0,3,2,0.0,0,BB,PRT,...,0,No Deposit,0,Transient,91.67,0,0,Gregory Roberts,GRoberts17@verizon.com,881-819-0764
2,Resort Hotel,125,April,2,5,2,0.0,0,BB,GBR,...,0,No Deposit,0,Contract,42.95,0,1,Dustin Hardin,Dustin_Hardin@verizon.com,560-971-8576
3,Resort Hotel,0,August,1,1,2,0.0,0,BB,FRA,...,0,No Deposit,0,Transient,106.0,0,0,Kristy Stewart,Kristy.Stewart@mail.com,783-987-6285
4,City Hotel,124,August,0,1,2,0.0,0,BB,GBR,...,0,No Deposit,0,Transient,127.8,1,1,Deanna Leblanc,Deanna.Leblanc75@gmail.com,518-112-1761


In [15]:
test_data = test_data.drop(columns=["name","email","phone-number"],axis=1)
test_data.head()

Unnamed: 0,hotel,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
0,City Hotel,107,June,0,2,2,0.0,0,BB,PRT,0,0,A,0,No Deposit,0,Transient-Party,130.0,0,1
1,Resort Hotel,20,May,0,3,2,0.0,0,BB,PRT,0,0,A,0,No Deposit,0,Transient,91.67,0,0
2,Resort Hotel,125,April,2,5,2,0.0,0,BB,GBR,0,0,A,0,No Deposit,0,Contract,42.95,0,1
3,Resort Hotel,0,August,1,1,2,0.0,0,BB,FRA,0,0,A,0,No Deposit,0,Transient,106.0,0,0
4,City Hotel,124,August,0,1,2,0.0,0,BB,GBR,0,0,D,0,No Deposit,0,Transient,127.8,1,1


In [16]:
test_data = test_data.fillna(0)
test_data.isnull().sum()

hotel                             0
lead_time                         0
arrival_date_month                0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
dtype: int64

I did the same process and reasoning logic before by dropping out data that aren't useful to my model

I also did the same process for the missing values on children and replaced it with 0.

In [17]:
test_data['total_guests'] = test_data['children'] + test_data['adults'] + test_data['babies']
test_data['total_night'] = test_data['stays_in_week_nights'] + test_data['stays_in_weekend_nights']
test_data['total_book'] = test_data['previous_bookings_not_canceled'] + test_data['previous_cancellations']
new_test_data = test_data.drop(columns=['adults','children','babies','stays_in_week_nights','stays_in_weekend_nights', 'previous_bookings_not_canceled','previous_cancellations'],axis=1)

pred = rfc_randomSearch.predict(preprocess_data.transform(new_test_data))


I also did the same process by augmenting total guests, total night, and total book and dropping the original values so that the model can learn accurately and predict if the hotel reservation will be canceled or not.

Then I used my random forest classifier and the pre-processor from the pre-processing step to predict the is_canceled data

In [18]:
kaggle = pd.DataFrame({'index': range(0, len(new_test_data)), 'target': pred})
kaggle.to_csv('submission.csv', index = False)

Summary: 

1.) I deleted name, email, and phone numbers features because they do not contain useful information and won't provide much help in the prediction accuracy. 

2.) I checked for missing values and found that children has 3 instances of NA. I did the same process before by filling it with 0 because it won't affect the model's accuracy because it is too large. 

3.) I augmented the 3 features: total nights, total guests, and total book and dropping the original features so that the model can learn accurately and used the previous pre-processing code block. Then I used my RFC model to predict the is_canceled data.

4.) Explaining the RFC model and why I chose it: I first experimented with KNNs, Logistic Regression, and Decision trees without any parallel processing and randomSearchCV. First, KNNs' best score was roughly 82% and to that I say was not enough. So then I checked with Logistic Regression to see if it's better. I got roughly 80%, which is worse than KNNs. Then, I moved towards SVMs and MLP and these models take forever, even hours to run! Because the dataset is so large these models can't comprehend the amount of samples, so I just scratched those models out. So then, I chose decision tree and it outputted roughly 83% best score and I uploaded that into Kaggle and I got around that score. I then began to use parallel processing with n_jobs 3 (to not kill my computer) and randomSearchCV. To my surpise, these additions slightly improved my models scores. However, it wasn't enough as I wanted to increase more accuracy. Then I searched about random forest classifier and polyfeatures. RFCs are great for large datasets and reduce overfitting so it's great for my model. I used RFC's parameters of 190 for n_estimators, 25 for max_depth, and 2 for min_samples_split. Finally, I got a best RFC score of 0.878! and I outputted that to kaggle and got 0.88212 :D. 