# Libraries and Data Information

In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings("ignore")

### Problem Description

* The Spaceship Titanic competition asks participants to predict whether a passenger was successfully transported to an alternate dimension. To make these predictions, we are provided with training and testing datasets, to which we will apply data exploration and preprocessing techniques in order to reach our end goal.

### Description of Features

**PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.

**HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.

**CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.

**Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.

**Destination** - The planet the passenger will be debarking to.

**Age** - The age of the passenger.

**VIP** - Whether the passenger has paid for special VIP service during the voyage.

**RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.

**Name** - The first and last names of the passenger.

**Transported** - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

# Read and Check Data

In [None]:
train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
submission = pd.read_csv("/kaggle/input/spaceship-titanic/sample_submission.csv")

In [None]:
print('Train set shape:', train.shape)
print('Test set shape:', test.shape)

In [None]:
train.head(3)

In [None]:
test.head(3)

In [None]:
# information about data frame
train.info()

In [None]:
# describe basic statistics of data
train.describe().T

In [None]:
# let's check our missing values
train.isnull().sum()

In [None]:
test.isnull().sum()

# Dealing with the outliers

In [None]:
# train - let's look at outliers for our numeric columns 

fig, axes = plt.subplots(2, 3, sharey=True, figsize=(15, 7))
train.plot.scatter(x='RoomService', y='Age', ax=axes[0, 0])
train.plot.scatter(x='FoodCourt', y='Age', ax=axes[0, 1])
train.plot.scatter(x='ShoppingMall', y='Age', ax=axes[0, 2])
train.plot.scatter(x='Spa', y='Age', ax=axes[1, 0])
train.plot.scatter(x='VRDeck', y='Age', ax=axes[1, 1])
plt.show()

In [None]:
# test - let's look at outliers for our numeric columns 

fig, axes = plt.subplots(2, 3, sharey=True, figsize=(15, 7))
test.plot.scatter(x='RoomService', y='Age', ax=axes[0, 0])
test.plot.scatter(x='FoodCourt', y='Age', ax=axes[0, 1])
test.plot.scatter(x='ShoppingMall', y='Age', ax=axes[0, 2])
test.plot.scatter(x='Spa', y='Age', ax=axes[1, 0])
test.plot.scatter(x='VRDeck', y='Age', ax=axes[1, 1])
plt.show()

In [None]:
# we can back off the endpoints to a certain upper limit 

def outliers(df, caps=None):
    """Cap extreme spending values at an upper limit, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named in ``caps``.
    caps : dict, optional
        Mapping of column name -> upper limit. When omitted, the notebook's
        default limits for the five spending columns are used.

    Returns
    -------
    None
        Values greater than a column's limit are replaced by that limit.
    """
    if caps is None:
        caps = {
            'RoomService': 9000,
            'FoodCourt': 22000,
            'ShoppingMall': 11000,
            'Spa': 17000,
            'VRDeck': 21000,
        }

    for column, upper in caps.items():
        # NaN never compares greater than the limit, so missing entries stay missing.
        df.loc[df[column].gt(upper), column] = upper

In [None]:
outliers(train)
outliers(test)

# Missing Values

In [None]:
def missing_values_table(dataframe):
    """Print a table and draw a bar chart of the missing values per column.

    Only columns with at least one missing value are reported, ordered from
    most to least missing. The table shows the count (``n_miss``) and the
    percentage of rows affected (``ratio``). Nothing is returned.
    """
    n_miss = dataframe.isnull().sum()
    n_miss = n_miss[n_miss > 0].sort_values(ascending=False)

    if n_miss.empty:
        # Nothing to tabulate, and an empty bar chart would carry no information.
        print("No missing values.")
        return

    ratio = n_miss / dataframe.shape[0] * 100
    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df, end="\n")

    # Labels and heights both come from the sorted Series, so every bar is
    # paired with its own column name.
    sns.barplot(x=n_miss.index.tolist(), y=n_miss.values)
    plt.xticks(rotation=45)
    plt.title("Bar Chart of Missing Values")
    plt.show()


In [None]:
missing_values_table(train)

In [None]:
missing_values_table(test)

In [None]:
# The process of manually filling in missing data

# train.HomePlanet = train.HomePlanet.fillna(train.HomePlanet.mode().values[0])
# test.HomePlanet = test.HomePlanet.fillna(train.HomePlanet.mode().values[0])

# train.VIP = train.VIP.fillna(train.VIP.mode().values[0])
# test.VIP = test.VIP.fillna(train.VIP.mode().values[0])

# train.CryoSleep = train.CryoSleep.fillna(train.CryoSleep.mode().values[0])
# test.CryoSleep = test.CryoSleep.fillna(train.CryoSleep.mode().values[0])

# train.Destination = train.Destination.fillna(train.Destination.mode().values[0])
# test.Destination = test.Destination.fillna(train.Destination.mode().values[0])

# train.Age = train.Age.fillna(train.Age.median())
# test.Age = test.Age.fillna(test.Age.median())

# # The median values of the following features are zero
# train[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = train[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].fillna(0)
# test[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = test[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].fillna(0)


In [None]:
# Fill missing data with SimpleImputer

num_cols = ['ShoppingMall','FoodCourt','RoomService','Spa','VRDeck','Age']
cat_cols = ['CryoSleep','VIP','HomePlanet','Destination']

# Numeric columns are filled with the column mean, categorical ones with the mode.
num_imp = SimpleImputer(strategy='mean')
cat_imp = SimpleImputer(strategy='most_frequent')

# Learn the fill values on the training set only. The original index is passed
# explicitly so the column assignment cannot misalign rows.
train[num_cols] = pd.DataFrame(num_imp.fit_transform(train[num_cols]), columns=num_cols, index=train.index)
train[cat_cols] = pd.DataFrame(cat_imp.fit_transform(train[cat_cols]), columns=cat_cols, index=train.index)

# Reuse the statistics learned on train (transform, not fit_transform) so that
# train and test are imputed with the same values.
test[num_cols] = pd.DataFrame(num_imp.transform(test[num_cols]), columns=num_cols, index=test.index)
test[cat_cols] = pd.DataFrame(cat_imp.transform(test[cat_cols]), columns=cat_cols, index=test.index)

* **PassengerId, Cabin and Name** features have high cardinality. We will handle the missing values of these features in the feature engineering process 

In [None]:
missing_values_table(train)

In [None]:
missing_values_table(test)

In [None]:
# Let's observe our data in a table 

def get_unique_values(df):
    """Summarise every column of ``df`` in a small overview table.

    Each row of the result holds the column name, its number of unique
    values, the unique values themselves and the column dtype. The values
    are listed only for columns with at most 10 distinct values; columns
    with more get "-" in that cell.
    """
    rows = []

    for name in df.columns:
        column = df.loc[:, name]
        n_unique = column.nunique()
        # Listing the values is only readable for low-cardinality columns.
        shown = column.unique() if n_unique <= 10 else "-"
        rows.append([name, n_unique, shown, column.dtype])

    return pd.DataFrame(
        rows,
        columns=['Column Name', 'Number of Unique Values', ' Unique Values ', 'Data Type'],
    )

In [None]:
get_unique_values(train)

# Distribution of Features

In [None]:
target = 'Transported'
features = [col for col in train.columns if col != target]

In [None]:
text_features = ["Cabin", "Name"]
# Low-cardinality columns are treated as categorical.
cat_features = [col for col in features if train[col].nunique() < 25 and col not in text_features ]
# High cardinality alone does not make a column continuous: string identifiers
# such as PassengerId also have many unique values, so require a numeric dtype.
cont_features = [col for col in features
                 if train[col].nunique() >= 25
                 and col not in text_features
                 and pd.api.types.is_numeric_dtype(train[col])]

In [None]:
cat_features

In [None]:
cont_features

In [None]:
plt.figure(figsize=(8,6))
plt.pie(train.Transported.value_counts(), shadow=True, explode=[.1,.1], autopct='%.1f%%')
plt.title('Transported ', size=18)
plt.legend(['False', 'True'], loc='best', fontsize=12)
plt.show()

## Distribution of Categorical Features

In [None]:
fig = plt.figure(figsize=(10, 16))
# One row per categorical feature instead of a hard-coded row count.
n_rows = len(cat_features)
for i, var_name in enumerate(cat_features):
    ax = fig.add_subplot(n_rows, 1, i + 1)
    # The target axes are passed through countplot's `ax` parameter.
    sns.countplot(data=train, x=var_name, ax=ax, hue='Transported')
    ax.set_title(var_name, fontsize=15)
    # Annotate the bars of every hue level.
    for container in ax.containers:
        ax.bar_label(container)
fig.tight_layout()
plt.show()

* **In Destination** feature we can observe that most of the passengers are transported to Trappist-1e.
* **In VIP** feature we can observe that one category dominates the other almost completely. So it doesn't seem to be a useful feature, because it can lead to overfitting in our model.

## Distribution of Numerical Features

In [None]:
# Keep a handle on this figure: tight_layout must act on it, not on a figure
# left over from an earlier cell.
fig = plt.figure(figsize=(18, 4))
n_plots = len(cont_features)
for position, feature in enumerate(cont_features, start=1):
    # One column per continuous feature.
    plt.subplot(1, n_plots, position)
    sns.boxplot(data=train[feature])
    plt.title(feature)

fig.tight_layout()
plt.show()

* Except for age, the distribution is concentrated at zero for all other characteristics. Therefore, to examine the distribution of these features, we need to enter very low bins in the histogram plot 

In [None]:
sns.histplot(data=train, x="Age", hue = "Transported", kde = True)
plt.show()

* Most of the passengers were between age 18-32.
* We can create a new feature, Age-Category, from age by splitting the ages into different categories.

In [None]:
# we plot our graph with a low bins value

fig, axs = plt.subplots(1, 5, figsize=(20, 5)) 

sns.histplot(data=train, x="RoomService", bins=10, kde=True, ax=axs[0]) 
sns.histplot(data=train, x="FoodCourt", bins=10, kde=True, ax=axs[1]) 
sns.histplot(data=train, x="ShoppingMall", bins=10, kde=True, ax=axs[2])
sns.histplot(data=train, x="Spa", bins=10, kde=True, ax=axs[3]) 
sns.histplot(data=train, x="VRDeck", bins=10, kde=True, ax=axs[4])

plt.show()

* Above, we have set the outliers to certain upper limits, but since most expenses are zero, values with higher expenses are again a type of outlier in our data.
* All the expenditure features distribution is Right-Skewed.
* Since all expenditure features have a right-skewed distribution, we will apply a log transformation before model building to reduce the skew and bring them closer to a normal distribution.

In [None]:
# Let's look at the correlation relationship of our numeric columns
# The frame still contains string columns, which corr() rejects on pandas >= 2.0,
# so the numeric and boolean columns are selected explicitly.
train.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Correlate only numeric and boolean columns; corr() rejects string columns
# on pandas >= 2.0.
corr = train.select_dtypes(include=["number", "bool"]).corr()
# Hide the upper triangle (diagonal included): the matrix is symmetric, so it
# only repeats the lower triangle.
mask = np.triu(np.ones_like(corr, dtype=bool))
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(8, 8))
    ax = sns.heatmap(corr,mask=mask,square=True,linewidths=.8, cmap='YlGnBu', annot=True)

# Creating New Features

### PassengerId

In [None]:
train.PassengerId[:10]

* In this feature, the first 4 digits indicate the group the passenger is traveling with and the last two digits are the passenger's number within that group.
* We can therefore create a new feature that will show the total number of members in each group.
* We can also create a new property that shows whether the traveler is traveling alone or with a group

In [None]:
def passengerid_new_features(df):
    """Derive group features from ``PassengerId`` (``gggg_pp``), in place.

    Adds four columns to ``df``:

    * ``Group`` - the part before the underscore.
    * ``Member`` - the part after the underscore.
    * ``Travelling_Solo`` - True when the passenger's group has a single member.
    * ``Group_Size`` - number of members in the passenger's group.

    Group sizes are counted within ``df`` only. Nothing is returned.
    """
    # expand=True returns one column per part; unpacking the `.str` accessor
    # and passing `n` positionally no longer work on pandas >= 2.0.
    id_parts = df["PassengerId"].str.split("_", n=1, expand=True)
    df["Group"] = id_parts[0]
    df["Member"] = id_parts[1]

    # Number of members recorded for each group.
    group_counts = df.groupby("Group")["Member"].count()

    # Groups that contain more than one member.
    groups_with_multiple_members = set(group_counts[group_counts > 1].index)

    # A passenger travels solo when their group is not one of those.
    df["Travelling_Solo"] = ~df["Group"].isin(groups_with_multiple_members)

    # Size of the group each passenger belongs to.
    df["Group_Size"] = df["Group"].map(group_counts)

In [None]:
passengerid_new_features(train)
passengerid_new_features(test)

In [None]:
# We don't require Group and Member features any more so we will drop those feature from both datasets

train.drop(columns=["Group","Member"],inplace=True)
test.drop(columns=["Group","Member"],inplace=True)

In [None]:
# train data
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
ax = sns.countplot(data=train, x='Group_Size', hue='Transported')
ax.bar_label(ax.containers[0])
ax.bar_label(ax.containers[1])
plt.title('Group Size')

plt.subplot(1,2,2)
ax = sns.countplot(data=train, x='Travelling_Solo', hue='Transported')
ax.bar_label(ax.containers[0])
ax.bar_label(ax.containers[1])
plt.title('Passenger travelling solo or not')
plt.tight_layout()
plt.show()

In [None]:
# test data
plt.figure(figsize=(10,4))
plt.subplot(1,2,1)
ax = sns.countplot(data=test, x='Group_Size')
ax.bar_label(ax.containers[0])
plt.title('Group Size')

plt.subplot(1,2,2)
ax = sns.countplot(data=test, x='Travelling_Solo')
ax.bar_label(ax.containers[0])
plt.title('Passenger travelling solo or not')
plt.tight_layout()
plt.show()

### Cabin

In [None]:
train.Cabin.head().to_frame()

* We know that the cabin property has the form deck/num/side. So we can separate these 3 values and create three new properties: Cabin_Deck, Cabin_Number and Cabin_Side.
* We also know that the Cabin feature has NaN values, so to avoid errors while splitting we have to handle them in such a way that the NaN values carry over into all three new features.

In [None]:
def cabin_new_feature(df):
    """Split ``Cabin`` (``deck/num/side``) into three imputed columns, in place.

    Adds ``Cabin_Deck``, ``Cabin_Number`` and ``Cabin_Side`` to ``df``.
    Missing decks and sides are filled with the most frequent value, and
    missing cabin numbers with the median. ``Cabin_Number`` is stored as a
    numeric column. The ``Cabin`` column itself is left untouched. Nothing
    is returned.
    """
    # expand=True yields one column per part and propagates a missing Cabin
    # to all three parts, so no placeholder string is needed.
    cabin_parts = df["Cabin"].str.split("/", n=2, expand=True)

    deck = cabin_parts[0]
    # The number arrives as text; convert it so that a median can be computed.
    number = pd.to_numeric(cabin_parts[1])
    side = cabin_parts[2]

    # Assign the filled Series back rather than calling fillna(inplace=True)
    # on a column selection, which may only modify a temporary copy.
    df["Cabin_Deck"] = deck.fillna(deck.mode()[0])
    df["Cabin_Number"] = number.fillna(number.median())
    df["Cabin_Side"] = side.fillna(side.mode()[0])

In [None]:
cabin_new_feature(train)
cabin_new_feature(test)

In [None]:
plt.figure(figsize=(15,4))
plt.subplot(1,2,1)
ax = sns.countplot(x="Cabin_Deck",hue="Transported", data=train, order=['A','B','C','D','E','F','G','T'])
ax.bar_label(ax.containers[0])
ax.bar_label(ax.containers[1])
plt.title("Cabin_Deck Distribution")

plt.subplot(1,2,2)
ax = sns.countplot(x="Cabin_Side", hue="Transported", data=train)
ax.bar_label(ax.containers[0])
ax.bar_label(ax.containers[1])
plt.title("Cabin_Side Distribution")
plt.tight_layout()
plt.show()

In [None]:
# We set the Cabin_Number column, which is object, to integer
train["Cabin_Number"]=train["Cabin_Number"].astype(int)
test["Cabin_Number"]=test["Cabin_Number"].astype(int)

In [None]:
train["Cabin_Number"].describe()

In [None]:
test["Cabin_Number"].describe()

In [None]:
# Two panels only, so use a 2-row grid (a 3-row grid left the bottom third empty).
fig = plt.figure(figsize=(10, 8))
plt.subplot(2, 1, 1)
sns.countplot(data=train, x='Cabin_Deck', hue='Transported', order=['A','B','C','D','E','F','G','T'])
plt.title('Cabin Deck')

plt.subplot(2, 1, 2)
sns.histplot(data=train, x='Cabin_Number', hue='Transported',binwidth=20)
# Mark every 300 cabin numbers with a vertical line, drawn in a single call.
plt.vlines([300, 600, 900, 1200, 1500, 1800], ymin=0, ymax=200, color='black')
plt.title('Cabin Number')
plt.xlim([0,2000])
fig.tight_layout()
plt.show()

* **Cabin_Number** does not express numerical magnitude. If we consider the **Cabin_Deck** column, Cabin_Number actually shows the distribution of regions. Based on the min and max points in the column, we split the cabin numbers into 6 regions of 300 numbers each (the last region is open-ended). The cabin deck 'T' must be a very specific zone because there are only 5 people in total. Therefore we did not include it as a zone

In [None]:
def cabin_regions(df):
    """Add six boolean ``Cabin_Region<k>`` columns to ``df`` in place.

    Region 1 covers cabin numbers below 300, regions 2-5 cover the
    half-open ranges [300, 600), [600, 900), [900, 1200) and [1200, 1500),
    and region 6 covers 1500 and above. Nothing is returned.
    """
    number = df["Cabin_Number"]
    edges = (300, 600, 900, 1200, 1500)

    # First region: everything below the lowest edge.
    df["Cabin_Region1"] = number.lt(edges[0])

    # Middle regions: lower edge inclusive, upper edge exclusive.
    for position in range(1, len(edges)):
        lower, upper = edges[position - 1], edges[position]
        df[f"Cabin_Region{position + 1}"] = number.ge(lower) & number.lt(upper)

    # Last region: open-ended above the highest edge.
    df["Cabin_Region6"] = number.ge(edges[-1])

In [None]:
cabin_regions(train)
cabin_regions(test)

In [None]:
train.drop(columns=["Cabin_Number"],inplace=True)
test.drop(columns=["Cabin_Number"],inplace=True)

In [None]:
cols = ["Cabin_Region1","Cabin_Region2","Cabin_Region3","Cabin_Region4","Cabin_Region5","Cabin_Region6"]

plt.figure(figsize=(15,15))
for idx,value in enumerate(cols):
    plt.subplot(4,2,idx+1)
    ax = sns.countplot(x=value, hue="Transported", data=train)
    plt.title(f"{value} Distribution")
    ax.bar_label(ax.containers[0])
    ax.bar_label(ax.containers[1])
    plt.tight_layout()
plt.show()

* We can observe that passengers from Cabin_Region1 are Highly Transported when compared with other cabin regions.
* We can also observe that as the cabin region number is increasing passengers transport is decreasing.

### RoomService, FoodCourt, ShoppingMall, Spa, VRDeck

* We can create a Total Expenditure Feature by combining all the expenditures.
* We can create a No Spending boolean feature from Total Expenditure feature indicating True for those passengers who have spent 0 expense.

In [None]:
exp_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

def new_exp_features(df):
    """Add ``Total_Expenditure`` and ``No_Spending`` columns to ``df`` in place.

    ``Total_Expenditure`` is the row-wise sum over the columns in
    ``exp_cols``; ``No_Spending`` is True where that sum equals zero.
    Nothing is returned.
    """
    total = df.loc[:, exp_cols].sum(axis="columns")
    df["Total_Expenditure"] = total
    df["No_Spending"] = total.eq(0)

In [None]:
new_exp_features(train)
new_exp_features(test)

In [None]:
plt.figure(figsize=(10,6))
sns.histplot(x="Total_Expenditure", hue="Transported", data=train, kde=True,bins=100)
plt.title("Total Expenditure Distribution")
plt.show()

In [None]:
train.Total_Expenditure.describe()

In [None]:
test.Total_Expenditure.describe()

In [None]:
plt.figure(figsize=(12,5))
ax = sns.countplot(x="No_Spending",hue="Transported", data=train)
ax.bar_label(ax.containers[0])
ax.bar_label(ax.containers[1])
plt.title("No_Spending Distribution")
plt.show()

* In No Spending feature we can observe that passenger having No Spending are highly transported.

* **'RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Total Expenditure'**: All of these features had a right-skewed distribution
* So we can simply say there is a presence of large amount of outliers in these features.
* So we can transform these features with a log transformation to reduce the skew and bring them closer to a normal distribution.

In [None]:
cols = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Total_Expenditure']

# log1p(x) computes log(1 + x), so zero spending stays at zero; it is applied
# to all columns at once instead of looping over them.
train[cols] = np.log1p(train[cols])
test[cols] = np.log1p(test[cols])

In [None]:
# Visualizing these features after Transformation

plt.figure(figsize=(15,20))
for x, i in enumerate(cols, start=1):
    plt.subplot(6,2,x)
    # distplot is deprecated in seaborn; histplot with a KDE overlay on a
    # density scale is its replacement.
    sns.histplot(train[i], kde=True, stat="density")
    plt.title(f"{i} Distribution")
# Lay out the grid once, after all panels exist.
plt.tight_layout()
plt.show()

In [None]:
# Dropping Categorical Features with High Cardinality

train.drop(columns=["PassengerId","Cabin","Name"],inplace=True)
test.drop(columns=["PassengerId","Cabin","Name"],inplace=True)

In [None]:
get_unique_values(train)

## Feature Encoding

In [None]:
train.info()

In [None]:
# One Hot Encoding

categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination',"VIP","Travelling_Solo","Cabin_Side","Cabin_Deck","Cabin_Region1",
                       "Cabin_Region2","Cabin_Region3","Cabin_Region4","Cabin_Region5","Cabin_Region6","No_Spending"]


train = pd.get_dummies(train, columns=categorical_columns)
test = pd.get_dummies(test, columns=categorical_columns)

# Each frame is encoded on its own, so a category present in only one of them
# would give train and test different columns. Align test to train's feature
# columns: missing indicators are added as 0 and extra ones are dropped.
feature_columns = [col for col in train.columns if col != "Transported"]
test = test.reindex(columns=feature_columns, fill_value=0)

In [None]:
# let's also transform our target variable

# Assign the converted column back: an in-place replace on a column selection
# may only modify a temporary copy. False becomes 0 and True becomes 1.
train["Transported"] = train["Transported"].astype(int)

In [None]:
get_unique_values(train)

In [None]:
train.head(3)

In [None]:
test.head(3)

In [None]:
# Let's see the correlation relationship in the last case

train.drop('Transported', axis=1).corrwith(train.Transported).plot(kind='bar', grid=True, figsize=(12, 10), title="Correlation with target");

# Modelling

In [None]:
X = train.drop(columns=["Transported"])
# Keep the target one-dimensional (a Series rather than a one-column frame),
# which is the shape the estimators' fit() methods expect for y.
y = train["Transported"]


#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(X)
#test_scaled = scaler.transform(test)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

print(X_train.shape, y_train.shape)

In [None]:
print(X_test.shape,y_test.shape)

In [None]:
# CatBoost
cat=CatBoostClassifier(iterations=1500,
                       eval_metric='Accuracy',
                       verbose=0)

In [None]:
cat.fit(X_train,y_train)

pred = cat.predict(X_train)
pred_y = cat.predict(X_test)

print(accuracy_score(y_train.values,pred))
print(accuracy_score(y_test.values,pred_y))

In [None]:
# observe the importance of our features for the target variable

train_pool = Pool(data=X_train, label=y_train)

feature_importance = cat.get_feature_importance(train_pool)

for feature_name, importance in zip(X_train.columns, feature_importance):
    print(f"{feature_name}: {importance}")

In [None]:
sorted_indices = feature_importance.argsort()[::-1]
sorted_features = X_train.columns[sorted_indices]
sorted_importance = feature_importance[sorted_indices]

plt.figure(figsize=(10, 10))
plt.barh(sorted_features, sorted_importance)
plt.xlabel('Importance')
plt.ylabel('Features')
plt.title('Feature Importance (Sorted)')
plt.show()

* The VIP and Travelling_Solo columns had very little impact. The distribution of these categorical columns was also unbalanced 
* Let's delete these columns and observe the results again

In [None]:
train.drop(columns=["VIP_False","VIP_True","Travelling_Solo_False","Travelling_Solo_True"],inplace=True)
test.drop(columns=["VIP_False","VIP_True","Travelling_Solo_False","Travelling_Solo_True"],inplace=True)

In [None]:
# Rebuild the feature matrix and target after dropping the low-impact columns.
X = train.drop(columns=["Transported"])
# Keep the target one-dimensional (a Series rather than a one-column frame),
# which is the shape the estimators' fit() methods expect for y.
y = train["Transported"]

# Same split settings as before so the scores stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

cat=CatBoostClassifier(iterations=1500,
                       eval_metric='Accuracy',
                       verbose=0)

cat.fit(X_train,y_train)

pred = cat.predict(X_train)
pred_y = cat.predict(X_test)

# Accuracy on the training split, then on the held-out split.
print(accuracy_score(y_train.values,pred))
print(accuracy_score(y_test.values,pred_y))

In [None]:
pred_test = cat.predict(test)

In [None]:
submission["Transported"] = pred_test
submission

In [None]:
# Convert the 0/1 predictions to booleans by assigning the column back; an
# in-place replace on a column selection may only modify a temporary copy.
submission["Transported"] = submission["Transported"].astype(bool)
submission

In [None]:
submission.to_csv('submission_base_model.csv', index=False)

# Ensemble Models

### LGBMClassifier

In [None]:
model1 = LGBMClassifier()

In [None]:
parameters1 = {"n_estimators":[100,300,500,600,650],
              "learning_rate":[0.01,0.02,0.03],
              "random_state":[0,42,48,50],
               "num_leaves":[16,17,18]}

In [None]:
grid_search1 = GridSearchCV(model1, parameters1, cv=5, n_jobs=-1)

In [None]:
grid_search1.fit(X_train,y_train.values.ravel())

In [None]:
grid_search1.best_score_

In [None]:
best_parameters1 = grid_search1.best_params_
best_parameters1

In [None]:
# Creating LGBM Model Using Best Parameters
model1 = LGBMClassifier(**best_parameters1)

In [None]:
model1.fit(X_train,y_train)

In [None]:
x_test_pred1 = model1.predict(X_test)

In [None]:
accuracy_score(y_test,x_test_pred1)

In [None]:
print("LGBMClassifier Model accuracy on Testing Data is:",accuracy_score(y_test,x_test_pred1)*100)

### CatBoostClassifier

In [None]:
model2 = CatBoostClassifier(verbose=False)

In [None]:
parameters2 = {"learning_rate":[0.1,0.3,0.5,0.6,0.7],
              "random_state":[0,42,48,50],
               "depth":[8,9,10],
               "iterations":[35,40,50]}

In [None]:
grid_search2 = GridSearchCV(model2, parameters2, cv=5, n_jobs=-1)

In [None]:
grid_search2.fit(X_train,y_train)

In [None]:
grid_search2.best_score_

In [None]:
best_parameters2 = grid_search2.best_params_
best_parameters2

In [None]:
# Creating Cat Boost Model Using Best Parameters
model2 = CatBoostClassifier(**best_parameters2,verbose=False)

In [None]:
model2.fit(X_train,y_train)

In [None]:
x_test_pred2 = model2.predict(X_test)

In [None]:
accuracy_score(y_test,x_test_pred2)

In [None]:
print("CatBoost Model accuracy on Testing Data is:",accuracy_score(y_test,x_test_pred2)*100)

### XGBClassifier

In [None]:
model3 = XGBClassifier()

In [None]:
parameters3 = {"n_estimators":[50,100,150],
             "random_state":[0,42,50],
             "learning_rate":[0.1,0.3,0.5,1.0]}

In [None]:
grid_search3 = GridSearchCV(model3, parameters3 , cv=5, n_jobs=-1)

In [None]:
grid_search3.fit(X_train,y_train)

In [None]:
grid_search3.best_score_

In [None]:
best_parameters3 = grid_search3.best_params_
best_parameters3

In [None]:
# Creating XGBoost Model Using Best Parameters
model3 = XGBClassifier(**best_parameters3)

In [None]:
model3.fit(X_train,y_train)

In [None]:
x_test_pred3 = model3.predict(X_test)

In [None]:
accuracy_score(y_test,x_test_pred3)

In [None]:
print("XGBClassifier Model accuracy on Testing Data is:",accuracy_score(y_test,x_test_pred3)*100)

### RandomForestClassifier

In [None]:
model4 = RandomForestClassifier()

In [None]:
parameters4 = {'n_estimators': [100,300,500,550],
               'min_samples_split':[7,8,9],
               'max_depth': [10,11,12], 
               'min_samples_leaf':[4,5,6]}

In [None]:
grid_search4 = GridSearchCV(model4, parameters4, cv=5, n_jobs=-1)

In [None]:
grid_search4.fit(X_train,y_train.values.ravel())

In [None]:
grid_search4.best_score_

In [None]:
best_parameters4 = grid_search4.best_params_
best_parameters4

In [None]:
# Creating Random Forest Model Using Best Parameters
model4 = RandomForestClassifier(**best_parameters4)

In [None]:
model4.fit(X_train,y_train)

In [None]:
x_test_pred4 = model4.predict(X_test)

In [None]:
accuracy_score(y_test,x_test_pred4)

In [None]:
print("RandomForest Model accuracy on Testing Data is:",accuracy_score(y_test,x_test_pred4)*100)

### Stacking Classifier Model

* Stacking Classifier is an ensemble model that aims to achieve better prediction performance by combining machine learning models. By combining the different strengths of different models, this method allows us to make more effective and stable predictions.

Intended Use:
- To achieve better performance by combining multiple base classifiers.
- To improve the overall performance of the model by using the advantages of different algorithms.
- To make more robust and reliable predictions by combining multiple classifiers instead of a single classifier.

In [None]:
stacking_model = StackingClassifier(estimators=[('LGBM', model1), 
                                                ('CAT Boost', model2),
                                                ("XGBoost", model3),
                                                ('RF', model4)])

In [None]:
stacking_model.fit(X_train, y_train)

In [None]:
x_train_pred5 = stacking_model.predict(X_train)

In [None]:
x_test_pred5 = stacking_model.predict(X_test)

In [None]:
print("Stacking Model accuracy on Training Data is:",accuracy_score(y_train,x_train_pred5)*100)

In [None]:
print("Stacking Model accuracy on Testing Data is:",accuracy_score(y_test,x_test_pred5)*100)

In [None]:
pred_test = stacking_model.predict(test)

In [None]:
submission["Transported"] = pred_test
submission

In [None]:
# Convert the 0/1 predictions to booleans by assigning the column back; an
# in-place replace on a column selection may only modify a temporary copy.
submission["Transported"] = submission["Transported"].astype(bool)
submission

In [None]:
submission.to_csv('submission_stacking_model.csv', index=False)