# LightBGM Classification

## Importing the libraries

In [20]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [21]:
!git clone https://github.com/AdSh0102/CogoportPythonTitanic.git

fatal: destination path 'CogoportPythonTitanic' already exists and is not an empty directory.


In [22]:
dataset = pd.read_csv('/content/CogoportPythonTitanic/train.csv')
X = dataset.drop(columns=['Survived'])
y = dataset[['Survived']]

## Data Preprocessing (removing columns that might not be useful)

In [23]:
X.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [24]:
# Calculate the total number of rows in the DataFrame
total_rows = len(X)

# Calculate the number of null values in each column
null_counts = X.isnull().sum()

# Calculate the percentage of null values in each column
null_percentages = (null_counts / total_rows) * 100

# Display the percentage of null values in each column
print(null_percentages)

PassengerId     0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64


In [25]:
X = X.drop('Cabin', axis=1)
X = X.drop('Name', axis=1)
# drop as high %age of missing values

In [26]:
X.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'Embarked'],
      dtype='object')

## Splitting the dataset into the Training set and Test set

In [27]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

## More data preprocessing (handling the remaining null values)

In [28]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

columns_to_impute = ['Age']
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_test[columns_to_impute])

X_test[columns_to_impute] = imputer.transform(X_test[columns_to_impute])

In [29]:
# Step 1: Find the mode of the column
mode_value = X_test['Embarked'].mode()[0]

# Step 2: Replace the null values with the mode
X_test['Embarked'].fillna(mode_value, inplace=True)

In [30]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

columns_to_impute = ['Age']
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X_train[columns_to_impute])

X_train[columns_to_impute] = imputer.transform(X_train[columns_to_impute])

In [31]:
# Step 1: Find the mode of the column
mode_value = X_train['Embarked'].mode()[0]

# Step 2: Replace the null values with the mode
X_train['Embarked'].fillna(mode_value, inplace=True)

In [32]:
# Calculate the total number of rows in the DataFrame
total_rows = len(X_train)

# Calculate the number of null values in each column
null_counts = X_train.isnull().sum()

# Calculate the percentage of null values in each column
null_percentages = (null_counts / total_rows) * 100

# Display the percentage of null values in each column
print(null_percentages)

PassengerId    0.0
Pclass         0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Embarked       0.0
dtype: float64


In [33]:
# Calculate the total number of rows in the DataFrame
total_rows = len(X_test)

# Calculate the number of null values in each column
null_counts = X_test.isnull().sum()

# Calculate the percentage of null values in each column
null_percentages = (null_counts / total_rows) * 100

# Display the percentage of null values in each column
print(null_percentages)

PassengerId    0.0
Pclass         0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Embarked       0.0
dtype: float64


## Encoding the variables with categorical data

In [34]:
X_train.dtypes

PassengerId      int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked        object
dtype: object

In [35]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X_train['Sex'] = le.fit_transform(X_train['Sex'])
X_train['Embarked'] = le.fit_transform(X_train['Embarked'])
X_train['Ticket'] = le.fit_transform(X_train['Ticket'])
X_test['Sex'] = le.fit_transform(X_test['Sex'])
X_test['Embarked'] = le.fit_transform(X_test['Embarked'])
X_test['Ticket'] = le.fit_transform(X_test['Ticket'])

## Training the LightBGM Classification model and using Bagging Classifier on the Training set

In [36]:
from sklearn.ensemble import BaggingClassifier
from lightgbm import LGBMClassifier
lgbm_classifier = LGBMClassifier()
# classifier =  LGBMClassifier()
classifier = BaggingClassifier(estimator=lgbm_classifier)
classifier.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


## Making the Confusion Matrix

In [37]:
from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[115  24]
 [ 20  64]]


0.8026905829596412

In [38]:
y_train_pred = classifier.predict(X_train)

cm1 = confusion_matrix(y_train, y_train_pred)
print(cm1)
accuracy_score(y_train, y_train_pred)

[[403   7]
 [ 12 246]]


0.9715568862275449