### Import libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

### Import dataset

In [None]:
# Load the training and test splits from the local ./dataset folder
train_df = pd.read_csv('./dataset/train.csv')
test_df = pd.read_csv('./dataset/test.csv')

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
copy_train_df = train_df.copy()

In [None]:
# get number of rows and columns
copy_train_df.shape

In [None]:
copy_train_df.info()

In [None]:
copy_train_df.describe()

In [None]:
copy_train_df['Embarked'].value_counts()

In [None]:
copy_train_df['Age'].skew()

The skew value is close to 0, so we can consider the Age column to be approximately symmetrically distributed.

### Remove the unwanted columns

In [None]:
copy_train_df.drop(columns=['PassengerId'], inplace=True)

In [None]:
copy_train_df.columns

### Check numerical and categorical columns

In [None]:
# Every column whose dtype is not 'object' is treated as numerical
numeric_columns = [
    name for name, dtype in copy_train_df.dtypes.items() if dtype != 'object'
]
numeric_columns

In [None]:
# Every column whose dtype is 'object' is treated as categorical
categorical_columns = [
    name for name, dtype in copy_train_df.dtypes.items() if dtype == 'object'
]
categorical_columns

### Check duplicates

In [None]:
# Boolean mask marking rows that repeat an earlier row, then display those rows
# (no count is computed here; an empty result means there are no duplicates)
duplicates = copy_train_df.duplicated()
copy_train_df[duplicates]

### Check missing values

In [None]:
# get count of missing values in each column
copy_train_df.isnull().sum()

In [None]:
copy_train_df[(copy_train_df['Age'].isnull())].head()

### Handling Missing Values

In [None]:
# Impute missing Age values with a linear regression on Pclass, Sex, Fare and Embarked
from sklearn.linear_model import LinearRegression

age_features = ['Pclass', 'Sex', 'Fare', 'Embarked']

# One-hot encode the predictors on the full frame *before* splitting, so the rows
# with and without Age always share exactly the same dummy columns. Encoding the
# two subsets separately breaks predict() as soon as a category is missing from one of them.
age_encoded = pd.get_dummies(copy_train_df[age_features], drop_first = True)

# Single mask reused for the split and for writing the predictions back
age_missing = copy_train_df['Age'].isna()

train_data = copy_train_df[~age_missing]
test_data = copy_train_df[age_missing]

X_train = age_encoded[~age_missing]
y_train = train_data['Age']

X_test = age_encoded[age_missing]

model = LinearRegression()
model.fit(X_train, y_train)

# Nothing to predict when the cell is re-run after Age has already been filled
if age_missing.any():
    predicted_ages = model.predict(X_test)
    copy_train_df.loc[age_missing, 'Age'] = predicted_ages

In [None]:
# Prepare the data for a classification model (random forest) that replaces missing Cabin values
from sklearn.ensemble import RandomForestClassifier

# Mark missing cabins with the placeholder 'Unknown'
copy_train_df['Cabin'] = copy_train_df['Cabin'].fillna('Unknown')

# Work on a copy so the dummy columns below are not added to copy_train_df
dummy_train_df = copy_train_df.copy()

# First character of the cabin string ('U' for the 'Unknown' placeholder)
dummy_train_df['CabinLetter'] = dummy_train_df['Cabin'].str[0]

# One-hot encode the categorical columns, dropping the first level of each
dummy_train_df = pd.get_dummies(dummy_train_df, columns = ['CabinLetter', 'Sex', 'Embarked'], drop_first=True)

In [None]:
# Predictors for the cabin classifier. The CabinLetter_* dummies are deliberately
# left out: they are derived from Cabin itself, so they leak the target for the
# known rows, while every row to impute carries the placeholder letter 'U' --
# a pattern the model would not have been trained on.
cabin_features = ['Pclass', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S']

# Rows whose cabin is known are used for training, the rest are imputed
cabin_known = dummy_train_df['Cabin'] != 'Unknown'

X_train_cabin = dummy_train_df.loc[cabin_known, cabin_features]
y_train_cabin = dummy_train_df.loc[cabin_known, 'Cabin']

X_test_cabin = dummy_train_df.loc[~cabin_known, cabin_features]

# Fixed seed so the imputed cabins are reproducible across runs
cabin_model = RandomForestClassifier(random_state=42)
cabin_model.fit(X_train_cabin, y_train_cabin)

# Skip prediction when there is nothing to impute
if not X_test_cabin.empty:
    predicted_cabin = cabin_model.predict(X_test_cabin)
    # dummy_train_df is a copy of copy_train_df, so both share the same index;
    # writing by index keeps the cell safe to re-run
    copy_train_df.loc[X_test_cabin.index, 'Cabin'] = predicted_cabin

### Get details using independent variables

In [None]:
# Number of survivors per (Cabin, Sex) group.
# Assumes Survived is coded 0/1, so the survivors are its sum; count() would
# return the number of passengers in the group instead.
cabin_grouped = copy_train_df.groupby(['Cabin', 'Sex'])['Survived'].sum()
for (cabin, gender), survived in cabin_grouped.items():
    # Only report groups with more than 20 survivors
    if survived > 20:
        print(f"cabin: {cabin}, gender: {gender} and survived: {survived}")

In [None]:
# Count passengers for every (Sex, Survived) combination and show the table
survival_gender_grouped = (
    copy_train_df
    .groupby(['Sex', 'Survived'])
    .size()
    .reset_index(name = 'Count')
)

print(survival_gender_grouped)

In [None]:
# Number of survivors per age value.
# Assumes Survived is coded 0/1, so the survivors are its sum; count() would
# return the number of passengers of that age instead.
age_grouped = copy_train_df.groupby('Age')['Survived'].sum()
for age, survived in age_grouped.items():
    # Only report passengers older than 70
    if age > 70:
        print(f"age: {age} and survived: {survived}")

In [None]:
# Print the ten most frequent ages, most common first
age_wise_count = copy_train_df['Age'].value_counts().sort_values(ascending=False)
for age, count in age_wise_count.head(10).items():
    print(f"age: {age} and count: {count}")

### Create new features (feature engineering)

Create a FamilySize column from the SibSp and Parch columns (done in the Feature Engineering section below).

### Find Outliers in numeric columns

In [None]:
# Draw one boxplot per numeric feature (the first entry of numeric_columns is skipped)
for column in numeric_columns[1:]:
    sns.boxplot(data = copy_train_df, y = copy_train_df[column])
    plt.title(column)
    plt.show()

### Outlier Detection

In [None]:
# Flag Age outliers with the 1.5 * IQR rule
q1, q3 = (copy_train_df['Age'].quantile(q) for q in (0.25, 0.75))
iqr = q3 - q1

lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

# A row is an outlier when its Age lies strictly outside [lower_bound, upper_bound]
below_lower = copy_train_df['Age'].lt(lower_bound)
above_upper = copy_train_df['Age'].gt(upper_bound)
outliers = copy_train_df[below_lower | above_upper]
print(outliers.loc[:, ['Age', 'Name']].head(20))

In [None]:
# Share of rows flagged as Age outliers, in percent
outlier_rows, total_rows = len(outliers), len(copy_train_df)
age_outlier_percentage = (outlier_rows / total_rows) * 100
print(age_outlier_percentage)

In [None]:
# Tukey's fences: inner fences at 1.5 * IQR (mild outliers), outer fences at 3 * IQR (extreme outliers)
inner_lower_fence, inner_upper_fence = q1 - 1.5 * iqr, q3 + 1.5 * iqr
outer_lower_fence, outer_upper_fence = q1 - 3 * iqr, q3 + 3 * iqr

outside_inner = copy_train_df['Age'].lt(inner_lower_fence) | copy_train_df['Age'].gt(inner_upper_fence)
outside_outer = copy_train_df['Age'].lt(outer_lower_fence) | copy_train_df['Age'].gt(outer_upper_fence)

mild_outliers = copy_train_df[outside_inner]
extreme_outliers = copy_train_df[outside_outer]

# Show the last ten ages of each group
for flagged in (mild_outliers, extreme_outliers):
    print(flagged['Age'].tail(10))

In [None]:
# Print the four fence values: inner pair first, then the outer pair
for fence in (inner_lower_fence, inner_upper_fence, outer_lower_fence, outer_upper_fence):
    print(fence)

In [None]:
# detect outliers using Isolation Forest
# from sklearn.ensemble import IsolationForest
# iso = IsolationForest(contamination=0.1)
# copy_train_df['anomaly'] = iso.fit_predict(copy_train_df[['Age']])
# outliers = copy_train_df[copy_train_df['anomaly'] == -1]
# print(outliers['Age'])

In [None]:
# find outlier using histogram
plt.hist(copy_train_df['Age'], bins = 50, edgecolor = 'black')
plt.show()

In [None]:
# find distribution of age using kde plot
sns.kdeplot(copy_train_df['Age'], fill = True)
plt.show()

In [None]:
# Skewness of every numeric feature except the first entry of numeric_columns
for column in numeric_columns[1:]:
    skewness = copy_train_df[column].skew()
    print(f"{column} : {skewness}")

### Handle outliers using IQR and Decision Tree and Random Forest

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# import pandas as pd

# # Assuming lower_bound and upper_bound are defined
# train_data = copy_train_df[copy_train_df['Age'].between(lower_bound, upper_bound)]
# test_data = copy_train_df[(copy_train_df['Age'] < lower_bound) | (copy_train_df['Age'] > upper_bound)]

# X_train = train_data[['Fare', 'Embarked', 'Pclass', 'Sex']]
# y_train = train_data['Age']
# X_test = test_data[['Fare', 'Embarked', 'Pclass', 'Sex']]

# X_train = pd.get_dummies(X_train, drop_first=True)
# X_test = pd.get_dummies(X_test, drop_first=True)

# # Align the columns of X_test to X_train
# X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# random_regressor = RandomForestRegressor(random_state=42)
# random_regressor.fit(X_train, y_train)

# predicted_ages = random_regressor.predict(X_test)

# # Ensure outliers are defined correctly
# outliers = test_data.index  # or however you define your outliers
# copy_train_df.loc[outliers, 'Age'] = predicted_ages

In [None]:
# Rows whose Age still lies strictly outside [lower_bound, upper_bound]
age_out_of_bounds = copy_train_df['Age'].lt(lower_bound) | copy_train_df['Age'].gt(upper_bound)
remaining_outliers = copy_train_df[age_out_of_bounds]
print(remaining_outliers.loc[:, ['Age', 'Name']].tail(10))

In [None]:
sns.histplot(copy_train_df['Fare'])
plt.show()

In [None]:
copy_train_df[copy_train_df['Parch'] == 6].head()

### Numerical columns that contain outliers
1. Fare - financial data, so it can be handled with a log transformation
2. Parch
3. SibSp
4. Age

#### Handle outliers in Fare column using LOG TRANSFORMATION

In [None]:
copy_train_df['Fare_log'] = np.log1p(copy_train_df['Fare'])

In [None]:
# Inspect the log-transformed fare: this step is meant to show whether log1p
# reduced the outliers, so plot Fare_log rather than the raw Fare column again
sns.boxplot(y = copy_train_df['Fare_log'], data = copy_train_df)
plt.title('Fare_log')
plt.show()

In [None]:
copy_train_df['Fare_log'].skew()

Some outliers still remain in the Fare column, so we can apply a Robust Scaler.

In [None]:
# Scale Fare_log with RobustScaler and store the result in a new column
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
copy_train_df['Fare_log_scaled'] = scaler.fit_transform(copy_train_df[['Fare_log']])


In [None]:
sns.boxplot(y = copy_train_df['Fare_log_scaled'], data = copy_train_df)
plt.title('Fare_log_scaled')
plt.show()

In [None]:
copy_train_df['Fare_log_scaled'].skew()

Scaling has no impact on the outliers, so we use Winsorization (capping extreme values) instead.

In [None]:
# Cap Fare_log at its 1st and 99th percentiles.
# Dedicated names are used so that the Age IQR bounds (lower_bound / upper_bound)
# computed earlier are neither overwritten nor accidentally used as the cap.
fare_log_lower = copy_train_df['Fare_log'].quantile(0.01)
fare_log_upper = copy_train_df['Fare_log'].quantile(0.99)
copy_train_df['Fare_log_capped'] = copy_train_df['Fare_log'].clip(lower = fare_log_lower, upper = fare_log_upper)

In [None]:
copy_train_df['Fare_log_capped'].skew()

In [None]:
sns.boxplot(y = copy_train_df['Fare_log_capped'], data = copy_train_df)
plt.title('Fare_log_capped')
plt.show()

In [None]:
# Winsorize Fare_log with limits [0.05, 0.05] (5% on each tail) into a new column
from scipy.stats.mstats import winsorize
copy_train_df['Fare_log_winsorize'] = winsorize(copy_train_df['Fare_log'], limits = [0.05, 0.05])

In [None]:
copy_train_df['Fare_log_winsorize'].skew()

In [None]:
sns.boxplot(y = copy_train_df['Fare_log_winsorize'], data = copy_train_df)
plt.show()

All outliers in the Fare column are now handled. The next column is Parch.

### Handle outliers in Parch Column Using LOG TRANSFORMATION

In [None]:
copy_train_df['Parch_log'] = np.log1p(copy_train_df['Parch'])

In [None]:
sns.boxplot(y = copy_train_df['Parch_log'], data = copy_train_df)
plt.show()

The log transformation makes no difference, so we apply Winsorization.

In [None]:
# Winsorize Parch_log with limits [0.05, 0.25] (5% lower tail, 25% upper tail) into a new column
from scipy.stats.mstats import winsorize
copy_train_df['Parch_log_winsorize'] = winsorize(copy_train_df['Parch_log'], limits = [0.05, 0.25])

In [None]:
sns.boxplot(y = copy_train_df['Parch_log_winsorize'], data = copy_train_df)
plt.show()

Outliers are handled in Parch column using Winsorization

### Handle outliers in SibSp column

In [None]:
copy_train_df['SibSp_log'] = np.log1p(copy_train_df['SibSp'])

In [None]:
sns.boxplot(y = copy_train_df['SibSp_log'], data = copy_train_df)
plt.show()

In [None]:
# Winsorize SibSp_log on the upper tail only (limits [0, 0.05]) into a new column
from scipy.stats.mstats import winsorize
copy_train_df['SibSp_winsorize'] = winsorize(copy_train_df['SibSp_log'], limits = [0,0.05])

In [None]:
sns.boxplot(y = copy_train_df['SibSp_winsorize'], data = copy_train_df)
plt.show()

Outliers are handled in the SibSp column.

### Handle outliers in Age column using the capping method

In [None]:
# Cap Age to the range [1, 57]: values above 57 become 57, values below 1 become 1.
# Series.clip does this in one vectorised call and replaces the per-row lambda,
# whose `min(x, 57) if x > 57` branch could only ever return 57.
copy_train_df['Age'] = copy_train_df['Age'].clip(lower = 1, upper = 57)

In [None]:
sns.boxplot(y = copy_train_df['Age'], data = copy_train_df)
plt.show()

In [None]:
copy_train_df.head(2)

### Feature Engineering

Create family size column

In [None]:
copy_train_df['FamilySize'] = copy_train_df['SibSp'] + copy_train_df['Parch']

In [None]:
copy_train_df.head(2)

Create a new column Fare per person

In [None]:
copy_train_df['FarePerPerson'] = copy_train_df['Fare'] / (copy_train_df['FamilySize'] + 1)

Create a new column IsAlone

In [None]:
copy_train_df['IsAlone'] = (copy_train_df['FamilySize'] == 0).astype(int)

Title extraction from name

In [None]:
copy_train_df['Title'] = copy_train_df['Name'].str.extract(r' ([A-Za-z]+)\.', expand = False)

In [None]:
copy_train_df['Title'].head(10)

create age group

In [None]:
copy_train_df['AgeGroup'] = pd.cut(copy_train_df['Age'], bins = [0,12,18,60,100], labels = ['Child', 'Teen', 'Adult', 'Senior'])

Create deck column from cabin

In [None]:
copy_train_df['Deck'] = copy_train_df['Cabin'].str[0]

Fare binning

In [None]:
copy_train_df['FareGroup'] = pd.qcut(copy_train_df['Fare'], 4, labels = ['Low', 'Medium', 'High', 'Very High'])

Ticket Frequency

In [None]:
# Number of rows sharing each ticket value, mapped back onto every row.
# The column name 'TickerFrequency' (sic) is referenced under this spelling by later cells.
tickets_count = copy_train_df['Ticket'].value_counts()
copy_train_df['TickerFrequency'] = copy_train_df['Ticket'].map(tickets_count)

Pclass and Age interaction

In [None]:
copy_train_df['Pclass_Age'] = copy_train_df['Pclass'] * copy_train_df['Age']

Embarked and Pclass Interaction

In [None]:
# Embarked is never imputed earlier in the notebook (only Age and Cabin are), so
# any missing value would be turned into the literal string 'nan' by astype(str)
# and create a bogus 'nan_<class>' category. Fill gaps with the most frequent port first.
copy_train_df['Embarked'] = copy_train_df['Embarked'].fillna(copy_train_df['Embarked'].mode()[0])

# Interaction feature such as 'S_3': port of embarkation combined with passenger class
copy_train_df['Embarked_Pclass'] = copy_train_df['Embarked'].astype(str) + '_' + copy_train_df['Pclass'].astype(str)

Survival Probability by Group

In [None]:
# title_survival_rate = copy_train_df.groupby('Title')['Survived'].mean()
# copy_train_df['Title_Survival_Rate'] = copy_train_df['Title'].map(title_survival_rate)

Family Survival Rate

In [None]:
# copy_train_df['LastName'] = copy_train_df['Name'].apply(lambda x: x.split('.')[0])
# family_survival_rate = copy_train_df.groupby('LastName')['Survived'].transform('mean')
# copy_train_df['FamilySurvivalRate'] = family_survival_rate

In [None]:
copy_train_df.columns

In [None]:
# Report the dtype of every engineered feature
lst = [
    'FamilySize', 'FarePerPerson', 'IsAlone', 'Title', 'AgeGroup',
    'Deck', 'FareGroup', 'TickerFrequency', 'Pclass_Age', 'Embarked_Pclass',
]
for column in lst:
    column_dtype = copy_train_df[column].dtype
    print(f"{column}: {column_dtype}")

In [None]:
# Horizontal boxplot for each numeric engineered feature
lst = ['FamilySize','FarePerPerson', 'TickerFrequency', 'Pclass_Age']

for column in lst:
    sns.boxplot(data = copy_train_df, x = copy_train_df[column])
    plt.title(column)
    plt.show()

In [None]:
# Drop the raw and intermediate columns listed below.
# Fare_log_winsorize is not in the list, so it stays in the frame.
columns_to_drop = ['Name', 'Ticket', 'Cabin', 
                   'Fare', 'Fare_log', 'Fare_log_scaled', 
                   'Fare_log_capped', 'Parch', 'SibSp']
copy_train_df = copy_train_df.drop(columns = columns_to_drop)

In [None]:
# Drop the log / winsorized helper columns created for Parch and SibSp
columns_to_drop = ['Parch_log', 'Parch_log_winsorize', 'SibSp_log', 'SibSp_winsorize']
copy_train_df = copy_train_df.drop(columns=columns_to_drop)

In [None]:
copy_train_df.head(1)

In [None]:
# sns.boxplot(x = copy_train_df['FamilySize_win'], data = copy_train_df)
# plt.title('FamilySize')
# plt.show()

Handle the outliers in FarePerPerson column

In [None]:
# Log-transform FarePerPerson (log1p), then winsorize it with limits [0.02, 0.02].
# NOTE: the column is overwritten in place, so re-running this cell applies the transform again.
from scipy.stats.mstats import winsorize
copy_train_df['FarePerPerson'] = np.log1p(copy_train_df['FarePerPerson'])
copy_train_df['FarePerPerson'] = winsorize(copy_train_df['FarePerPerson'], limits = [0.02,0.02])

Handle outliers in TicketFrequency column

In [None]:
copy_train_df['TickerFrequency'] = winsorize(copy_train_df['TickerFrequency'], limits=[0,0.13])

Handle outliers in Pclass_Age column

In [None]:
copy_train_df['Pclass_Age'] = winsorize(copy_train_df['Pclass_Age'], (0, 0.03))

Handle outliers in FamilySize column

In [None]:
copy_train_df['FamilySize'] = winsorize(copy_train_df['FamilySize'], limits = [0,0.11])

In [None]:
# copy_train_df.drop(columns = ['FamilySize_win'], inplace = True)

Handle outliers in Title column

In [None]:
title_count = copy_train_df['Title'].value_counts()
print(title_count)

In [None]:
# Collapse the infrequent titles listed below into a single 'Rare' category
rare_title = ['Jonkheer','Countess','Capt','Sir','Lady','Don','Mme','Ms','Major','Mlle','Col','Rev','Dr']
is_rare_title = copy_train_df['Title'].isin(rare_title)
copy_train_df['Title'] = copy_train_df['Title'].mask(is_rare_title, 'Rare')
copy_train_df['Title'].value_counts()

In [None]:
import scipy.stats as stats

stats.probplot(copy_train_df['Age'], dist="norm", plot=plt)
plt.title("Q-Q Plot of Age")
plt.show()

In [None]:
from scipy.stats import shapiro

stat, p_value = shapiro(copy_train_df['Age'])
print(f'Shapiro-Wilk Test: Statistic={stat}, p-value={p_value}')

The Age column is not normally distributed, so we choose normalization instead of standardization.


Normalization - Non Normal Distribution


Standardization - Normal Distribution

In [None]:
copy_train_df.head()

In [None]:
# Run the Shapiro-Wilk normality test on every numeric column that will be scaled
from scipy.stats import shapiro
lst = [
    'Pclass', 'Age', 'Fare_log_winsorize', 'FamilySize',
    'FarePerPerson', 'IsAlone', 'TickerFrequency', 'Pclass_Age',
]
for column in lst:
    stat, p_value = shapiro(copy_train_df[column])
    print(f'Shapiro-Wilk Test for {column}: Statistic={stat}, p-value={p_value}')

The p-values are <= 0.05, so we reject the null hypothesis of normality (the columns are not normally distributed).

Normalization

In [None]:
# Rescale the numeric columns with MinMaxScaler, overwriting them in place
from sklearn.preprocessing import MinMaxScaler
columns_to_normalize = [
    'Pclass', 'Age', 'Fare_log_winsorize', 'FamilySize',
    'FarePerPerson', 'IsAlone', 'TickerFrequency', 'Pclass_Age',
]
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(copy_train_df[columns_to_normalize])
copy_train_df[columns_to_normalize] = normalized_values

In [None]:
# Names of the columns that still have dtype 'object'
object_columns = copy_train_df.select_dtypes('object').columns
cat = object_columns.tolist()
cat

### Handling Categorical Features

Column Name: SEX

In [None]:
# Replace the Sex strings with the integer codes produced by LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
copy_train_df['Sex'] = le.fit_transform(copy_train_df['Sex'])

Column Names: AGEGROUP, FAREGROUP, EMBARKED, TITLE, DECK, EMBARKED_PCLASS

In [None]:
# One-hot encode the remaining categorical columns, dropping the first level of each
copy_train_df = pd.get_dummies(copy_train_df, columns = ['AgeGroup','FareGroup','Embarked','Title','Deck','Embarked_Pclass'], drop_first=True)

save the dataframe into csv

In [None]:
copy_train_df.to_csv('dataset/final_df.csv', index = False)

In [None]:
copy_train_df.columns