In [7]:
import sys
import pandas as pd
import numpy as np

raw_df = pd.read_csv("train.csv")
# Data Cleaning
# De-duplicate while the identifying columns are still present. Once
# PassengerId / Name / Ticket are removed, distinct passengers who merely
# share the same remaining values (class, sex, age, fare, ...) compare equal,
# and drop_duplicates() would silently discard real observations.
df = (
    raw_df
    .drop_duplicates()
    .drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
    # dropna() runs after 'Cabin' is gone, so missing values in that column
    # do not cause rows to be dropped.
    .dropna()
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [8]:
# Discretize "Age" into 4 age groups.
# Each entry maps the inclusive upper edge of a band to its label; the lowest
# band starts just above 0 because pd.cut uses right-closed intervals by default.
age_bands = {18: 'Child', 30: 'Young Adult', 50: 'Adult', 80: 'Elderly'}
bins = [0, *age_bands]
labels = list(age_bands.values())
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,AgeGroup
0,0,3,male,22.0,1,0,7.25,S,Young Adult
1,1,1,female,38.0,1,0,71.2833,C,Adult
2,1,3,female,26.0,0,0,7.925,S,Young Adult
3,1,1,female,35.0,1,0,53.1,S,Adult
4,0,3,male,35.0,0,0,8.05,S,Adult


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Data Transformation

# Encode categorical variables using one-hot encoding.
# get_dummies removes the source columns, so only encode the ones that are
# still present; an unguarded second run of this cell would raise KeyError.
categorical_cols = [col for col in ('Sex', 'Embarked') if col in df.columns]
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols)

# Keep the unscaled ages before 'Age' is overwritten by the scaler below.
# Guarded so that re-running the cell does not replace them with values that
# have already been scaled.
if 'Age_Disc' not in df.columns:
    df['Age_Disc'] = df['Age']
# Scale numerical variables using MinMaxScaler
scaler = MinMaxScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])
# Create new feature "FamilySize": siblings/spouses + parents/children + the
# passenger themselves.
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

df.head()


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,AgeGroup,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Age_Disc,FamilySize
0,0,3,0.271174,1,0,0.014151,Young Adult,False,True,False,False,True,22.0,2
1,1,1,0.472229,1,0,0.139136,Adult,True,False,True,False,False,38.0,2
2,1,3,0.321438,0,0,0.015469,Young Adult,True,False,False,False,True,26.0,1
3,1,1,0.434531,1,0,0.103644,Adult,True,False,False,False,True,35.0,2
4,0,3,0.434531,0,0,0.015713,Adult,False,True,False,False,True,35.0,1


In [13]:
def save_insight(path, heading, table):
    """Write `heading` followed by the plain-text rendering of `table` to `path`.

    `table` is any object exposing `to_string()`. The file is opened in
    write mode, so existing content at `path` is replaced.
    """
    with open(path, 'w') as report:
        report.write(heading)
        report.write(table.to_string())


# insight 1: Summary statistics
summary_stats = df.describe()
save_insight('eda-in-1.txt', "Summary Statistics:\n", summary_stats)

# insight 2: Correlation matrix of numerical features
# 'AgeGroup' is categorical, so it is left out before computing correlations.
correlation_matrix = df.drop(columns="AgeGroup").corr()
save_insight('eda-in-2.txt', "Correlation Matrix:\n", correlation_matrix)

# insight 3: Distribution of the 'Survived' column (as proportions)
survived_distribution = df['Survived'].value_counts(normalize=True)
save_insight('eda-in-3.txt', "Distribution of 'Survived' column:\n", survived_distribution)