In [1]:
import pandas as pd
from scipy import stats
from sklearn.preprocessing import StandardScaler

In [2]:
#Step 1: Loading the Dataset
from google.colab import files


uploaded = files.upload()

Saving titanic.csv to titanic.csv


In [4]:
data = pd.read_csv('titanic.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Step 2: Handling Missing Values
print("\nMissing values before handling:")
print(data.isnull().sum())


Missing values before handling:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [6]:
# Fill missing 'Age' and 'Fare' values with the mean
data['Age'].fillna(data['Age'].mean(), inplace=True)
data['Fare'].fillna(data['Fare'].mean(), inplace=True)

# Fill missing 'Embarked' values with the most common value
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

# Fill missing 'Cabin' values with 'Unknown'
data['Cabin'].fillna('Unknown', inplace=True)

In [7]:
print("\nMissing values after handling:")
print(data.isnull().sum())


Missing values after handling:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [8]:
#Step 3: Cleaning Irrelevant Data
# Drop columns that are not useful for prediction
data.drop(['Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [9]:
#Step 4: Encoding Categorical Variables
# Convert 'Sex' into numerical values
data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})

# Convert 'Embarked' into numerical values using one-hot encoding
test_df = pd.get_dummies(data, columns=['Embarked'], drop_first=True)

In [10]:
#Step 5: Feature Engineering
# Create a new feature 'FamilySize' from 'SibSp' and 'Parch'
test_df['FamilySize'] = test_df['SibSp'] + test_df['Parch'] + 1

# Create a new feature 'IsAlone'
test_df['IsAlone'] = 1  # Initialize to 1
test_df['IsAlone'].loc[test_df['FamilySize'] > 1] = 0  # Update to 0 if FamilySize > 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['IsAlone'].loc[test_df['FamilySize'] > 1] = 0  # Update to 0 if FamilySize > 1


In [11]:
#Step 6: Transformation
from scipy import stats

# Identify numerical features
numerical_features = ['Age', 'Fare']

# Calculate the Z-scores of the numerical features
z_scores = stats.zscore(test_df[numerical_features])

# Identify outliers as points with a Z-score > 3 or < -3
outliers = (abs(z_scores) > 3).any(axis=1)

# Remove outliers from the dataset
test_df = test_df[~outliers]

In [12]:
#Step 7: Normalization
from sklearn.preprocessing import StandardScaler

# Initialize the StandardScaler
scaler = StandardScaler()

# Normalize numerical features
numerical_features = ['Age', 'Fare', 'FamilySize']
test_df[numerical_features] = scaler.fit_transform(test_df[numerical_features])

# Display the processed test dataframe
print(test_df.head())


   PassengerId  Survived  Pclass  Sex       Age  SibSp  Parch      Fare  \
0            1         0       3    0 -0.586622      1      0 -0.655160   
1            2         1       1    1  0.695098      1      0  1.524090   
2            3         1       3    1 -0.266192      0      0 -0.632188   
3            4         1       1    1  0.454776      1      0  0.905257   
4            5         0       3    0  0.454776      0      0 -0.627934   

   Embarked_Q  Embarked_S  FamilySize  IsAlone  
0       False        True    0.069358        0  
1       False       False    0.069358        0  
2       False        True   -0.554867        1  
3       False        True    0.069358        0  
4       False        True   -0.554867        1  


In [13]:
# Display the processed test dataframe
print(data.head())

   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare Embarked
0            1         0       3    0  22.0      1      0   7.2500        S
1            2         1       1    1  38.0      1      0  71.2833        C
2            3         1       3    1  26.0      0      0   7.9250        S
3            4         1       1    1  35.0      1      0  53.1000        S
4            5         0       3    0  35.0      0      0   8.0500        S


In [14]:
# Save the processed test dataframe to a CSV file
test_df.to_csv('processed_test.csv', index=False)