In [42]:
import pandas as pd
import numpy as np


**1. Load the dataset into python environment**

In [43]:
data =pd.read_csv('/content/titanic_dataset.csv')

In [44]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


**2. Make ‘PassengerId’ as the index column**

In [45]:
data.set_index('PassengerId', inplace=True)

In [46]:
data.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


**3. Check the basic details of the dataset**

In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [48]:
data.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [49]:
data.describe(include=['object'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


**4. Fill in all the missing values present in all the columns in the dataset**

In [50]:
data.fillna(0, inplace=True)   # Filled missing values with 0 for numerical columns

for column in data.select_dtypes(include=['object']):  # Filled missing values with mode for categorical columns
    data[column].fillna(data[column].mode()[0], inplace=True)


In [51]:
print(data.isnull().sum()) # check if there are any remaining null values in the dataset after filling missing values


Survived    0
Pclass      0
Name        0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
dtype: int64


**5. Check and handle outliers in at least 3 columns in the dataset**


In [52]:
numerical_columns = ['Age', 'Fare', 'SibSp']  #selected 3 columns
print(data[numerical_columns].describe())  #before handling outliers

              Age        Fare       SibSp
count  891.000000  891.000000  891.000000
mean    23.799293   32.204208    0.523008
std     17.596074   49.693429    1.102743
min      0.000000    0.000000    0.000000
25%      6.000000    7.910400    0.000000
50%     24.000000   14.454200    0.000000
75%     35.000000   31.000000    1.000000
max     80.000000  512.329200    8.000000


In [58]:
Q1 = data[numerical_columns].quantile(0.25)# to calculate IQR for each numerical column
Q3 = data[numerical_columns].quantile(0.75)
IQR = Q3 - Q1

# to identify outliers in each numerical column
outliers = ((data[numerical_columns] < (Q1 - 1.5 * IQR)) | (data[numerical_columns] > (Q3 + 1.5 * IQR)))

# to handle outliers, replace outliers with the median
data[numerical_columns] = np.where(outliers, data[numerical_columns].median(), data[numerical_columns])

In [59]:
print(data[numerical_columns].describe()) #after handling outliers


              Age        Fare  SibSp
count  738.000000  738.000000  738.0
mean    23.557249   15.293168    0.0
std     17.269787   10.583607    0.0
min      0.000000    0.000000    0.0
25%      5.000000    7.889575    0.0
50%     24.000000   11.500000    0.0
75%     34.000000   21.000000    0.0
max     74.000000   52.554200    0.0


**6. Do min max scaling on the feature set (Take ‘Survived’ as target)**

In [62]:

from sklearn.preprocessing import MinMaxScaler

# Taking df is your DataFrame with 'Survived' as the target variable and other features
# to select only numerical columns for scaling
numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns
X = data[numerical_columns]  # Features
y = data['Survived']  # Target

# to initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the feature set
X_scaled = scaler.fit_transform(X)

# to cnvert the scaled features back to a DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=numerical_columns)

# to check the scaled feature set
print(X_scaled_df.head())


   Survived  Pclass       Age  SibSp  Parch      Fare
0       0.0     1.0  0.297297    0.0    0.0  0.137953
1       1.0     0.0       NaN    NaN    0.0       NaN
2       1.0     1.0  0.351351    0.0    0.0  0.150797
3       1.0     0.0  0.472973    0.0    0.0  0.218822
4       0.0     1.0  0.472973    0.0    0.0  0.153175
