In [1]:
import pandas as pd
import numpy as np

## 2. Description of the Data and Source

- **Dataset Name**: Titanic - Machine Learning from Disaster  
- **Description**: This dataset contains information about the passengers aboard the Titanic, including features such as age, sex, ticket class, and whether or not the individual survived. The dataset is commonly used for binary classification problems and predictive modeling.

- **Source URL**: [Kaggle Titanic Dataset](https://www.kaggle.com/c/titanic/data)



In [9]:
df = pd.read_csv("titanic.csv")  # assuming the file is in your working directory

In [11]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [17]:
#Checking for missing values
print(df.isnull().sum())


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [19]:
#Basic Statistics
print(df.describe(include='all'))#Quantitative + Categorical Variables

        PassengerId    Survived      Pclass                     Name   Sex  \
count    891.000000  891.000000  891.000000                      891   891   
unique          NaN         NaN         NaN                      891     2   
top             NaN         NaN         NaN  Braund, Mr. Owen Harris  male   
freq            NaN         NaN         NaN                        1   577   
mean     446.000000    0.383838    2.308642                      NaN   NaN   
std      257.353842    0.486592    0.836071                      NaN   NaN   
min        1.000000    0.000000    1.000000                      NaN   NaN   
25%      223.500000    0.000000    2.000000                      NaN   NaN   
50%      446.000000    0.000000    3.000000                      NaN   NaN   
75%      668.500000    1.000000    3.000000                      NaN   NaN   
max      891.000000    1.000000    3.000000                      NaN   NaN   

               Age       SibSp       Parch  Ticket        Fare 

In [23]:
# Dimensions of the DataFrame
df.shape

(891, 12)

In [25]:
#Variable Description & Types
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [17]:
#Convert Pclass (which is currently int64) into string type
df['Pclass'] = df['Pclass'].astype(str)
#Before:
#Pclass    int64
#After:
#Pclass    object  # (because string type is stored as object in pandas)"""

In [None]:

#Convert Categorical to Numerical Variables (3 Methods)


In [19]:
#1.Label Encoding (Sex)
df['Sex_encoded'] = df['Sex'].map({'male': 0, 'female': 1})


In [21]:
#To print the newly created Sex_encoded column in your DataFrame after encoding, you can use:

print(df['Sex_encoded'])

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    1
889    0
890    0
Name: Sex_encoded, Length: 891, dtype: int64


In [35]:
#Or, to view it alongside the original Sex column for comparison:

print(df[['Sex', 'Sex_encoded']])

        Sex  Sex_encoded
0      male            0
1    female            1
2    female            1
3    female            1
4      male            0
..      ...          ...
886    male            0
887  female            1
888  female            1
889    male            0
890    male            0

[891 rows x 2 columns]


In [23]:
#b. One-Hot Encoding (Embarked)
df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)

In [27]:
print(df.head())


   PassengerId  Survived Pclass  \
0            1         0      3   
1            2         1      1   
2            3         1      3   
3            4         1      1   
4            5         0      3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin  Sex_encoded  Embarked_Q  Embarked_S  
0      0         A/5 21171   7.2500   NaN            0       False        True  
1      0          PC 17599  71.2833   C85            1       False       False  
2      0  STON/O2. 3101282   7.9250   NaN            1       False        True  
3 

##So, if:

both are 0(False) → Embarked was C

only Embarked_S = 1(True) → Embarked was S

only Embarked_Q = 1(True) → Embarked was Q