In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the passenger data from 'titanic.csv' (path relative to the working directory)
# and preview the first rows.
titanic = pd.read_csv('titanic.csv')
titanic.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Preview the last rows of the frame.
titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Column dtypes and non-null counts; columns with fewer non-null entries than rows
# contain missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
# Removing the 'Name' column is a reasonable decision: names are essentially unique
# identifiers and are unlikely to directly influence a person's survival. However,
# titles within the name (e.g. "Mr.", "Mrs.", "Miss") might indicate social status or
# gender, which could be relevant for survival prediction. Validate this deletion with
# data analysis and model evaluation to ensure it does not significantly affect the
# predictive capabilities of the model.

# drop(..., errors='ignore') keeps this cell safe to re-run; `del titanic['Name']`
# raises KeyError once the column has already been removed.
titanic = titanic.drop(columns=['Name'], errors='ignore')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,male,35.0,0,0,373450,8.05,,S


In [8]:
# 'Fare' is dropped on the working assumption that it does not decide a person's survival.
# NOTE(review): this assumption is not checked anywhere in this notebook; Fare may carry
# signal (for example through its relation to ticket class) - confirm before relying on it.

# drop(..., errors='ignore') keeps this cell safe to re-run; `del titanic['Fare']`
# raises KeyError once the column has already been removed.
titanic = titanic.drop(columns=['Fare'], errors='ignore')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,,S
1,2,1,1,female,38.0,1,0,PC 17599,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,,S
3,4,1,1,female,35.0,1,0,113803,C123,S
4,5,0,3,male,35.0,0,0,373450,,S


In [9]:
# Drop 'Cabin'. The titanic.info() output shows only 204 non-null values out of 891 rows;
# presumably that sparsity is the reason for removing it - confirm.

# drop(..., errors='ignore') keeps this cell safe to re-run; `del titanic['Cabin']`
# raises KeyError once the column has already been removed.
titanic = titanic.drop(columns=['Cabin'], errors='ignore')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,S
1,2,1,1,female,38.0,1,0,PC 17599,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,S
3,4,1,1,female,35.0,1,0,113803,S
4,5,0,3,male,35.0,0,0,373450,S


In [10]:
# Convert the 'Sex' string values to numbers: "male" -> 1, anything else (e.g. "female") -> 0
def getNumber(str):
    """Return 1 when the given value equals "male", otherwise 0.

    The comparison is an exact, case-sensitive match, so any other value
    (including "female", "Male" or None) yields 0.
    """
    # The parameter name shadows the built-in `str`; it is kept so existing callers
    # are unaffected.
    return 1 if str == "male" else 0
# Add a "Gender" column by passing every 'Sex' value through getNumber,
# which yields 1 for "male" and 0 otherwise.
titanic["Gender"] = titanic["Sex"].map(getNumber)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Embarked,Gender
0,1,0,3,male,22.0,1,0,A/5 21171,S,1
1,2,1,1,female,38.0,1,0,PC 17599,C,0
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,S,0
3,4,1,1,female,35.0,1,0,113803,S,0
4,5,0,3,male,35.0,0,0,373450,S,1


In [11]:
from sklearn.preprocessing import LabelEncoder
# Fit a LabelEncoder on the 'Sex' column and overwrite the column with the encoded values.
# NOTE(review): in the rows displayed below, the encoded 'Sex' matches the 'Gender' column
# created earlier, so one of the two columns looks redundant - confirm which one to keep.
labelencoder = LabelEncoder()
titanic['Sex']= labelencoder.fit_transform(titanic['Sex'])
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Embarked,Gender
0,1,0,3,1,22.0,1,0,A/5 21171,S,1
1,2,1,1,0,38.0,1,0,PC 17599,C,0
2,3,1,3,0,26.0,0,0,STON/O2. 3101282,S,0
3,4,1,1,0,35.0,1,0,113803,S,0
4,5,0,3,1,35.0,0,0,373450,S,1
