In [1]:
import numpy as np 
import pandas as pd
import matplotlib as mpl

In [2]:
# Location of the Titanic CSV under the Kaggle input directory.
TITANIC_CSV_PATH = "/kaggle/input/datasets/fatuzahra/titanic/titanic.csv"

# Read the file into a DataFrame and preview the first rows.
titanic = pd.read_csv(TITANIC_CSV_PATH)
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# tail() with no argument returns the last five rows: a quick look at how the file ends.
titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [4]:
# (number of rows, number of columns) of the loaded frame.
titanic.shape

(891, 12)

In [5]:
# Column labels of the frame, i.e. the names of the available features.
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
titanic.info()
# There are 891 entries in total, but three columns have fewer than 891 non-null values:
# Age (714), Cabin (204) and Embarked (889).
# This shows that the dataset has missing data in those columns.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
titanic.describe()
# describe() (with its default arguments) summarises only the numerical features

# PassengerId has the largest std, but it is just a row identifier (1..891),
# so we cannot predict whether a person will survive based on it.
# Fare has the second-largest std: fares range from 0 to about 512 with a median near 14.45,
# so the column is widely spread and right-skewed (mean 32.2 > median).
# NOTE: std depends on each column's units/scale, so a large std does not by itself
# make a feature predictive. Fare is still worth checking against Survived, because
# higher-paying passengers may have been given priority (a hypothesis to verify).
# Survived is the label (what we want to predict); it is coded 0/1, so its mean is
# the survival rate: 0.38 -> about 38% of the passengers survived.

# Quartiles and the IQR (interquartile range)
# the values are arranged in ascending order from min to max, then:
# 25th percentile (Q1) is the first quartile
# 50th percentile (Q2) is the median, i.e. the middle value
# 75th percentile (Q3) is the third quartile
# IQR = Q3 - Q1, the spread of the middle 50% of the values
# for Age (computed over the 714 passengers whose age is recorded):
# the min age is 0.42 years (about 5 months) and the max age is 80
# 25% shows us that about 25% of these passengers were aged from about 5 months to ~20 y/o
# 50% shows us that the next 25% were aged from ~20 y/o to 28 y/o,
# or equivalently that 50% were aged from about 5 months to 28 y/o
# 75% shows us that the next 25% were aged from 28 y/o to 38 y/o
# so only about 25% of these passengers were 38+ y/o, i.e. most of them were fairly young

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
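# Pclass is the class the passenger travelled in: 1 is the upper (elite) class and 3 the lowest.
# In describe(), the 50% and 75% values of Pclass are both 3, so at least half of the passengers were travelling in class 3.
# The 25% value is 2, which only tells us that at least 25% were in class 1 and class 2 combined - not that exactly 25% were;
# the exact share has to come from counting the classes, e.g. titanic["Pclass"].value_counts().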

In [9]:
# Selecting and filtering help us understand the dataset and spot outliers.
# We can select a single column or several columns to study them,
# or filter a column, several columns or the whole dataset on the basis of a condition.

# Example of selecting a single column (the result is a Series):
titanic["Age"].head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [10]:
# Example of selecting multiple columns: pass a list of column names; the result is a DataFrame.
# NOTE(review): this displays the whole (truncated) frame; appending .head() would keep the output short.

titanic[["Name","Age","Sex"]]


Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",22.0,male
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,female
2,"Heikkinen, Miss. Laina",26.0,female
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,female
4,"Allen, Mr. William Henry",35.0,male
...,...,...,...
886,"Montvila, Rev. Juozas",27.0,male
887,"Graham, Miss. Margaret Edith",19.0,female
888,"Johnston, Miss. Catherine Helen ""Carrie""",,female
889,"Behr, Mr. Karl Howell",26.0,male


In [11]:
# Row filtering on a condition: build one boolean mask per criterion,
# combine them with & (element-wise AND), and index the frame with the result.
older_than_29 = titanic["Age"] > 29
is_female = titanic["Sex"] == "female"
titanic[older_than_29 & is_female]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0000,,S
18,19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
862,863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Ba...",female,48.0,0,0,17466,25.9292,D17,S
865,866,1,2,"Bystrom, Mrs. (Karolina)",female,42.0,0,0,236852,13.0000,,S
871,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C


In [12]:
# value_counts() returns how many times each unique value occurs in the column:

passengers_per_class = titanic["Pclass"].value_counts()
passengers_per_class

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [13]:
# groupby() splits the rows by one feature so that another can be aggregated per group.
# Here: the mean of Survived for each Sex, i.e. the survival rate of women and of men.

survived_by_sex = titanic.groupby("Sex")["Survived"]
survived_by_sex.mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [14]:
# Count the missing values in each feature: isna() flags every missing cell
# and sum() adds the flags up column by column.
titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [15]:
# iloc means integer-position-based indexing

# loc means label-based indexing (it also accepts boolean masks)

# iloc selects rows and columns by their integer positions (0-based)
# for example:

titanic.iloc[:5, :3]
# selects the first five rows and the first three columns; the slice end positions (5, 3) are exclusive

Unnamed: 0,PassengerId,Survived,Pclass
0,1,0,3
1,2,1,1
2,3,1,3
3,4,1,1
4,5,0,3


In [16]:
# Example of loc: a boolean mask picks the rows, a list of labels picks the columns.

# display() is provided by IPython/Jupyter (it is not a Python built-in) and renders
# an object with its rich representation.

in_first_class = titanic["Pclass"] == 1
display(titanic.loc[in_first_class, ["Name", "Sex"]].head())

# Shows the name and the sex of the first five passengers with Pclass 1.

Unnamed: 0,Name,Sex
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female
6,"McCarthy, Mr. Timothy J",male
11,"Bonnell, Miss. Elizabeth",female
23,"Sloper, Mr. William Thompson",male
