In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory.
# The unused sub-directory list is bound to `_subdirs` instead of `_`: once
# notebook code assigns to `_`, IPython stops updating its `_` "last output"
# shortcut for the rest of the session.
for dirname, _subdirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Read the Titanic training CSV from the Kaggle input directory into a DataFrame.
train_data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/titanic/train.csv",
)

In [3]:
# Preview the loaded training frame; pandas truncates the display to the
# first and last rows.
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Data description

* Variable	Definition	Key
    * survival--	Survival	0 = No, 1 = Yes
    * pclass--	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
    * sex--	Sex	
    * Age--	Age in years	
    * sibsp--	# of siblings / spouses aboard the Titanic	
    * parch--	# of parents / children aboard the Titanic	
    * ticket--	Ticket number	
    * fare--	Passenger fare	
    * cabin--	Cabin number	
    * embarked--	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

#### Additional notes on the variables

* pclass: A proxy for socio-economic status (SES)

    1st = Upper
    
    2nd = Middle
    
    3rd = Lower
* age: Age is fractional if less than 1. If the age is estimated, it is in the form of xx.5

* sibsp: The dataset defines family relations in this way...

    Sibling = brother, sister, stepbrother, stepsister
    
    Spouse = husband, wife (mistresses and fiancés were ignored)
* parch: The dataset defines family relations in this way...

    Parent = mother, father
    
    Child = daughter, son, stepdaughter, stepson
    
    Some children travelled only with a nanny, therefore parch=0 for them.


In [4]:
# Show the column labels as a one-column table with a fresh 0..n-1 index.
train_data.columns.to_frame(index=False)

Unnamed: 0,0
0,PassengerId
1,Survived
2,Pclass
3,Name
4,Sex
5,Age
6,SibSp
7,Parch
8,Ticket
9,Fare


In [5]:
# Print a per-column summary: non-null counts and dtypes, plus memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
# Show the column labels as a pandas Index.
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
# Number of distinct passenger IDs in the loaded training data.
# NOTE(review): this counts only the rows of train.csv, not everyone who was
# aboard the ship.
num_of_passengers = train_data.loc[:, 'PassengerId'].nunique()
print(f"Number of passengers on titanic: {num_of_passengers:,}")

Number of passengers on titanic: 891


In [8]:
# Passengers travelling with at least one sibling or spouse (SibSp > 0),
# and how many of those survived.
has_sib_sp = train_data["SibSp"] > 0
sib_sp_survived = has_sib_sp & (train_data['Survived'] == 1)

pass_with_sib_sp = train_data.loc[has_sib_sp, 'PassengerId'].nunique()
print(f"Passengers having sibling or spouse : {pass_with_sib_sp}")

pass_with_sib_sp_and_survived = train_data.loc[sib_sp_survived, 'PassengerId'].nunique()
print(f"Passengers having sibling or spouse and survived : {pass_with_sib_sp_and_survived}")

Passengers having sibling or spouse : 283
Passengers having sibling or spouse and survived : 132


In [9]:
# Passengers travelling with at least one parent or child (Parch > 0),
# and how many of those survived.
has_parch = train_data["Parch"] > 0
parch_survived = has_parch & (train_data['Survived'] == 1)

pass_with_parch = train_data.loc[has_parch, 'PassengerId'].nunique()
print(f"Passengers having parents or children : {pass_with_parch}")

pass_with_parch_and_survived = train_data.loc[parch_survived, 'PassengerId'].nunique()
print(f"Passengers having  parents or children and survived : {pass_with_parch_and_survived}")

Passengers having parents or children : 213
Passengers having  parents or children and survived : 109


In [10]:
# Number of passengers per survival outcome (0 = did not survive, 1 = survived).
# The output column names are pinned explicitly: a bare
# value_counts().reset_index() yields ("index", "Survived") on pandas 1.x but
# ("Survived", "count") on pandas >= 2.0, which would break code that reads the
# "index" / "Survived" columns of this frame.
survived_count = (
    train_data['Survived']
    .value_counts()
    .rename_axis('index')            # outcome value (0 / 1)
    .reset_index(name='Survived')    # number of passengers with that outcome
)

In [11]:
# Display the per-outcome passenger counts.
survived_count

Unnamed: 0,index,Survived
0,0,549
1,1,342


In [12]:
import plotly.express as px

In [13]:
# Bar chart of passenger counts per survival outcome.
# color_discrete_map="identity" makes plotly use the strings in `color` as
# literal CSS colors; without it 'red'/'green' are treated as category labels
# and the bars are drawn with the default palette instead.
# The colors follow the row order of survived_count (0 first, then 1), so
# red = did not survive and green = survived.
fig_survived = px.bar(
    data_frame=survived_count,
    x="index",
    y="Survived",
    color=['red', "green"],
    color_discrete_map="identity",
    labels={"index": "Survived (0 = No, 1 = Yes)", "Survived": "Passengers"},
    title="Survived Vs non-survived",
    text_auto=True,
)

In [14]:
# Render the survival bar chart.
fig_survived

In [15]:
# Number of passengers per ticket class.
# The column names are set via rename_axis / reset_index(name=...) instead of
# renaming after a bare reset_index(): on pandas >= 2.0 value_counts() already
# names its result "count" and its index "Pclass", so the old
# rename(columns={'index': 'clases', 'Pclass': 'count'}) produced two "count"
# columns and no "clases" column.
pclass_count = (
    train_data['Pclass']
    .value_counts()
    .rename_axis('clases')        # ticket class (1, 2, 3)
    .reset_index(name='count')    # passengers in that class
)

In [16]:
# Bar chart of passenger counts per ticket class.
# color_discrete_map="identity" makes plotly use the strings in `color` as
# literal CSS colors; without it they are treated as category labels and the
# bars are drawn with the default palette instead.
# NOTE(review): the colors are assigned in the row order of pclass_count
# (presumably descending count) - confirm that is the intended class-to-color
# pairing.
fig = px.bar(
    data_frame=pclass_count,
    x="clases",
    y='count',
    color=['red', "green", "blue"],
    color_discrete_map="identity",
    text_auto=True,
    title="Passenger Class Distribution",
)

In [17]:
# Render the passenger class distribution bar chart.
fig

In [18]:
# Passenger gender distribution.
# Labels and bar heights are taken from the same value_counts() result so each
# bar is always paired with its own category. Previously the labels came from
# unique() (order of first appearance) and the heights from value_counts()
# (descending count); those orders are not guaranteed to match.
sex_counts = train_data['Sex'].value_counts()
px.bar(
    x=sex_counts.index.tolist(),
    y=sex_counts.tolist(),
    color=['red', 'green'],
    color_discrete_map="identity",  # use the strings as literal CSS colors
    text_auto=True,
    labels={"x": "Sex", "y": "Passengers"},
    title="Passenger Gender Distribution",
)

In [19]:
# Raw passenger ages (display is truncated by pandas); missing ages show as NaN.
train_data['Age']

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [20]:
# Largest age recorded in the Age column.
train_data['Age'].max()

80.0

In [21]:
# Histogram of passenger ages, asking plotly for at most 10 bins.
px.histogram(
    data_frame=train_data,
    x="Age",
    nbins=10,
    title="Age distribution",
)