In [1]:
import pandas as pd
import json
import numpy as np

In [None]:
# load dataset
df = pd.read_csv("../data/titanic_dataset.csv")

In [3]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
# data info
# Embarked: Port of embarkation - where each passenger boarded the Titanic
# Parch: Number of parents or children a passenger had aboard the Titanic
# SibSp: Number of siblings or spouses a passenger had aboard.


In [5]:
df.head(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [6]:
print(df.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# percentage of missing values
((df.isnull().sum() / len(df)).round(2)) * 100

PassengerId     0.0
Survived        0.0
Pclass          0.0
Name            0.0
Sex             0.0
Age            20.0
SibSp           0.0
Parch           0.0
Ticket          0.0
Fare            0.0
Cabin          77.0
Embarked        0.0
dtype: float64

In [9]:
# percentage of zeros in each column
(((df == 0).sum() / len(df)) * 100).round(2)

PassengerId     0.00
Survived       61.62
Pclass          0.00
Name            0.00
Sex             0.00
Age             0.00
SibSp          68.24
Parch          76.09
Ticket          0.00
Fare            1.68
Cabin           0.00
Embarked        0.00
dtype: float64

In [10]:
# check distinct values for categorical columns
cat_cols = ['Embarked', 'Cabin', 'Sex', 'Pclass', 'SibSp', 'Parch' ]
unique_values = {col: df[col].unique().tolist() for col in cat_cols}
pretty_json = json.dumps(unique_values, indent=2)
print(pretty_json)

{
  "Embarked": [
    "S",
    "C",
    "Q",
    NaN
  ],
  "Cabin": [
    NaN,
    "C85",
    "C123",
    "E46",
    "G6",
    "C103",
    "D56",
    "A6",
    "C23 C25 C27",
    "B78",
    "D33",
    "B30",
    "C52",
    "B28",
    "C83",
    "F33",
    "F G73",
    "E31",
    "A5",
    "D10 D12",
    "D26",
    "C110",
    "B58 B60",
    "E101",
    "F E69",
    "D47",
    "B86",
    "F2",
    "C2",
    "E33",
    "B19",
    "A7",
    "C49",
    "F4",
    "A32",
    "B4",
    "B80",
    "A31",
    "D36",
    "D15",
    "C93",
    "C78",
    "D35",
    "C87",
    "B77",
    "E67",
    "B94",
    "C125",
    "C99",
    "C118",
    "D7",
    "A19",
    "B49",
    "D",
    "C22 C26",
    "C106",
    "C65",
    "E36",
    "C54",
    "B57 B59 B63 B66",
    "C7",
    "E34",
    "C32",
    "B18",
    "C124",
    "C91",
    "E40",
    "T",
    "C128",
    "D37",
    "B35",
    "E50",
    "C82",
    "B96 B98",
    "E10",
    "E44",
    "A34",
    "C104",
    "C111",
    "C92",
    "E38",
   

In [11]:
df[df['Cabin'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
884,885,0,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.0500,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S


In [12]:
df[df['Embarked'].isna()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,B28,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,B28,


In [13]:
df['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [14]:
# the most common (frequent) value in the Embarked column.
df['Embarked'].mode()

0    S
Name: Embarked, dtype: object

In [15]:
print(df[df['Fare']==0][['PassengerId', 'Pclass', 'Embarked', 'Sex', 'Ticket', 'Age']])

     PassengerId  Pclass Embarked   Sex  Ticket   Age
179          180       3        S  male    LINE  36.0
263          264       1        S  male  112059  40.0
271          272       3        S  male    LINE  25.0
277          278       2        S  male  239853   NaN
302          303       3        S  male    LINE  19.0
413          414       2        S  male  239853   NaN
466          467       2        S  male  239853   NaN
481          482       2        S  male  239854   NaN
597          598       3        S  male    LINE  49.0
633          634       1        S  male  112052   NaN
674          675       2        S  male  239856   NaN
732          733       2        S  male  239855   NaN
806          807       1        S  male  112050  39.0
815          816       1        S  male  112058   NaN
822          823       1        S  male   19972  38.0


In [18]:
req_numeric_columns = ['Age', 'Fare']

In [19]:
# Coefficient of variation
cv = ((df[req_numeric_columns].std()/df[req_numeric_columns].mean()) * 100).sort_values(ascending=False)
# Skewness of features
skew = df[req_numeric_columns].skew().sort_values(ascending=False)
# Kurtosis of features
kurt = df[req_numeric_columns].kurtosis().sort_values(ascending=False)
print("Coefficient of Variation:")
print(cv)
print("\nSkewness:")
print(skew)
print("\nKurtosis:")
print(kurt)

Coefficient of Variation:
Fare    154.307253
Age      48.912219
dtype: float64

Skewness:
Fare    4.787317
Age     0.389108
dtype: float64

Kurtosis:
Fare    33.398141
Age      0.178274
dtype: float64
