# Titanic Dataset

In [83]:
import pandas as pd
import numpy as np

## 1_ Let's import the CSV file and take a first look at the dataset

In [84]:
# Read the passenger records from the CSV and preview the first five rows.
df = pd.read_csv("Titanic-Dataset.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [85]:
# Show the dtype pandas assigned to each column when the CSV was read.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## 2_ Cleaning/Formatting dataset

### 2.1_ Check for duplicated rows

In [86]:
def dropcheck(a, b):
    """Print how many rows a ``drop_duplicates()`` call removed.

    Parameters
    ----------
    a : number of rows BEFORE drop_duplicates()
    b : number of rows AFTER drop_duplicates()

    Returns
    -------
    None. The summary message is written to stdout.
    """
    if a == b:
        message = "No Duplicated Rows were found"
    elif a - b == 1:
        message = "One duplicated row was found and deleted"
    else:
        message = f"{a - b} duplicated rows were found and deleted"
    print(message)

# Record the row count on either side of drop_duplicates() and report the difference.
rows_before_drop = df.shape[0]
df = df.drop_duplicates()
rows_after_drop = df.shape[0]

dropcheck(rows_before_drop, rows_after_drop)

No Duplicated Rows were found


### 2.2_ Drop columns we don't need

In [87]:
# Remove the columns that the analysis below never reads.
df = df.drop(["SibSp", "Parch", "Ticket", "Cabin", "Age"], axis="columns")

### 2.3_ Format "Ticket_price" column

In [88]:
# Rename "Fare" to "Ticket_price" and round the values to two decimals.
df = (
    df
    .rename(columns={"Fare": "Ticket_price"})
    .assign(Ticket_price=lambda frame: frame["Ticket_price"].round(2))
)

### 2.4_ Format "Embarked" column

In [89]:
# Expand the single-letter port codes into readable names.
# Series.replace with a dict matches whole cell values, so the result does not
# depend on the order of replacements the way chained substring
# .str.replace calls do; missing values are left untouched.
# NOTE(review): the labels are kept exactly as before so downstream outputs
# still match; "Southhampton" / "Cheburgo" look like misspellings of
# Southampton / Cherbourg — confirm before renaming.
df["Embarked"] = df["Embarked"].replace(
    {"S": "Southhampton", "C": "Cheburgo", "Q": "Cove"}
)

## 3_ Insights on dataset

### 3.1_ How many people embarked per class

In [90]:
# Passengers per class: number of non-null names inside each Pclass group.
df.groupby("Pclass")["Name"].count()

Pclass
1    216
2    184
3    491
Name: Name, dtype: int64

### 3.2_ How many people survived per class

In [91]:
# Survivors per class: summing the Survived flag counts the rows where it is 1.
df.groupby("Pclass")["Survived"].sum()

Pclass
1    136
2     87
3    119
Name: Survived, dtype: int64

### 3.3_ Mean and Sum of Survivors distributed per Pclass

In [1]:
# Per-class mean and sum of the Survived flag.
# The mean can be read as the survival probability for each Pclass.
# String aliases replace np.mean / np.sum: passing NumPy callables to .agg()
# is deprecated in recent pandas, and the aliases yield the same
# "mean" / "sum" column labels.
df.groupby(["Pclass"])["Survived"].agg(["mean", "sum"])

NameError: name 'df' is not defined

### 3.4_ Total Amount spent per Pclass

In [92]:
# Total of Ticket_price within each Pclass group.
df.groupby("Pclass")["Ticket_price"].sum()

Unnamed: 0_level_0,Survived,Ticket_price
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,136,18177.4
2,87,3801.84
3,119,6714.85


### 3.5_ How Many people Survived per Pclass and Sex

In [93]:
# Survivor count for every (Pclass, Sex) pair, returned as a flat table
# with the group keys as ordinary columns.
df.groupby(["Pclass", "Sex"])["Survived"].sum().reset_index()

Unnamed: 0,Pclass,Sex,Survived
0,1,female,91
1,1,male,45
2,2,female,70
3,2,male,17
4,3,female,72
5,3,male,47


### 3.6_ How many people Embarked in each port.

In [94]:
# Number of passengers per embarkation port, as a flat table.
# Rows with a missing port are excluded by groupby.
df.groupby("Embarked")["Pclass"].count().reset_index()

Unnamed: 0,Embarked,Pclass
0,Cheburgo,168
1,Cove,77
2,Southhampton,644


### 3.7_ Second block of 100 passengers registered in this dataset (rows 100–199)

In [96]:
# Positional slice: rows 100 through 199 (the end bound is exclusive).
df.iloc[100:200]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Ticket_price,Embarked
100,101,0,3,"Petranec, Miss. Matilda",female,7.90,Southhampton
101,102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,7.90,Southhampton
102,103,0,1,"White, Mr. Richard Frasar",male,77.29,Southhampton
103,104,0,3,"Johansson, Mr. Gustaf Joel",male,8.65,Southhampton
104,105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,7.92,Southhampton
...,...,...,...,...,...,...,...
195,196,1,1,"Lurette, Miss. Elise",female,146.52,Cheburgo
196,197,0,3,"Mernagh, Mr. Robert",male,7.75,Cove
197,198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,8.40,Southhampton
198,199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,7.75,Cove


### 3.8_ Get row for the last person registered on this dataset

In [97]:
# Last row of the frame, kept as a one-row DataFrame.
df.tail(1)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Ticket_price,Embarked
890,891,0,3,"Dooley, Mr. Patrick",male,7.75,Cove


### 3.9_ Average, Max, Min Ticket_price paid per Pclass

In [98]:
# Mean, maximum and minimum Ticket_price for each Pclass.
# String aliases replace np.mean / np.max / np.min: passing NumPy callables to
# .agg() is deprecated in recent pandas, and the resulting column labels
# ("amax"/"amin" vs "max"/"min") varied with the installed NumPy version.
# With aliases the labels are always "mean", "max", "min".
df.groupby(["Pclass"], as_index=False)["Ticket_price"].agg(["mean", "max", "min"])

Unnamed: 0,Pclass,mean,amax,amin
0,1,84.15463,512.33,0.0
1,2,20.662174,73.5,0.0
2,3,13.675866,69.55,0.0


### 3.10_ Free Tickets

In [99]:
# The per-class minimum above is 0, so some passengers did not pay.
# Select them with a boolean mask on Ticket_price.

free_tickets = df["Ticket_price"].eq(0)
free_tickets_table = df.loc[free_tickets]
df_free_tickets = pd.DataFrame(free_tickets_table)
df_free_tickets



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Ticket_price,Embarked
179,180,0,3,"Leonard, Mr. Lionel",male,0.0,Southhampton
263,264,0,1,"Harrison, Mr. William",male,0.0,Southhampton
271,272,1,3,"Tornquist, Mr. William Henry",male,0.0,Southhampton
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,0.0,Southhampton
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,0.0,Southhampton
413,414,0,2,"Cunningham, Mr. Alfred Fleming",male,0.0,Southhampton
466,467,0,2,"Campbell, Mr. William",male,0.0,Southhampton
481,482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,0.0,Southhampton
597,598,0,3,"Johnson, Mr. Alfred",male,0.0,Southhampton
633,634,0,1,"Parr, Mr. William Henry Marsh",male,0.0,Southhampton


In [100]:
# Count the free-ticket passengers per embarkation port.
df_free_tickets.groupby("Embarked")["Embarked"].count()

Embarked
Southhampton    15
Name: Embarked, dtype: int64

### 3.11_ First Class ticket price analysis

In [101]:
# For every first-class passenger, compute how far the ticket price is from
# the first-class average (a signed deviation, not a statistical variance).

atts2 = ["Pclass", "Name", "Ticket_price"]
first_class_passengers = df["Pclass"] == 1

# One .loc call selects rows and columns together instead of chaining
# df[cols][mask]. The explicit copy guarantees that the columns added below
# never write through to `firstclass` or `df`.
firstclass = df.loc[first_class_passengers, atts2]
df_firstclass = firstclass.copy()

# The average is rounded to 2 decimals before subtracting, so the deviations
# are relative to the rounded value.
df_firstclass["Avg_FirstClass_Price"] = round(df_firstclass["Ticket_price"].mean(), 2)
# NOTE(review): the column name "Variance" is kept for compatibility with the
# existing output; the values are price minus average.
df_firstclass["Variance"] = df_firstclass["Ticket_price"] - df_firstclass["Avg_FirstClass_Price"]
df_firstclass


Unnamed: 0,Pclass,Name,Ticket_price,Avg_FirstClass_Price,Variance
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",71.28,84.15,-12.87
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",53.10,84.15,-31.05
6,1,"McCarthy, Mr. Timothy J",51.86,84.15,-32.29
11,1,"Bonnell, Miss. Elizabeth",26.55,84.15,-57.60
23,1,"Sloper, Mr. William Thompson",35.50,84.15,-48.65
...,...,...,...,...,...
871,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",52.55,84.15,-31.60
872,1,"Carlsson, Mr. Frans Olof",5.00,84.15,-79.15
879,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",83.16,84.15,-0.99
887,1,"Graham, Miss. Margaret Edith",30.00,84.15,-54.15


### 3.12_ Identify people who died

#### 3.12.A_ Men who died

In [102]:


# Boolean masks over the full frame; `fem` and `dead_people` are reused by the
# "Women who died" cell.
male = df["Sex"] == "male"
fem = df["Sex"] == "female"
dead_people = df["Survived"] == 0

# Combine the masks with & and index once. Chaining [male][dead_people]
# applied a full-length mask to an already-filtered Series and relied on
# pandas re-aligning the mask by index.
df_men_dead = df.loc[male & dead_people, "Name"]

df_men_dead_df = df_men_dead.to_frame()
df_men_dead_df



Unnamed: 0,Name
0,"Braund, Mr. Owen Harris"
4,"Allen, Mr. William Henry"
5,"Moran, Mr. James"
6,"McCarthy, Mr. Timothy J"
7,"Palsson, Master. Gosta Leonard"
...,...
881,"Markun, Mr. Johann"
883,"Banfield, Mr. Frederick James"
884,"Sutehall, Mr. Henry Jr"
886,"Montvila, Rev. Juozas"


#### 3.12.B_ Women who died

In [103]:


# Names of female passengers with Survived == 0.
# `fem` and `dead_people` are the boolean masks built in the "Men who died" cell.
# The masks are combined with & and applied in one .loc call rather than
# chaining [fem][dead_people], which relied on implicit index re-alignment
# of the second mask.
df_women_dead = df.loc[fem & dead_people, "Name"]

df_women_dead_df = df_women_dead.to_frame()
df_women_dead_df

Unnamed: 0,Name
14,"Vestrom, Miss. Hulda Amanda Adolfina"
18,"Vander Planke, Mrs. Julius (Emelia Maria Vande..."
24,"Palsson, Miss. Torborg Danira"
38,"Vander Planke, Miss. Augusta Maria"
40,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)"
...,...
854,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)"
863,"Sage, Miss. Dorothy Edith ""Dolly"""
882,"Dahlberg, Miss. Gerda Ulrika"
885,"Rice, Mrs. William (Margaret Norton)"
