### Task 1.0: Import necessary libraries and load the dataset.

In [47]:
# Load libraries

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio

# Styles

pd.options.display.max_columns = 200

In [48]:
# Load data

link = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic = pd.read_csv(link)

titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


#### 1.1 Variable descriptions:
* Pclass - Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)  
* survival - Survival (0 = No; 1 = Yes)  
* name - Name  
* sex - Sex  
* age - Age  
* sibsp - Number of Siblings/Spouses Aboard  
* parch - Number of Parents/Children Aboard  
* ticket - Ticket Number  
* fare - Passenger Fare (British pound)  
* cabin - Cabin  
* embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)  
* boat - Lifeboat  
* body - Body Identification Number  
* home.dest - Home/Destination  

#### 1.2 Data cleaning

* Check and change data types accordingly
* Check and remove duplicates
* Check and remove missing values
* Check and replace outliers

In [49]:
# Obtain the dimension (number of rows and columns) of the original dataset

titanic.shape

(891, 12)

#### Check and change data types accordingly

In [50]:
# Obtain data types for every column

titanic.dtypes

# All data in the series correspond to the data type

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

#### Check and remove duplicates

In [51]:
# Check duplicates in the data

titanic.loc[titanic.duplicated()]

# No duplicates in the data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


#### Check and remove missing values

In [52]:
# Check missing values

titanic.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [53]:
# Drop all rows with missing values

titanic = titanic.dropna()

In [54]:
# Obtain the dimension of the titanic data frame after dropping rows with missing values

titanic.shape

(183, 12)

In [55]:
# Check missing values after dropna()

titanic.isna().sum()

# There are no remaining missing values in the data.

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [56]:
# Summarize numerical variables

titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,183.0,183.0,183.0,183.0,183.0,183.0,183.0
mean,455.36612,0.672131,1.191257,35.674426,0.464481,0.47541,78.682469
std,247.052476,0.470725,0.515187,15.643866,0.644159,0.754617,76.347843
min,2.0,0.0,1.0,0.92,0.0,0.0,0.0
25%,263.5,0.0,1.0,24.0,0.0,0.0,29.7
50%,457.0,1.0,1.0,36.0,0.0,0.0,57.0
75%,676.0,1.0,1.0,47.5,1.0,1.0,90.0
max,890.0,1.0,3.0,80.0,3.0,4.0,512.3292


#### Check and replace outliers

In [57]:
# check outliers in the age variable using a box plot

fig = px.box(titanic, y=titanic["Age"])
fig.show()

# The age series has no outliers

In [58]:
# Check outliers in the Fare variable

fig = px.box(titanic, y= "Fare")  
fig.show()

#### Handling outliers in the fare series

There are outliers in the Fare variable. In this analysis, we will replace all values of Fare above the upper fence with the upper quartile of the series.

In [59]:
# Compute the Q1 and Q3

q3 = titanic["Fare"].quantile(0.75)
q1 = titanic["Fare"].quantile(0.25)

# Calculate the interquartile range (IQR)

iqr = q3 - q1

# Calculate the upper fence
# Upper fence = Q3 + 1.5*(IQR)

upper_fence = q3 + 1.5*iqr

# Define the outliers

outliers = titanic[titanic["Fare"] > upper_fence]

# Replace the outliers with upper quartile

titanic.loc[outliers.index, "Fare"] = q3

# Check for any outliers again using a box plot

fig = px.box(titanic, y=titanic["Fare"])  
fig.show()

# There are no remaining outliers in the Fare series.

In [60]:
# Summary statistics after removing outliers from the Fare series

titanic.describe()

# The average fare changed from 78.682 to 62.524

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,183.0,183.0,183.0,183.0,183.0,183.0,183.0
mean,455.36612,0.672131,1.191257,35.674426,0.464481,0.47541,62.523931
std,247.052476,0.470725,0.515187,15.643866,0.644159,0.754617,38.079942
min,2.0,0.0,1.0,0.92,0.0,0.0,0.0
25%,263.5,0.0,1.0,24.0,0.0,0.0,29.7
50%,457.0,1.0,1.0,36.0,0.0,0.0,57.0
75%,676.0,1.0,1.0,47.5,1.0,1.0,90.0
max,890.0,1.0,3.0,80.0,3.0,4.0,164.8667


#### Create new categorical variables from their respective numeric series.

In [61]:
# Create survived categories

def Survived_cats(value):
    if value == 1:
        return "Yes"
    else:
        return "No"
 
titanic['Survived_cats'] = titanic['Survived'].map(Survived_cats)
display(titanic.head())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived_cats
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Yes
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Yes
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,No
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,Yes
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S,Yes


In [62]:
# Create class categories

def Pclass_cats(value):
    if value == 1:
        return "First class"
    if value == 2:
        return "Second class"
    elif value == 3:
        return "Third class"
 
titanic['Pclass_cats'] = titanic['Pclass'].map(Pclass_cats)
display(titanic.head())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived_cats,Pclass_cats
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Yes,First class
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Yes,First class
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,No,First class
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7,G6,S,Yes,Third class
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S,Yes,First class


### Task 2: Create a bar plot to visualize the number of passengers who survived and did not survive, broken down by gender.

The .size() function in step 1 is often used in conjunction with the .groupby() function to get the number of elements in each group.  
It counts the number of rows in each group.  

In [63]:
# Step 1: Group the data by gender and survival, then store it as a dataframe

group_gend_surv = (
    titanic.groupby(["Sex", "Survived_cats"])
    .size()
    .to_frame(name="frequency")
    .reset_index()
                     )
group_gend_surv

Unnamed: 0,Sex,Survived_cats,frequency
0,female,No,6
1,female,Yes,82
2,male,No,54
3,male,Yes,41


In [64]:
# Step 2: Bar chart

fig_a = px.bar(group_gend_surv, x="Sex", y="frequency", color="Survived_cats", barmode='group',
      labels={"frequency": "Number of passengers", "Sex": "Gender", "Survived_cats": "Survived"},
      title="Distribution of passengers by survival and gender")
fig_a.show()

In [65]:
# Step 3: Calculate the percentages

group_gend_surv["Percentage"] = (group_gend_surv["frequency"]/group_gend_surv["frequency"].sum())*100
group_gend_surv

Unnamed: 0,Sex,Survived_cats,frequency,Percentage
0,female,No,6,3.278689
1,female,Yes,82,44.808743
2,male,No,54,29.508197
3,male,Yes,41,22.404372


In [66]:
# Step 4: Bar chart showing values and percentages (Percent of grand total)

fig_a = px.bar(
    group_gend_surv, 
    x="Sex", 
    y="frequency", 
    color="Survived_cats", 
    barmode='group',
    labels={"frequency": "Number of passengers", "Sex": "Gender", "Survived_cats": "Survived"},
    title="Distribution of passengers by survival and gender",
    text=[
        "{} ({:.1f}%)".format(count, proportion)
        for count, proportion in zip(group_gend_surv["frequency"], group_gend_surv["Percentage"])
         ],
)

fig_a.update_traces(textposition="inside")
fig_a.show()

### Bar chart interpretation

The distribution of Titanic passengers by gender and survival status is shown in the accompanying bar chart. It displays the number of passengers and the corresponding percentages for every combination of gender and survival outcome.

It is clear from the plot that a much greater percentage of female passengers (44.8%) than male passengers (22.4%) survived the Titanic disaster. Out of the 95 male passengers, less than half (41 men) managed to survive. On the other hand, a whopping 82 of the roughly 88 female passengers made it out alive.

The gender prejudice that persisted during the Titanic disaster is shown by the striking disparity in survival rates between the sexes. This could have been brought about by the fact that socioeconomic structure dictates saving women and children first in dangerous situations, despite no international marine law. It's probable that the rescue mission emulated humanitarian organizations who often evacuate vulnerable civilians, such as women and children, first (Gupta, K., Sharma, P., & Bouza Herrera, C. N. (2018). Surviving the Titanic tragedy: A sociological study using machine learning models. Suma de Negocios, 9(20), 86-92.). Females had a far higher survival rate than males as a result of this unfair treatment.

### Task 3: Create a histogram to display the distribution of passengers' ages. Use different colors for passengers who survived and did not survive.

In [67]:
# Histogram

fig_b = px.histogram(titanic, x="Age", color="Survived_cats", title="Age distribution",
                  labels={"Age": "Passenger Age", "Survived_cats": "Survived"})
fig_b.show()

#### Histogram interpretation

Based on their age and level of survival, the supplied histogram shows how the passengers were distributed. 
It shows how many passengers fell into each age group, emphasizing the distinctions between those who made it through and those 
who did not.

The histogram demonstrates that, on average, children and younger adults (persons from ages zero to 42.4) had higher survival rates than the older adults except for 47.5 - 52.4 age category. This distribution implies that can not solely explain survival and therefore variables other than age, such gender and class, might have affected the likelihood of surviving. The histogram also indicates that a larger percentage of passengers had their ages betweem 12.5 and 62.4. 


To sum up, the histogram offers insightful information about the connection between age and survival on the Titanic. It draws attention to the high survival rate for kids and young adults, the disparities in survival rates between age groups, and the general age distribution of the passengers.


### Task 4: Create a violin plot to visualize the distribution of fare prices paid by passengers, broken down by their class (1st, 2nd, and 3rd class).

In [68]:
# Violin plot

fig_c = px.violin(titanic, y="Fare", x="Pclass", title="Violin plot showing the distribution of fare across passenger class",
               labels={"Fare": "Fare Prices", "Pclass": "Passenger Class"})
fig_c.show()

#### Violin plot interpretation

The violin plot, which displays the distribution of Titanic fare costs by passenger class, offers several significant insights:

Class Disparity: The violin plot makes the striking variations in fare pricing between classes quite evident. First class travelers paid fare that was substantially more than that of second and third class passengers. This discrepancy could be as a result of differences in comfort and luxury provided to passengers in each class.

Distribution of Fare rates: Within each passenger class, the violin plot also displays the distribution of fare rates. A longer tail extends to higher charges, and first-class customers' fares are more widely dispersed. This implies that first-class passengers had access to a wider selection of fare options. Second-class and third-class passengers, on the other hand, have more closely clustered fare levels, with fewer individuals paying extremely high or extremely low fares.

Median Fare rates: The horizontal notches in the violin plot also show the median fare rates for each passenger class. First-class travelers often paid about £71.142 for their tickets, while second-class and third-class passengers typically paid about £13 and £10.463, respectively. The Titanic's fare pricing difference across classes is further supported by these median fares.

In summary, the violin plot offers an insightful representation of the Titanic fare distribution by passenger class. It draws attention to the obvious differences in fare pricing between classes and sheds light on how fare prices vary even within a class. By using this information, one can gain a deeper understanding of the effects of such socio-economic dynamics on the survival status during the Titanic disaster.

### Task 5: Generate a scatter plot to explore the relationship between passengers' ages and fare prices, using different colors and symbols for the different classes.

In [69]:
# Scatter plot

fig_d = px.scatter(titanic, x="Age", y="Fare", color="Pclass_cats", symbol="Pclass_cats",
                labels={"Fare": "Fare prices", "Age": "Passenger age", "Pclass_cats": "Class"},
                title="Fare distribution by age")
fig_d.show()

#### Scatter plot interpretation

Several noteworthy observations may be made from the scatterplot on the Titanic that illustrates the relationship between fare paid, passenger age, and passenger class:

Class Disparity: The substantial differences in fare prices between classes are amply demonstrated by the scatterplot. First-class passengers paid hihger fare accross all ages than second-class and third-class passengers. 


Age Distribution: Passengers of various ages were on board the Titanic, with a concentration in the middle age groups, according to the data points distributed along the age axis. 

Class-Specific Trends: Age and fare within each passenger class do not show any linear association. This implies that age played no part in determining the fare a ppassenger paid.

Outliers: A number of passengers that do not fit the general trend are also displayed in the scatterplot. There are three first-class passengers who paid less fare than second and third class passengers.


### Task 6: Use faceting to create a scatter plot matrix displaying the relationship between age, fare, and class for passengers who survived and did not survive.

In [70]:
# Facet scatter plot

fig_e = px.scatter(titanic, x="Age", y="Fare", color="Pclass_cats",
                facet_col="Pclass_cats", facet_row="Survived_cats",
                 labels={"Fare": "Fare prices", "Age": "Passenger age", "Pclass_cats": "Class", "Survived_cats": "Survived"},
                title="Fare distribution by age and survival")
fig_e.show()

#### Interpretation of scatter plot matrix

From the matrix, age and fare correlations differ depending on the class of traveler. There is no discernible relationship between fare and traveller's age in the first class panel. However, there seems to be a negative correlation between age and fare among first and second class passengers who survved. This is to mean that the younger the passenger, the higher the fare. 

There is no correlation between the fare and the age of the traveler among those who perished. Overall, the majority of the passengers were in the first class.



### Task 7: Export your final visualizations as HTML files to share with others.

In [71]:
# Export

pio.write_html(fig_a, "cluster_bar.html")

pio.write_html(fig_b, "histogram.html")

pio.write_html(fig_c, "violin_plot.html")

pio.write_html(fig_d, "scatter_plot.html")

pio.write_html(fig_e, "scatter_facet.html")


In [72]:
# Export the DataFrame to Excel

titanic.to_excel('titanic.xlsx')