In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import Necessary Libraries

In [3]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Explore Dataset

In [4]:
# Load the maritime-disasters table from the notebook's input directory.
df = pd.read_csv(
    filepath_or_buffer='../input/maritime-disasters-of-the-20th-century/Maritime disasters of the 20th century.csv'
)

In [5]:
# Show the freshly loaded DataFrame as this cell's output.
df

# Data Preprocessing

In [6]:
# Remove the 'Unnamed: 0' column (the name pandas assigns to a header-less CSV
# column; presumably an index saved with the file — confirm against the raw data).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Add an 'Interval' column holding the decade of each disaster:
# floor the year to a multiple of ten, then append 's' to its string form.
decade_start = (df['The year of the disaster'] // 10) * 10
df['Interval'] = decade_start.astype(str) + 's'

In [8]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

In [9]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [10]:
# Visualise missing data: draw the boolean null mask as a heatmap,
# with the colour bar turned off.
null_mask = df.isnull()
sns.heatmap(null_mask, cbar=False)

In [11]:
# Percentage of missing values in each column, relative to the total row count.
null_counts = df.isnull().sum()
percent_missing = null_counts * 100 / len(df)
missing_value_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_missing': percent_missing,
    }
)
missing_value_df

In [12]:
# Persist the preprocessed frame to the working directory.
# The index is written too (pandas default), as the first CSV column.
df.to_csv(path_or_buf='FINAL.csv')

# EDA

In [13]:
# Plot the correlation matrix of the numeric variables.
# Select the numeric columns *before* calling .corr(): since pandas 2.0,
# DataFrame.corr() defaults to numeric_only=False and raises on text columns.
# Using one selection for both the matrix and the tick labels also guarantees
# that the labels line up with the plotted rows/columns.
numeric_df = df.select_dtypes(['number'])
f = plt.figure(figsize=(8,5))
plt.matshow(numeric_df.corr(), fignum=f.number)
tick_positions = range(numeric_df.shape[1])
plt.xticks(tick_positions, numeric_df.columns, fontsize=14, rotation=45)
plt.yticks(tick_positions, numeric_df.columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)

In [14]:
# Sum the deaths per country the ship belongs to, largest totals first.
data = df
country_columns = ['The country ship belongs to', 'Number of deaths']
deaths_by_country = data[country_columns].groupby(['The country ship belongs to']).sum()
cad = deaths_by_country.reset_index().sort_values('Number of deaths', ascending = False)

In [15]:
# The five countries with the highest death totals (cad is sorted descending).
cad.head()

In [16]:
# Bar chart of total deaths per country: countries on the y axis, totals on the x axis.
text_colour = 'midnightblue'
plt.figure(figsize = (15,15))
sns.barplot(data = cad, x = 'Number of deaths', y = 'The country ship belongs to', palette = 'mako')
plt.xticks(color=text_colour)
plt.yticks(color=text_colour)
plt.xlabel('Total Number of deaths', size = 15, color=text_colour)
plt.ylabel('')  # y-axis label intentionally left blank
plt.title('Total Number of deaths with respect to Countries', size = 15, color=text_colour)

In [17]:
# Re-check the first five rows before the next aggregation.
df.head()

In [18]:
# Sum the deaths per 'Interval' (decade label), largest totals first.
data = df
deaths_by_decade = data[['Interval', 'Number of deaths']].groupby(['Interval']).sum()
IAD = deaths_by_decade.reset_index().sort_values('Number of deaths', ascending = False)

In [19]:
# Display the per-decade death totals.
IAD

In [20]:
# Bar chart of total deaths per decade: decades on the x axis, totals on the y axis.
text_colour = 'midnightblue'
plt.figure(figsize = (18,15))
sns.barplot(data = IAD, x = 'Interval', y = 'Number of deaths', palette = 'magma')
plt.xticks(color=text_colour)
plt.yticks(color=text_colour)
plt.xlabel('Respective Decade', size = 15, color=text_colour)
plt.ylabel('Total Number of deaths', size = 15, color=text_colour)
plt.title('Total Number of deaths in respective Decade', size = 15, color=text_colour)

In [21]:
# Print each distinct ship name on its own line.
for ship_name in df['The name of the ship'].unique():
    print(ship_name)

In [22]:
# Sum the deaths per ship name, largest totals first.
data = df
deaths_by_ship = data[['The name of the ship', 'Number of deaths']].groupby(['The name of the ship']).sum()
SAD = deaths_by_ship.reset_index().sort_values('Number of deaths', ascending = False)

In [23]:
# The five ships with the highest death totals (SAD is sorted descending).
SAD.head()

In [24]:
# Bar chart of the 30 ships with the highest death totals:
# ship names on the y axis, totals on the x axis.
text_colour = 'midnightblue'
top_ships = SAD.nlargest(30, 'Number of deaths')
plt.figure(figsize = (15,15))
sns.barplot(data = top_ships, x = 'Number of deaths', y = 'The name of the ship', palette = 'magma')
plt.xticks(color=text_colour)
plt.yticks(color=text_colour)
plt.xlabel('Total Number of deaths', size = 15, color=text_colour)
plt.ylabel('The name of the Ship', size = 15, color=text_colour)
plt.title('Total Number of deaths against Top 30 Ships', size = 15, color=text_colour)

# Part Two: Maps

In [83]:
# Reload the preprocessed data from FINAL.csv, using its first column as the index.
df = pd.read_csv(filepath_or_buffer='./FINAL.csv', index_col=0)

In [84]:
# Shorten the country column's name to 'Country'.
df = df.rename({'The country ship belongs to': 'Country'}, axis='columns')

In [85]:
# Give the deaths column a space-free name so it can be used as an attribute.
df = df.rename({'Number of deaths': 'Number_of_Deaths'}, axis='columns')

In [86]:
# Confirm the renamed columns on the first five rows.
df.head()

In [87]:
# Replace the abbreviation "UK" with the full country name.
is_uk = df["Country"] == "UK"
df.loc[is_uk, "Country"] = "United Kingdom"

In [88]:
# Replace the abbreviation "USA" with the full country name.
is_usa = df["Country"] == "USA"
df.loc[is_usa, "Country"] = "United States"

In [89]:
# Check the normalised country names on the first ten rows.
df.head(n=10)

In [90]:
import pycountry 
def alpha3code(column):
    """Translate country names into 3-letter country codes via pycountry.

    Each value is looked up by its ``name`` in ``pycountry.countries``.

    Parameters
    ----------
    column : iterable
        Country names, e.g. a pandas Series.

    Returns
    -------
    list of str
        One entry per input value, in input order: the ``alpha_3`` code of
        the matching country, or the string ``'None'`` when no country
        matches.
    """
    CODE=[]
    for country in column:
        try:
            match=pycountry.countries.get(name=country)
        except (LookupError, AttributeError):
            # Only lookup-style failures are treated as "not found"
            # (presumably raised for unknown or non-string values such as NaN,
            # depending on the pycountry version — TODO confirm). Anything else,
            # e.g. NameError or KeyboardInterrupt, now propagates instead of
            # being silently turned into 'None'.
            match=None
        # .alpha_3 means 3-letter country code
        # .alpha_2 means 2-letter country code
        CODE.append(match.alpha_3 if match is not None else 'None')
    return CODE
# Attach the 3-letter country code for every row, then preview the result.
df['CODE'] = alpha3code(df['Country'])
df.head()

### **Reference:** https://melaniesoek0120.medium.com/data-visualization-how-to-plot-a-map-with-geopandas-in-python-73b10dcd4b4b

In [92]:
import geopandas
# 'naturalearth_lowres' is a dataset bundled with geopandas, so it can be read directly.
# NOTE(review): `geopandas.datasets` is reported as deprecated in geopandas 0.14 and
# removed in 1.0 — confirm the installed version still provides it.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
# Rename only the ISO code column, by name, so it can be used as the join key.
# (Overwriting `world.columns` positionally would raise, or silently mislabel
# columns, if the dataset's column count or order ever changed.)
world = world.rename(columns={'iso_a3': 'CODE'})
# Inner-join the country polygons with the disaster records on the 3-letter code.
merge=pd.merge(world,df,on='CODE')

# Per-country latitude/longitude table, fetched over the network.
location=pd.read_csv('https://raw.githubusercontent.com/melanieshi0120/COVID-19_global_time_series_panel_data/master/data/countries_latitude_longitude.csv')

# Add the coordinates by country name, order by death count (highest first),
# and renumber the rows 0..n-1 (the previous index is kept as an 'index' column).
merge = merge.merge(location,on='name').sort_values(by='Number_of_Deaths',ascending=False).reset_index()

In [93]:
# Display the merged geo/disaster frame.
merge

In [94]:
# Save the merged map table to the working directory (index included by default).
merge.to_csv(path_or_buf='Final_Map.csv')

In [112]:
# Colour each country polygon by 'Number_of_Deaths', binned into quantiles.
merge.plot(column='Number_of_Deaths', scheme="quantiles",
           figsize=(25, 20),
           legend=True,cmap='coolwarm')
plt.title('Maritime disaster deaths in different countries',fontsize=25)
# Label the first rows of `merge` (up to 10) with their country name at the
# row's longitude/latitude. Capping at len(merge) avoids a KeyError when fewer
# than 10 rows survived the merges.
for i in range(min(10, len(merge))):
    plt.text(float(merge.longitude[i]),float(merge.latitude[i]),str(merge.Country[i]),size=10)
plt.show()