# Data Visualization

A worldwide database of aviation accidents from 1908 to 2019. It includes:

* All civil and commercial aviation accidents of scheduled and non-scheduled passenger airliners worldwide, which resulted in a fatality (including all U.S. Part 121 and Part 135 fatal accidents)
* All cargo, positioning, ferry and test flight fatal accidents.
* All military transport accidents with 10 or more fatalities.
* All commercial and military helicopter accidents with more than 10 fatalities.
* All civil and military airship accidents involving fatalities.
* Aviation accidents involving the death of famous people.
* Aviation accidents or incidents of noteworthy interest.

Data was scraped from the Plane Crash Info website (http://www.planecrashinfo.com/database.htm).

You can download this dataset from https://www.kaggle.com/cgurkan/airplane-crash-data-since-1908

## Data Cleaning

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import numpy as np
import datetime as dt

# Load the crash records into the notebook-wide DataFrame `df`.
# The path is relative, so the CSV must sit in the current working directory.
df = pd.read_csv('Airplane_Crashes.csv')

In [None]:
# Print the (rows, columns) shape of the loaded data, then display the first rows.
print(df.shape)
df.head()

In [None]:
# Malformed fragments found in the raw 'Time' column, mapped to their fixes.
# Replacements are substring substitutions applied in insertion order, so the
# longer 'c: ' and 'c:' prefixes must be removed before the bare 'c'.
dict_errors = {'c: ': '', 'c:': '', 'c': '', '12\'20': '12:20',
               '18.40': '18:40', '0943': '09:43', '22\'08': '22:08', '114:20': '00:00',
               '91:5': '9:15', '90:0': '9:00', '24:5': '2:45'}

# regex=False treats every key as literal text. Without it the result depends
# on the pandas version (the default changed from regex to literal matching),
# and keys such as '18.40' contain regex metacharacters.
for key, val in dict_errors.items():
    df['Time'] = df['Time'].str.replace(key, val, regex=False)

# Combine date and time text; a missing Date or Time yields a null DateTime.
df['DateTime'] = df['Date'] + ' ' + df['Time']

# Keep only rows with a usable timestamp. The explicit copy makes `df` an
# independent frame, so the column assignment below does not trigger
# SettingWithCopyWarning.
df = df[df['DateTime'].notnull()].copy()

df['DateTime'] = pd.to_datetime(df['DateTime'])

print(df.shape)
df.head()

In [None]:
# Display the dtype of every column in df.
df.dtypes

## Accidents by year, month, day, and hour

In [None]:
# Number of recorded accidents per calendar year, drawn as a line chart.
accident_year = df['DateTime'].dt.year
temp = df.groupby(accident_year)[['DateTime']].count()

fig = plt.figure(figsize=(10, 5))
plt.style.use('bmh')
# The axes are created only after the style is selected, as in a bare plt.plot call.
ax = fig.gca()
ax.plot(
    temp.index,
    'DateTime',
    data=temp,
    color='crimson',
    marker=".",
    linewidth=1,
)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('Count', fontsize=10)
ax.set_title('Count of accidents by Year', loc='Center', fontsize=14)
plt.show()

In [None]:
# Accident count per calendar month; after reset_index the month number is
# in the 'DateTime' column and the row index is the positional 0..n-1 range.
temp_month = df.groupby(df.DateTime.dt.month)[['DateTime']].size().to_frame('Count').reset_index()

plt.style.use('bmh')
plt.figure(figsize=(10,5))
# x and y must be passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.barplot(x='DateTime', y='Count', data=temp_month, color='slateblue', linewidth=2)
# Bars sit at positions 0..n-1, which matches the reset row index.
plt.xticks(temp_month.index, ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlabel('Month', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.title('Count of accidents by Month', loc='Center', fontsize=14);

In [None]:
# Accident count per day of the week (pandas weekday: 0 = Monday ... 6 = Sunday).
temp_day = df.groupby(df.DateTime.dt.weekday)[['DateTime']].size().to_frame('Count').reset_index()

plt.figure(figsize=(10,5))
plt.style.use('bmh')
# x and y must be passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.barplot(x='DateTime', y='Count', data=temp_day, color='steelblue', linewidth=2)
# Bars sit at positions 0..n-1, which matches the reset row index.
plt.xticks(temp_day.index, ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.xlabel('Day', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.title('Count of accidents by day', loc='Center', fontsize=14);

In [None]:
# Accident count per hour of the day, taken from the parsed DateTime column.
temp_hour = df.groupby(df.DateTime.dt.hour)[['DateTime']].size().to_frame('Count').reset_index()

plt.figure(figsize=(10,5))
plt.style.use('bmh')
# x and y must be passed by keyword: seaborn >= 0.12 rejects positional data arguments.
sns.barplot(x='DateTime', y='Count', data=temp_hour, color='indianred', linewidth=2)
plt.xticks(temp_hour.index)
# The x-axis shows hours; it was previously mislabelled 'Day'.
plt.xlabel('Hour', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.title('Count of accidents by hour of the day', loc='Center', fontsize=14);

In [None]:
# Combined figure: accidents by month (full-width top row), by weekday and by
# hour (bottom row). The three count tables are recomputed here so that this
# cell does not depend on the single-plot cells having been run.
temp_month = df.groupby(df.DateTime.dt.month)[['DateTime']].size().to_frame('Count').reset_index()
temp_day = df.groupby(df.DateTime.dt.weekday)[['DateTime']].size().to_frame('Count').reset_index()
temp_hour = df.groupby(df.DateTime.dt.hour)[['DateTime']].size().to_frame('Count').reset_index()

fig = plt.figure(figsize=(12,8))
grid = gridspec.GridSpec(2, 2)
plt.style.use('bmh')

# NOTE: x and y are passed by keyword in every barplot call below because
# seaborn >= 0.12 rejects positional data arguments. Each barplot draws on the
# subplot added just before it (the current axes).

#first row
ax1 = fig.add_subplot(grid[0, :])
sns.barplot(x='DateTime', y='Count', data=temp_month, color='slateblue', linewidth=2)
plt.xticks(temp_month.index, ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
plt.xlabel('Month', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.title('Count of accidents by Month', loc='Center', fontsize=14);

#second row, first column
ax2 = fig.add_subplot(grid[1, 0])
sns.barplot(x='DateTime', y='Count', data=temp_day, color='steelblue', linewidth=2)
plt.xticks(temp_day.index, ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
plt.xlabel('Day', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.title('Count of accidents by day', loc='Center', fontsize=14);

#second row, second column
ax3 = fig.add_subplot(grid[1, 1])
sns.barplot(x='DateTime', y='Count', data=temp_hour, color='indianred', linewidth=2)
plt.xticks(temp_hour.index)
plt.xlabel('Hour', fontsize=10)
plt.ylabel('Count', fontsize=10)
plt.title('Count of accidents by hour of the day', loc='Center', fontsize=14);

## Military vs Passenger Flights

In [None]:
# Flag each accident as military when its operator name contains 'Military'.
# na=False maps a missing Operator to False; otherwise the flag would be NaN,
# and NaN is truthy, so a non-zero count would report those rows as military.
df['Military'] = df['Operator'].str.contains('Military', na=False)
# Every accident not flagged as military is treated as a passenger flight
# (this now includes rows whose Operator is missing).
df['Passenger'] = ~df['Military']
# Summing boolean flags counts the True values per year.
temp = df.groupby(df.DateTime.dt.year)[['Military', 'Passenger']].sum()
temp.head()

In [None]:
# Yearly accident counts for military and passenger flights on one chart.
plt.figure(figsize=(12, 6))
ax = plt.gca()
for column, line_color in (('Military', 'forestgreen'), ('Passenger', 'crimson')):
    # With data=, the column name passed as y also becomes the legend label.
    ax.plot(temp.index, column, data=temp, color=line_color, marker=".", linewidth=1)
ax.legend(fontsize=10)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('Count', fontsize=10)
ax.set_title('Count of accidents by Year', loc='Center', fontsize=14)
plt.show()

## Fatalities

In [None]:
# Yearly totals of the numeric columns (used below for 'Aboard' and 'Fatalities').
# numeric_only=True skips the datetime and text columns, which pandas >= 2.0
# would otherwise try to sum and fail on with a TypeError.
fatalities = df.groupby(df.DateTime.dt.year).sum(numeric_only=True)
fatalities.tail()

In [None]:
# People aboard vs. fatalities per year: a line for each total, with the area
# beneath it shaded.

plt.figure(figsize=(12, 6))
ax = plt.gca()
years = fatalities.index

# (column, shading colour, line colour), drawn in this order.
layers = (
    ('Aboard', "skyblue", "Slateblue"),
    ('Fatalities', "olive", "olive"),
)
for column, shade_color, line_color in layers:
    ax.fill_between(years, column, data=fatalities, color=shade_color, alpha=0.2)
    ax.plot(years, column, data=fatalities, color=line_color, marker=".", alpha=0.6, linewidth=1)

ax.legend(fontsize=10)
ax.set_xlabel('Year', fontsize=10)
ax.set_ylabel('Number of people', fontsize=10)
ax.set_title('Total number of people involved by Year', loc='Center', fontsize=14);

## Operators

In [None]:
# The 15 operators with the most recorded crashes, largest count first.
crashes_per_operator = df.groupby('Operator').size()
total_by_op = (
    crashes_per_operator
    .to_frame('Count')
    .sort_values(by='Count', ascending=False)
    .head(15)
)
total_by_op.head()

In [None]:
# Crashes by operator: horizontal bars for the operators in total_by_op,
# with the operator name (the frame's index) on the y-axis.

plt.figure(figsize=(12,6))
sns.barplot(y=total_by_op.index, x="Count", data=total_by_op, palette="gist_heat", orient='h')
plt.xlabel('Count', fontsize=11)
plt.ylabel('Operator', fontsize=11)
# Title spelling corrected ('Opeartor' -> 'Operator').
plt.title('Total Crashes by Operator', loc='Center', fontsize=14)
plt.show()

In [None]:
# The 15 operators with the highest total fatalities, largest first.
# numeric_only=True skips the datetime and text columns, which pandas >= 2.0
# would otherwise try to sum and fail on with a TypeError.
fat_by_op = (
    df.groupby('Operator')
    .sum(numeric_only=True)
    .sort_values(by='Fatalities', ascending=False)
    .head(15)
)
fat_by_op.head()

In [None]:
# Fatalities by operator: horizontal bars for the operators in fat_by_op,
# with the operator name (the frame's index) on the y-axis.

plt.figure(figsize=(12,6))
sns.barplot(y=fat_by_op.index, x="Fatalities", data=fat_by_op, palette="gist_heat", orient='h')
# The x-axis shows summed fatalities, not a crash count.
plt.xlabel('Fatalities', fontsize=11)
plt.ylabel('Operator', fontsize=11)
# Title spelling corrected ('Opeartor' -> 'Operator').
plt.title('Total Fatalities by Operator', loc='Center', fontsize=14)
plt.show()