<a href="https://colab.research.google.com/github/Khaled-Abdelhamid/Death-Big-data-Analytics/blob/Ahmed/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting Drive and setting up the environment

In [2]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# accessed through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q findspark

In [4]:
import os

# Environment variables locating the Java 8 JDK and the unpacked Spark 2.4.7
# distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.7-bin-hadoop2.7",
})
# Alternative SPARK_HOME for a Spark copy kept on Drive:
# os.environ["SPARK_HOME"] ="/content/drive/MyDrive/Colab Notebooks/BigData/spark-2.4.7-bin-hadoop2.7"


In [5]:
# findspark.init() is called before pyspark is imported; the import below
# relies on it having run first.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Get (or create) a session whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [6]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, StringType

import json

In [None]:
import numpy as np 
import pandas as pd 
import time
import json
import gc
import xgboost as xgb 
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

# Data loading and exploration

In [None]:
start = time.time()
data_path="/content/drive/MyDrive/Colab Notebooks/BigData/Final project/Death-Big-data-Analytics/archive"
# Read only the CSV files. The archive folder also holds JSON code files
# (e.g. 2015_codes.json), and reading the bare directory would parse those
# as CSV rows as well, polluting the DataFrame with junk records.
csv_glob = f"{data_path}/*.csv"
df = spark.read.options(header=True, inferSchema=True).csv(csv_glob)
df.show(truncate=False)
# Elapsed wall-clock time for the load and preview, in minutes.
print((time.time()-start)/60)

In [None]:
# Display the (column name, data type) pairs of the loaded DataFrame.
df.dtypes # see the datatypes of each column

In [None]:
# Number of rows for each distinct value of current_data_year.
rows_per_year = df.groupBy('current_data_year').count()
rows_per_year.show()

In [None]:
# Count rows per year, keeping only rows whose year is one of 2005-2015.
# The SQL condition is assembled as: current_data_year in ('2005',...,'2015')
quoted_years = ",".join(f"'{yr}'" for yr in range(2005, 2016))
year_condition = f"current_data_year in ({quoted_years})"
df.filter(year_condition).groupBy('current_data_year').count().show()


In [None]:
# Row counts restricted to the sex values "M" and "F".
known_sex_counts = df.filter('sex in ("M","F")').groupBy('sex').count()
known_sex_counts.show()

In [None]:
# Row counts for every distinct value found in the sex column (no filtering).
(
    df.groupBy('sex')
      .count()
      .show()
)

In [None]:
# Parse the 2015 code-table JSON file and display the result.
codes_path="/content/drive/MyDrive/Colab Notebooks/BigData/Final project/Death-Big-data-Analytics/archive/2015_codes.json"
with open(codes_path) as handle:
    codes = json.loads(handle.read())
codes

# Getting the top 10 causes of death for each race

In [None]:
# Row counts for every combination of race and the four cause-recode columns.
race_cause_columns = [
    'race',
    '358_cause_recode',
    '113_cause_recode',
    '130_infant_cause_recode',
    '39_cause_recode',
]
df_deaths = df.groupBy(*race_cause_columns).count()

In [None]:
# Write one CSV per race code containing that race's 10 most frequent
# cause-recode combinations.
race_dir = '/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_race'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(race_dir, exist_ok=True)
for race_id in codes['race']:
  path=f'{race_dir}/race_{race_id}.csv'
  df_deaths.filter(df_deaths.race==race_id)\
           .sort(F.desc("count")).limit(10)\
           .toPandas()\
           .to_csv(path,header=True)

# Getting the top 20 causes of death for each gender

In [None]:
# Row counts for every combination of sex and the four cause-recode columns.
sex_cause_columns = [
    'sex',
    '358_cause_recode',
    '113_cause_recode',
    '130_infant_cause_recode',
    '39_cause_recode',
]
df_deaths_gender = df.groupBy(*sex_cause_columns).count()

In [None]:
# Print the first rows of the per-sex cause-of-death counts.
df_deaths_gender.show()

In [None]:
# Write one CSV per sex code containing that group's 20 most frequent
# cause-recode combinations.
gender_dir = '/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_gender'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(gender_dir, exist_ok=True)
for gender_id in codes['sex']:
  path=f'{gender_dir}/gender_{gender_id}.csv'
  df_deaths_gender.filter(df_deaths_gender.sex==gender_id)\
           .sort(F.desc("count")).limit(20)\
           .toPandas()\
           .to_csv(path,header=True)

# The distribution of deaths per weekday

In [None]:
# Row counts per day_of_week_of_death value, largest first, capped at 7 rows.
# Parenthesised instead of ending on a dangling "\" line continuation.
day_death = (
    df.groupBy('day_of_week_of_death')
      .count()
      .sort(F.desc("count"))
      .limit(7)
)


In [None]:
# Print the per-weekday death counts.
day_death.show()

# The distribution of deaths per month

In [None]:
# Row counts per month_of_death value, largest first, capped at 12 rows.
# Parenthesised instead of ending on a dangling "\" line continuation.
month_death = (
    df.groupBy('month_of_death')
      .count()
      .sort(F.desc("count"))
      .limit(12)
)


In [None]:
# Print the per-month death counts.
month_death.show()

## Correlation between cause of death and season

In [None]:
# Zero-padded month codes belonging to each season. Every month appears in
# exactly one season; winter spans December through February.
seasons={"summer":("06","07","08"),
         "fall":("09","10","11"),
         "winter":("12","01","02"),
         "spring":("03","04","05")}


In [None]:
# For each season, write a CSV with the 20 most frequent cause-recode
# combinations among deaths that occurred in that season's three months.
season_dir = '/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_season'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(season_dir, exist_ok=True)
for season, (m1, m2, m3) in seasons.items():
  path=f'{season_dir}/season_{season}.csv'
  # Row belongs to the season when its month equals any of the three codes.
  in_season = (df.month_of_death==m1) | (df.month_of_death==m2) | (df.month_of_death==m3)
  df.filter(in_season)\
    .groupBy('358_cause_recode','113_cause_recode','130_infant_cause_recode','39_cause_recode').count()\
    .sort(F.desc("count")).limit(20)\
    .toPandas()\
    .to_csv(path,header=True)
           

# Correlation between cause of death, activity (engagement) and the season

In [None]:
# Zero-padded month codes belonging to each season. Every month appears in
# exactly one season; winter spans December through February.
seasons={"summer":("06","07","08"),
         "fall":("09","10","11"),
         "winter":("12","01","02"),
         "spring":("03","04","05")}


In [None]:
# For each season, write a CSV with the 20 most frequent combinations of
# activity_code and cause recodes among deaths in that season's three months.
activities_dir = '/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_activities'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(activities_dir, exist_ok=True)
for season, (m1, m2, m3) in seasons.items():
  path=f'{activities_dir}/season_{season}.csv'
  # Row belongs to the season when its month equals any of the three codes.
  in_season = (df.month_of_death==m1) | (df.month_of_death==m2) | (df.month_of_death==m3)
  df.filter(in_season)\
    .groupBy('activity_code','358_cause_recode','113_cause_recode','130_infant_cause_recode','39_cause_recode').count()\
    .sort(F.desc("count")).limit(20)\
    .toPandas()\
    .to_csv(path,header=True)
           

# Cause of death distribution for each year

In [None]:
# Data years covered by the analysis, as strings: "2005" through "2015".
years = [str(yr) for yr in range(2005, 2016)]

In [None]:
# For each data year, write a CSV with that year's 20 most frequent
# cause-recode combinations.
year_dir = '/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_per_year'
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(year_dir, exist_ok=True)
for year in years:
  path=f'{year_dir}/year_{year}.csv'
  df.filter(df.current_data_year==year)\
    .groupBy('current_data_year','358_cause_recode','113_cause_recode','130_infant_cause_recode','39_cause_recode').count()\
    .sort(F.desc("count")).limit(20)\
    .toPandas()\
    .to_csv(path,header=True)
           