<a href="https://colab.research.google.com/github/Khaled-Abdelhamid/Death-Big-data-Analytics/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mounting Drive and setting up the environment

In [None]:
# Mount Google Drive at /content/drive; the data and result paths used
# throughout this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.7/spark-2.4.7-bin-hadoop2.7.tgz
!tar xf spark-2.4.7-bin-hadoop2.7.tgz
!pip install -q findspark

In [None]:
import os
# Record the JDK and Spark install locations in the process environment.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.7-bin-hadoop2.7",
    }
)
# Alternative SPARK_HOME pointing at a copy kept on Drive (disabled):
# os.environ["SPARK_HOME"] ="/content/drive/MyDrive/Colab Notebooks/BigData/spark-2.4.7-bin-hadoop2.7"


In [None]:
import findspark

# Keep this call ahead of the pyspark import below.
# NOTE(review): presumably findspark.init() is what makes pyspark importable
# from SPARK_HOME - confirm before reordering.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session using every available core ("local[*]"); an already
# running session is reused instead of creating a second one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, StringType

import json

# Data loading and exploration

In [None]:
data_path="/content/drive/MyDrive/Colab Notebooks/BigData/Final project/Death-Big-data-Analytics/archive"
# Read only the CSV files. The archive folder also holds the JSON code books
# (e.g. 2015_codes.json); pointing the CSV reader at the whole folder parses
# those files as data rows, which is where values such as 'V89.9)"' and the
# nulls in current_data_year come from.
#
# inferSchema is off on purpose: every column stays a string, so zero-padded
# codes ("01", "023") keep the exact form used as keys in the code books and
# in the string comparisons further down, and Spark skips the extra pass over
# the data that schema inference requires.
df = spark.read.options(header=True, inferSchema=False).csv(f"{data_path}/*.csv")
df.show(truncate=False)

+---------------+-----------------------+-----------------------+------------------------+--------------+---+---------------+----------+---------------------+-------------+-------------+-------------+--------------------+-----------------------------------+--------------+--------------------+-----------------+--------------+---------------+---------------------+-------+-------------+------------------------------------------------------+----------------------+----------------+----------------+-----------------------+---------------+--------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------------------+-----

In [None]:
df.dtypes # list of (column name, Spark type) pairs, one entry per column

[('resident_status', 'string'),
 ('education_1989_revision', 'string'),
 ('education_2003_revision', 'string'),
 ('education_reporting_flag', 'string'),
 ('month_of_death', 'string'),
 ('sex', 'string'),
 ('detail_age_type', 'string'),
 ('detail_age', 'string'),
 ('age_substitution_flag', 'string'),
 ('age_recode_52', 'string'),
 ('age_recode_27', 'string'),
 ('age_recode_12', 'string'),
 ('infant_age_recode_22', 'string'),
 ('place_of_death_and_decedents_status', 'string'),
 ('marital_status', 'string'),
 ('day_of_week_of_death', 'string'),
 ('current_data_year', 'string'),
 ('injury_at_work', 'string'),
 ('manner_of_death', 'string'),
 ('method_of_disposition', 'string'),
 ('autopsy', 'string'),
 ('activity_code', 'string'),
 ('place_of_injury_for_causes_w00_y34_except_y06_and_y07_', 'string'),
 ('icd_code_10th_revision', 'string'),
 ('358_cause_recode', 'string'),
 ('113_cause_recode', 'string'),
 ('130_infant_cause_recode', 'string'),
 ('39_cause_recode', 'string'),
 ('number_of_en

In [None]:
# Row count for each distinct value of current_data_year.
# NOTE(review): values that are not a four-digit year, or null, look like
# rows that were mis-parsed on load - confirm against the input files.
(
    df.groupBy("current_data_year")
    .count()
    .show()
)

+-----------------+-------+
|current_data_year|  count|
+-----------------+-------+
|             2012|2547864|
|             2014|2631171|
|             2013|2601452|
|             2005|2452506|
|            V89.9|      8|
|             null|   4517|
|             2009|2441219|
|             2006|2430725|
|      N15.8-N15.9|      4|
|             2011|2519842|
|             2008|2476811|
|             2007|2428343|
|            U04)"|      3|
|          V89.9)"|      4|
|             2015|2718198|
|             2010|2472542|
+-----------------+-------+



In [None]:
codes_path="/content/drive/MyDrive/Colab Notebooks/BigData/Final project/Death-Big-data-Analytics/archive/2015_codes.json"
# Code book: maps a coded column name to a {code: description} dictionary.
# The encoding is stated explicitly so the result does not depend on the
# platform's default text encoding.
with open(codes_path, encoding="utf-8") as json_file:
    codes = json.load(json_file)
# Show only the names of the coded columns; echoing the whole nested dict
# floods the cell output. Index into `codes` to inspect one column's mapping.
sorted(codes)

{'113_cause_recode': {'001': 'Salmonella infections (A01-A02)',
  '002': 'Shigellosis and amebiasis (A03,A06)',
  '003': 'Certain other intestinal infections (A04,A07-A09)',
  '004': 'Tuberculosis (A16-A19)',
  '005': 'Respiratory tuberculosis (A16)',
  '006': 'Other tuberculosis (A17-A19)',
  '007': 'Whooping cough (A37)',
  '008': 'Scarlet fever and erysipelas (A38,A46)',
  '009': 'Meningococcal infection (A39)',
  '010': 'Septicemia (A40-A41)',
  '011': 'Syphilis (A50-A53)',
  '012': 'Acute poliomyelitis (A80)',
  '013': 'Arthropod-borne viral encephalitis (A83-A84,A85.2)',
  '014': 'Measles (B05)',
  '015': 'Viral hepatitis (B15-B19)',
  '016': 'Human immunodeficiency virus (HIV) disease (B20-B24)',
  '017': 'Malaria (B50-B54)',
  '018': 'Other and unspecified infectious and parasitic diseases and their sequelae (A00,A05,A20-A36,A42-A44,A48-A49,A54-A79,A81-A82,A85.0-A85.1,A85.8, A86-B04,B06-B09,B25-B49,B55-B99)',
  '019': 'Malignant neoplasms (C00-C97)',
  '020': 'Malignant neoplas

# Getting the top 10 causes of death for each race

In [None]:
# Number of deaths for every combination of race and the four cause-recode
# columns. Evaluation is lazy; nothing is computed until an action runs.
df_deaths = df.groupBy(
    "race",
    "358_cause_recode",
    "113_cause_recode",
    "130_infant_cause_recode",
    "39_cause_recode",
).count()

In [None]:
# For each race code in the code book, write the 10 most frequent
# cause-recode combinations to their own CSV file on Drive.
for race_id in codes['race']:
    path = f'/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_race/race_{race_id}.csv'
    (
        df_deaths
        .where(F.col("race") == race_id)
        .orderBy(F.col("count").desc())
        .limit(10)
        .toPandas()  # at most 10 rows, so collecting to the driver is cheap
        .to_csv(path, header=True)
    )

# Getting the top 10 causes of death for each gender

In [None]:
# Number of deaths for every combination of sex and the four cause-recode columns.
df_deaths_gender = df.groupBy(
    "sex",
    "358_cause_recode",
    "113_cause_recode",
    "130_infant_cause_recode",
    "39_cause_recode",
).count()

In [None]:
# Print a preview of the per-sex cause counts (rows are in no particular order).
df_deaths_gender.show()

+----------+----------------+----------------+-----------------------+---------------+------+
|       sex|358_cause_recode|113_cause_recode|130_infant_cause_recode|39_cause_recode| count|
+----------+----------------+----------------+-----------------------+---------------+------+
|         F|             357|             108|                    082|             33|  2505|
|         M|             410|             123|                   null|             39| 26567|
|         F|             368|             109|                   null|             34|  2181|
|         F|             261|             089|                   null|             37|  2522|
|         M|             096|             043|                    027|             15|     6|
|         F|             160|             111|                    038|             37|    22|
|         F|             023|             010|                    009|             37|  1105|
|         M|             397|             114|              

In [None]:
# For each sex code in the code book, write the most frequent cause-recode
# combinations to their own CSV file on Drive.
# NOTE(review): the section heading says "top 10" but this keeps 20 rows -
# confirm which one is intended.
for gender_id in codes['sex']:
    path = f'/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_gender/gender_{gender_id}.csv'
    (
        df_deaths_gender
        .where(F.col("sex") == gender_id)
        .orderBy(F.col("count").desc())
        .limit(20)
        .toPandas()
        .to_csv(path, header=True)
    )

# The distribution of deaths per weekday

In [None]:
# Deaths per day-of-week code, largest group first; limit(7) keeps the seven
# largest groups. The chain is wrapped in parentheses so it needs no
# line-continuation backslash (a trailing "\" with nothing after it is a
# syntax error whenever the cell ends on that line).
day_death = (
    df.groupBy('day_of_week_of_death')
    .count()
    .sort(F.desc("count"))
    .limit(7)
)


In [None]:
# Print the per-weekday death counts, highest count first.
day_death.show()

+--------------------+-------+
|day_of_week_of_death|  count|
+--------------------+-------+
|                   7|4012103|
|                   6|3996471|
|                   2|3960250|
|                   5|3945930|
|                   4|3939328|
|                   1|3933510|
|                   3|3931885|
+--------------------+-------+



# The distribution of deaths per month

In [None]:
# Deaths per month code, largest group first; limit(12) keeps the twelve
# largest groups. The chain is wrapped in parentheses so it needs no
# line-continuation backslash (a trailing "\" with nothing after it is a
# syntax error whenever the cell ends on that line).
month_death = (
    df.groupBy('month_of_death')
    .count()
    .sort(F.desc("count"))
    .limit(12)
)


In [None]:
# Print the per-month death counts, highest count first.
month_death.show()

+--------------+-------+
|month_of_death|  count|
+--------------+-------+
|            01|2571615|
|            03|2494294|
|            12|2479590|
|            02|2324679|
|            04|2296119|
|            10|2292374|
|            11|2285476|
|            05|2278865|
|            07|2205194|
|            08|2191365|
|            06|2154291|
|            09|2146811|
+--------------+-------+



# Correlation between cause of death and the season timing

In [None]:
# Season name -> the three month_of_death codes (zero-padded strings) it covers.
# Winter is December-February. The previous ("11","12","10") duplicated fall's
# months and left January and February - the two months with the highest and
# fourth-highest death counts - out of every season.
seasons={"summer":("06","07","08"),
         "fall":("09","10","11"),
         "winter":("12","01","02"),
         "spring":("03","04","05")}


In [None]:
# For each season, count deaths per cause-recode combination among the rows
# whose month_of_death is one of that season's three months, and write the
# 20 largest groups to a CSV file on Drive.
for season, (m1, m2, m3) in seasons.items():
    path = f'/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_season/season_{season}.csv'
    in_season = F.col("month_of_death").isin(m1, m2, m3)
    (
        df.where(in_season)
        .groupBy(
            '358_cause_recode',
            '113_cause_recode',
            '130_infant_cause_recode',
            '39_cause_recode',
        )
        .count()
        .orderBy(F.col("count").desc())
        .limit(20)
        .toPandas()
        .to_csv(path, header=True)
    )
           

# Correlation between cause of death, activity at the time of death, and the season

In [None]:
# Season name -> the three month_of_death codes (zero-padded strings) it covers.
# Winter is December-February. The previous ("11","12","10") duplicated fall's
# months and left January and February out of every season.
seasons={"summer":("06","07","08"),
         "fall":("09","10","11"),
         "winter":("12","01","02"),
         "spring":("03","04","05")}


In [None]:
# Same per-season export as the cause-of-death one, with activity_code added
# to the grouping: for each season, the 20 most frequent
# (activity, cause-recode) combinations are written to a CSV file on Drive.
for season, (m1, m2, m3) in seasons.items():
    path = f'/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_activities/season_{season}.csv'
    in_season = F.col("month_of_death").isin(m1, m2, m3)
    (
        df.where(in_season)
        .groupBy(
            'activity_code',
            '358_cause_recode',
            '113_cause_recode',
            '130_infant_cause_recode',
            '39_cause_recode',
        )
        .count()
        .orderBy(F.col("count").desc())
        .limit(20)
        .toPandas()
        .to_csv(path, header=True)
    )
           

# Cause of death distribution for each year

In [None]:
# Data years 2005 through 2015, as strings to match the current_data_year column.
years = [str(year_number) for year_number in range(2005, 2016)]

In [None]:
# For each data year, write the 20 most frequent cause-recode combinations
# among that year's rows to a CSV file on Drive.
for year in years:
    path = f'/content/drive/MyDrive/Colab Notebooks/BigData/Final project/results/COD_per_year/year_{year}.csv'
    (
        df.where(F.col("current_data_year") == year)
        .groupBy(
            'current_data_year',
            '358_cause_recode',
            '113_cause_recode',
            '130_infant_cause_recode',
            '39_cause_recode',
        )
        .count()
        .orderBy(F.col("count").desc())
        .limit(20)
        .toPandas()
        .to_csv(path, header=True)
    )
           

In [None]:
def describe_code(column_name, code_number, code_book=None):
    """Return the human-readable description of one coded value.

    The original cell evaluated ``codes[column_name][code_number]`` directly,
    but neither ``column_name`` nor ``code_number`` is defined anywhere in the
    notebook, so it raised NameError on a fresh run. Wrapping the lookup in a
    function keeps it available without breaking Restart & Run All.

    Args:
        column_name: Name of a coded column, e.g. ``"113_cause_recode"``.
        code_number: The code as it appears in the data, e.g. ``"004"``.
        code_book: Mapping of column name -> {code: description}. Defaults to
            the notebook-level ``codes`` dictionary loaded from the JSON file.

    Returns:
        The description stored for that code.

    Raises:
        KeyError: If the column or the code is not present in the code book.
    """
    lookup = codes if code_book is None else code_book
    return lookup[column_name][code_number]