## NYC Leading causes of Death data analysis

### Group : Apurva Padwal(apadwal2@illinois.edu), Pranav Dange(pdange2@illinois.edu)          

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import altair as alt

In [2]:
#importing the data same as part 1
cause_of_death_data = pd.read_csv('https://data.cityofnewyork.us/api/views/jb7j-dtam/rows.csv?accessType=DOWNLOAD', na_values = {"Deaths": '.', "Death Rate": '.', "Age Adjusted Death Rate": '.'}).dropna()

In [3]:
cause_of_death_data.head(10)

Unnamed: 0,Year,Leading Cause,Sex,Race Ethnicity,Deaths,Death Rate,Age Adjusted Death Rate
0,2011,Chronic Lower Respiratory Diseases (J40-J47),M,White Non-Hispanic,444.0,33.3,27.8
1,2008,Malignant Neoplasms (Cancer: C00-C97),M,White Non-Hispanic,3361.0,250.4,214.3
2,2010,"Chronic Liver Disease and Cirrhosis (K70, K73)",M,Hispanic,150.0,13.1,17.6
3,2014,Malignant Neoplasms (Cancer: C00-C97),M,Hispanic,1146.0,96.0,143.5
5,2014,Influenza (Flu) and Pneumonia (J09-J18),F,Asian and Pacific Islander,80.0,12.8,14.1
6,2014,"Accidents Except Drug Posioning (V01-X39, X43,...",F,Asian and Pacific Islander,42.0,6.7,6.9
7,2013,"Accidents Except Drug Posioning (V01-X39, X43,...",M,Black Non-Hispanic,159.0,18.5,19.1
8,2011,Essential Hypertension and Renal Diseases (I10...,M,White Non-Hispanic,142.0,10.6,8.8
9,2013,"Chronic Liver Disease and Cirrhosis (K70, K73)",M,Hispanic,149.0,12.6,15.7
11,2009,Cerebrovascular Disease (Stroke: I60-I69),F,Black Non-Hispanic,245.0,23.4,21.3


In [4]:
cause_of_death_data.tail(10)

Unnamed: 0,Year,Leading Cause,Sex,Race Ethnicity,Deaths,Death Rate,Age Adjusted Death Rate
1771,2017,Malignant Neoplasms (Cancer: C00-C97),Male,Non-Hispanic White,3017.0,224.615373,176.084244
1772,2017,Mental and Behavioral Disorders due to Acciden...,Male,Non-Hispanic White,451.0,33.576909,30.976422
1773,2017,Influenza (Flu) and Pneumonia (J09-J18),Male,Non-Hispanic White,409.0,30.450012,22.39998
1774,2017,Chronic Lower Respiratory Diseases (J40-J47),Male,Non-Hispanic White,374.0,27.844266,20.874867
1775,2017,"Accidents Except Drug Poisoning (V01-X39, X43,...",Male,Non-Hispanic White,290.0,21.590473,18.051476
1776,2017,Cerebrovascular Disease (Stroke: I60-I69),Male,Non-Hispanic White,266.0,19.803676,14.881485
1777,2017,Diabetes Mellitus (E10-E14),Male,Non-Hispanic White,238.0,17.719078,13.951702
1778,2017,"Intentional Self-Harm (Suicide: U03, X60-X84, ...",Male,Non-Hispanic White,194.0,14.443282,12.952286
1779,2017,Essential Hypertension and Renal Diseases (I10...,Male,Non-Hispanic White,172.0,12.805384,9.466143
1780,2017,All Other Causes,Male,Non-Hispanic White,2167.0,161.332951,127.553288


In [5]:
cause_of_death_data.columns

Index(['Year', 'Leading Cause', 'Sex', 'Race Ethnicity', 'Deaths',
       'Death Rate', 'Age Adjusted Death Rate'],
      dtype='object')

In [6]:
cause_of_death_data['Leading Cause'] = cause_of_death_data['Leading Cause'].str.replace(r'\s+\(.*\)', '', regex=True)

In [7]:
cause_of_death_data['Sex'] = cause_of_death_data['Sex'].replace({'F': 'Female', 'M': 'Male'})

In [8]:
#printing the data to verify the above data cleaning operation was successful
cause_of_death_data['Leading Cause']

0              Chronic Lower Respiratory Diseases
1                             Malignant Neoplasms
2             Chronic Liver Disease and Cirrhosis
3                             Malignant Neoplasms
5                                       Influenza
                          ...                    
1776                      Cerebrovascular Disease
1777                            Diabetes Mellitus
1778                        Intentional Self-Harm
1779    Essential Hypertension and Renal Diseases
1780                             All Other Causes
Name: Leading Cause, Length: 1196, dtype: object

In [9]:
#printing the data to verify the above data cleaning operation was successful
cause_of_death_data['Sex']

0         Male
1         Male
2         Male
3         Male
5       Female
         ...  
1776      Male
1777      Male
1778      Male
1779      Male
1780      Male
Name: Sex, Length: 1196, dtype: object

In [10]:
#printing the data to verify the above data cleaning operation was successful
cause_of_death_data['Leading Cause'].unique()

array(['Chronic Lower Respiratory Diseases', 'Malignant Neoplasms',
       'Chronic Liver Disease and Cirrhosis', 'Influenza',
       'Accidents Except Drug Posioning',
       'Essential Hypertension and Renal Diseases',
       'Cerebrovascular Disease', 'Human Immunodeficiency Virus Disease',
       'Mental and Behavioral Disorders due to Accidental Poisoning and Other Psychoactive Substance Use',
       'Assault', 'Diabetes Mellitus', 'Diseases of Heart',
       'Nephritis, Nephrotic Syndrome and Nephrisis', 'All Other Causes',
       'Septicemia', 'Intentional Self-Harm', "Alzheimer's Disease",
       'Certain Conditions originating in the Perinatal Period',
       'Congenital Malformations, Deformations, and Chromosomal Abnormalities',
       'Viral Hepatitis', 'Accidents Except Drug Poisoning',
       "Parkinson's Disease",
       'Mental and Behavioral Disorders due to Use of Alcohol'],
      dtype=object)

In [11]:
#printing the data to verify the above data cleaning operation was successful
cause_of_death_data['Sex'].unique()

array(['Male', 'Female'], dtype=object)

In [12]:
median_death = cause_of_death_data.groupby(['Leading Cause','Race Ethnicity','Sex','Year'],as_index=False)['Deaths'].median()

In [13]:
median_death

Unnamed: 0,Leading Cause,Race Ethnicity,Sex,Year,Deaths
0,Accidents Except Drug Poisoning,Asian and Pacific Islander,Female,2015,34.0
1,Accidents Except Drug Poisoning,Asian and Pacific Islander,Female,2016,43.0
2,Accidents Except Drug Poisoning,Asian and Pacific Islander,Female,2017,36.0
3,Accidents Except Drug Poisoning,Asian and Pacific Islander,Female,2018,33.0
4,Accidents Except Drug Poisoning,Asian and Pacific Islander,Female,2019,40.0
...,...,...,...,...,...
1191,Septicemia,White Non-Hispanic,Female,2010,107.0
1192,Septicemia,White Non-Hispanic,Female,2012,113.0
1193,Septicemia,White Non-Hispanic,Female,2013,135.0
1194,Septicemia,White Non-Hispanic,Female,2014,118.0


In [14]:
#cleaning the dataset further
median_death = median_death[median_death["Race Ethnicity"] != 'Not Stated/Unknown'] 

In [15]:
#choosing a sample of data from 2010 to 2015
median_death = median_death[(median_death.Year > 2009) & (median_death.Year < 2016)]

In [16]:
#verify the  data after manipulation
median_death

Unnamed: 0,Leading Cause,Race Ethnicity,Sex,Year,Deaths
0,Accidents Except Drug Poisoning,Asian and Pacific Islander,Female,2015,34.0
5,Accidents Except Drug Poisoning,Asian and Pacific Islander,Male,2015,64.0
10,Accidents Except Drug Poisoning,Hispanic,Female,2015,65.0
13,Accidents Except Drug Poisoning,Hispanic,Male,2015,178.0
22,Accidents Except Drug Poisoning,Non-Hispanic White,Female,2015,191.0
...,...,...,...,...,...
1191,Septicemia,White Non-Hispanic,Female,2010,107.0
1192,Septicemia,White Non-Hispanic,Female,2012,113.0
1193,Septicemia,White Non-Hispanic,Female,2013,135.0
1194,Septicemia,White Non-Hispanic,Female,2014,118.0


#### Central Interactive Visualization

For the first Dashboard, we are visualizing two charts with interactive capability. First chart of the Dashboard will be a line chart displaying Median number of deaths based on Leading cause of people's death. The second chart is Grouped bar chart that displays Median deaths based on Race and Ethnicity of people seperated by their Gender. The data used shows details about these deaths from New York city.

In [32]:
# Define the selection interval
brush = alt.selection_interval(encodings=['x','y'])

# Creating a Line Chart
line_chart = alt.Chart(median_death).mark_line().encode(
    x=alt.X('Leading Cause:N', title='Leading Cause'), # N is for nominal
    y=alt.Y('median(Deaths):Q', title='Median Deaths'),  # Q is for quantitative
    color=alt.Color('Sex:N', title='Sex'), # Color by sex
    tooltip=(['Leading Cause:N', 'Race Ethnicity:N', 'Sex:N', 'Year:Q', 'median(Deaths):Q'])
).add_selection(
    brush
).properties(
    width=350,
    title=alt.TitleParams(
        text='Median Deaths by Leading Cause',
        align='center',
        anchor='middle',
        fontSize=14
    )
)

# Creating a Grouped Bar Chart
bar_chart =  alt.Chart(median_death).mark_bar().encode(
    x=alt.X('Year:Q', title='Year', axis=alt.Axis(format='d')),  
    y=alt.Y('median(Deaths):Q', axis=alt.Axis(grid=False), title ='Median Deaths'), 
    color=alt.Color('Race Ethnicity:N', title='Race/Ethnicity'), 
    column=alt.Column('Sex:N', title='Sex'),
    tooltip=(['Year:Q', 'Sex:N', 'median(Deaths):Q'])
).transform_filter(
    brush
).properties(
    width=150,
    title=alt.TitleParams(
        text='',
        align='center',
        anchor='middle',
        fontSize=14
    )
)

#Merging the visualizations
Dashboard_Final = (line_chart | bar_chart).resolve_scale(
    y='independent'
).configure_axis(
    titleFontSize=14,
    labelFontSize=12
).configure_legend(
    titleFontSize=12,
    labelFontSize=10
)

In [33]:
Dashboard_Final = Dashboard_Final.properties(
    title='New Tork City: Median Number of Deaths based on Leading Cause, Sex, Race and Ethnicity'
                                            )

Dashboard_Final

In [19]:
#myJekyllDir = 'C:/Users/prana/pdange21.github.io/assets/json/'

In [20]:
#Dashboard_Final.save(myJekyllDir+"Dashboard_Final.json")

#### Plot Choice Explaination

For this Dashboard, we have a combination of Line chart and grouped bar chart to display median deaths based on leading cause, year, sex, and race/ethnicity. The line chart depicts the median deaths for each primary cause, which are classified and colored by gender. The user may choose an area on the line chart, which filters the information in the grouped bar chart to display the median deaths by year and gender, grouped by, and colored by race/ethnicity, for the chosen Region.

We felt that, the line chart is suitable for displaying the median deaths by leading cause, sex, and year. The "brush" variable defined by the "alt.selection_interval()" function can also be used by the user to pick an area of interest. The grouped bar chart is appropriate for comparing median deaths for the specified region by year, gender, and race/ethnicity. The graphic is changed by filtering the data according to the region provided by the "brush" variable.

The two charts are combined using the "Dashboard_Final" variable, which merges both charts and resolves the y-axis scales to ensure that the charts are aesthetically consistent. Overall, the plot selection showed complicated data with several variables, and it allows the user to interactively pick a location of interest to reveal further details.

### How to use the dashboard:

We have built two visualizations for the dashboard. The chart on the left depicts the median deaths based on Leading Cause, which includes the cause for the death. A line graph is used to illustrate this view. This line graph has been color coded according on the gender, and the legend is located at the right end of the graphic. The green color graph represents the male, while the red color graph represents the female. This graphic demonstrates that Heart Diseases were the leading cause of mortality. However, the median number of male and female deaths from this cause differed. When compared to males, females had a greater median death rate. The X axis in this graphic displays the Leading Causes of Death, while the Y axis indicates the Median Deaths and is color coded by gender.
A Grouped Bar Chart is the second graphic. We segmented the display depending on gender, thus male and female data is separated. The X axis represents the Year from 2010 to 2015 for each Male and Female, while the Y axis represents the Median Number of deaths. The bar is color coded and represents the quantity of Race and Ethinicity, with a caption on the right side of the picture. So, each bar depicts the median death distribution depending on race and ethnicity, and the quantity of the bar coded clearly aids in visualizing this.
Now we utilize the left visualization to choose the number of median deaths based on the Leading Cause, and the right visualization to demonstrate how this is divided in terms of Male and Female, Race and Ethnicity distribution.

#### A list of 1 or more contextual datasets you have identified, links to where they reside, and a sentence about why they might be useful in telling the final story.

- We have identitfied one contextual dataset which will help us telling our story in a better way.

#### Contextual Dataset Information:
- NCHS - Leading Causes of Death: United States(Metadata Updated: April 21, 2022)
- Website Url: https://catalog.data.gov/dataset/nchs-leading-causes-of-death-united-states
- Dataset Url in CSV: https://data.cdc.gov/api/views/bi63-dtpu/rows.csv?accessType=DOWNLOAD
- Information about the dataset: The dataset provides information on the age-adjusted death rates for the top 10 causes of death in the United States, starting from 1999.The data is derived from death certificates of residents filed in all 50 states and the District of Columbia, which includes demographic and medical characteristics.
- Why they might be useful in telling our story: So this dataset contains the cause of Death throught the United States of America from 1999 to 2017 and we are analyzing the death rate in New York during the same duration this dataset will help in finding the pattern if there was some particular disease outbreak that caused major death throught the contry or to find any peculiar pattern in the cause of death by comparing the two dataset. Also, we will look to see the pattern of death based on Race and Ethinicity to see if they show any particular trend. These characteristics of the dataset will help us telling a better story about our dataset which we have chosen. 
- As the size of our dataset was 157 Kb which is very small compared to github limit we will not need a revised plan to host the data on github.

#### Extra interactive visualization

In [21]:
# Define the selection interval
brush = alt.selection_interval(encodings=['x','y'])

# Creating Scatter Plot
scatter_plot = alt.Chart(median_death).mark_circle().encode(
    x='Race Ethnicity:N', 
    y='average(Deaths):Q',  
    size=alt.Size('average(Deaths):Q', title='Average Deaths'),
    color='Race Ethnicity:N', 
    tooltip=(['Race Ethnicity:N','average(Deaths):Q'])
).add_selection(
    brush
)

# Creating Grouped Bar Chart
bar_chart =  alt.Chart(median_death).mark_bar().encode(
    alt.X('Sex:N'),  # N is for nominal
    alt.Y('median(Deaths):Q', axis=alt.Axis(grid=False), title ='Median Deaths'), 
    alt.Color('Sex:N'), 
    column='Year:Q', 
    tooltip=(['Sex:N','median(Deaths):Q'])
).transform_filter(
    brush
)

#Merging the Visualizations
Dashboard_1 = scatter_plot.properties(width=250) |  bar_chart.properties(width=50)

In [22]:
Dashboard_1