# Data exploration 

---

Group name: O

---


## Introduction

*This section includes a short description of the data* 

## Setup

In [83]:
import pandas as pd
import altair as alt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

alt.data_transformers.disable_max_rows()


DataTransformerRegistry.enable('default')

## Data

## Import data

In [84]:
PoliceKillingsData = "https://raw.githubusercontent.com/FabioRaab/bigData-HW1/main/data/external/police_killings.csv"

### Data structure

In [85]:
df = pd.read_csv(PoliceKillingsData)
print(df)

df.info()

                   name age  gender    raceethnicity     month  day  year  \
0    A'donte Washington  16    Male            Black  February   23  2015   
1        Aaron Rutledge  27    Male            White     April    2  2015   
2           Aaron Siler  26    Male            White     March   14  2015   
3          Aaron Valdez  25    Male  Hispanic/Latino     March   11  2015   
4          Adam Jovicic  29    Male            White     March   19  2015   
..                  ...  ..     ...              ...       ...  ...   ...   
462  William Chapman II  18    Male            Black     April   22  2015   
463    William Dick III  28    Male  Native American     April    4  2015   
464       William Poole  52    Male            White     March   16  2015   
465   Yuvette Henderson  38  Female            Black  February    3  2015   
466       Zaki Shinwary  48    Male          Unknown   January   16  2015   

                                 streetaddress          city state  ...  \


### Data corrections

In [86]:
df['cause'] = df['cause'].astype("category")

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 467 entries, 0 to 466
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   name                  467 non-null    object  
 1   age                   467 non-null    object  
 2   gender                467 non-null    object  
 3   raceethnicity         467 non-null    object  
 4   month                 467 non-null    object  
 5   day                   467 non-null    int64   
 6   year                  467 non-null    int64   
 7   streetaddress         463 non-null    object  
 8   city                  467 non-null    object  
 9   state                 467 non-null    object  
 10  latitude              467 non-null    float64 
 11  longitude             467 non-null    float64 
 12  state_fp              467 non-null    int64   
 13  county_fp             467 non-null    int64   
 14  tract_ce              467 non-null    int64   
 15  geo_id

### Variable lists

In [88]:
# make a list of variables you want to use
var_list = ['gender', 'cause']

In [89]:
source = df[var_list]

In [90]:
source

Unnamed: 0,gender,cause
0,Male,Gunshot
1,Male,Gunshot
2,Male,Gunshot
3,Male,Gunshot
4,Male,Gunshot
...,...,...
462,Male,Gunshot
463,Male,Taser
464,Male,Gunshot
465,Female,Gunshot


# Visualization 1: Bar plot analyzing cause of death

I want to analyse the main causes of death accordig to the data set. In order to do so, I will be visualizing all causes of death in a bar chart since this is the easiest way to see, which cause of death is the most frequent one. We'll also make this bar plot interactive to enable the viewer to see the exact count of records.

In [91]:
ChartCause = alt.Chart(source).mark_bar().encode(
     x=alt.X('cause',
        sort='-y' ),
    y=alt.Y('count(cause)')
)

In [92]:
ChartCause = alt.Chart(source).mark_bar().encode(
    x=alt.X('cause', 
            sort= '-y',
            #für die Achseneinstellung benutzen wir axis
            axis=alt.Axis(title="Cause of death", # title of x axis: Cause of death
                          labelAngle=0)), # angle of x axis text: 0
      y=alt.Y('count(cause)',
            axis=alt.Axis(title = "Count of deaths due to this cause", 
                        titleAnchor="end")),
      color= alt.Color ('cause', legend=alt.Legend(title="Which cause?")),
      tooltip=['cause', 'count(cause)' ]

).interactive(

).configure_title(
    fontSize=11,
    font='Arial',
    anchor='start',
    color='black'

).properties( 
    title= 'What is the most common cause of death among police killings?',
    width= 500,
    height= 450
)
ChartCause

-------------------------------------------------------------

# Visualization 2: Pie chart analysing race/ethnicity of deceased

Next, I want to analyse the distrubution of race/ethincity of the deceased people of the data set to find out why race/ethinicity is most likely to be targeted.

In [109]:
df.raceethnicity = df.raceethnicity.astype("category")

# create data for pie chart
source = pd.DataFrame(df.raceethnicity.value_counts())

# set index to column
source = source.reset_index()

# rename columns
source.rename(columns={"index": "race", "raceethnicity": "value"}, inplace=True)

In [111]:
chartRace = alt.Chart(source).mark_arc().encode(
    theta=alt.Theta(field="value", type="quantitative"),
    color= alt.Color ('race', 
                     legend=alt.Legend(title="Which race?")),
    tooltip = ["race"]
).properties( 
    title= 'Which race is most effected by police killings?',
    width= 300,
    height= 300
).configure_title(
    fontSize=12,
    font='Arial',
    anchor='start',
    color='black'
)

# chartRace.drop[3]

pie = chartRace.mark_arc(outerRadius=125)

pie

-------------------------------------------------------------

# Visualization 3: Map of all police killings

In [95]:
var3_list = ['state', 'city','latitude', 'longitude']
sourceV3 = df[var3_list]
sourceV3.reset_index(inplace=True)
sourceV3

Unnamed: 0,index,state,city,latitude,longitude
0,0,AL,Millbrook,32.529577,-86.362829
1,1,LA,Pineville,31.321739,-92.434860
2,2,WI,Kenosha,42.583560,-87.835710
3,3,CA,South Gate,33.939298,-118.219463
4,4,OH,Munroe Falls,41.148575,-81.429878
...,...,...,...,...,...
462,462,VA,Portsmouth,36.829014,-76.341438
463,463,WA,Tonasket,48.708542,-119.436829
464,464,NC,Gaston,35.205776,-81.240669
465,465,CA,Oakland,37.827129,-122.284492


In [112]:
from vega_datasets import data

In [116]:
states = alt.topo_feature(data.us_10m.url, feature='states')

background = alt.Chart(states).mark_geoshape(
    fill='lightgray',
    stroke='white'
).project('albersUsa').properties(
    width=500,
    height=300
)

points = alt.Chart(sourceV3).mark_circle().encode(
    longitude='longitude:Q',
    latitude='latitude:Q',
    size=alt.value(10),
    tooltip=['city', 'state']
)

background + points


-------------------------------------------------------------

## Visualization 4: Which gender is most effected by police killings?

In [100]:
var4_list = ['gender', 'age']

sourceV4 = df[var4_list]

sourceV4

Unnamed: 0,gender,age
0,Male,16
1,Male,27
2,Male,26
3,Male,25
4,Male,29
...,...,...
462,Male,18
463,Male,28
464,Male,52
465,Female,38


In [101]:


chartV4 = alt.Chart(sourceV4).mark_bar(
    cornerRadiusTopLeft=3,
    cornerRadiusTopRight=3
).encode(
    x='gender:O',
    y='count():Q',
    tooltip= ['gender', 'count(gender)' ],
    color= alt.Color ('gender', legend=alt.Legend(title="Which cause?"))
).interactive(

).properties( 
    title= 'Which gender is more effected by police killings?',
    width= 300,
    height= 500

).configure_title(
    fontSize=13,
    font='Arial',
    anchor='start',
    color='black'
)

chartV4


# Visualization 5: Crosstab showing relation of death's cause and whether deseased was armed

In [102]:
# A crosstab with row proportions for cause and whether deseased was armed.

Crosstab = pd.crosstab(df['cause'],    # rows: cause
            df['armed'],    # columns: armed
            normalize='index',
            margins=True).round(3)*100 #4 steht für 3 Nachkommastellen und 100 für auf 100%

Crosstab

armed,Disputed,Firearm,Knife,No,Non-lethal firearm,Other,Unknown,Vehicle
cause,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Death in custody,0.0,42.9,21.4,28.6,0.0,7.1,0.0,0.0
Gunshot,0.5,48.4,14.8,22.1,3.4,5.6,1.2,3.9
Struck by vehicle,0.0,41.7,8.3,33.3,0.0,0.0,8.3,8.3
Taser,0.0,70.4,7.4,11.1,0.0,3.7,3.7,3.7
Unknown,0.0,33.3,33.3,0.0,0.0,33.3,0.0,0.0
All,0.4,49.3,14.6,21.8,3.0,5.6,1.5,3.9


In [103]:
Crosstab.drop(columns=["Unknown", "Non-lethal firearm", "Disputed"], inplace=True)

Crosstab

armed,Firearm,Knife,No,Other,Vehicle
cause,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Death in custody,42.9,21.4,28.6,7.1,0.0
Gunshot,48.4,14.8,22.1,5.6,3.9
Struck by vehicle,41.7,8.3,33.3,0.0,8.3
Taser,70.4,7.4,11.1,3.7,3.7
Unknown,33.3,33.3,0.0,33.3,0.0
All,49.3,14.6,21.8,5.6,3.9


In [104]:
CrosstabCauseAndArmed = Crosstab.drop(index=('Unknown'))

In [105]:
CrosstabCauseAndArmed

# Heatmap anschauen

armed,Firearm,Knife,No,Other,Vehicle
cause,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Death in custody,42.9,21.4,28.6,7.1,0.0
Gunshot,48.4,14.8,22.1,5.6,3.9
Struck by vehicle,41.7,8.3,33.3,0.0,8.3
Taser,70.4,7.4,11.1,3.7,3.7
All,49.3,14.6,21.8,5.6,3.9


# Visualization 6: Age

In [135]:
var_list_age = ['name', 'age', 'day', 'state']
source_guns = df[var_list_age]
source_guns

Unnamed: 0,name,age,day,state
0,A'donte Washington,16,23,AL
1,Aaron Rutledge,27,2,LA
2,Aaron Siler,26,14,WI
3,Aaron Valdez,25,11,CA
4,Adam Jovicic,29,19,OH
...,...,...,...,...
462,William Chapman II,18,22,VA
463,William Dick III,28,4,WA
464,William Poole,52,16,NC
465,Yuvette Henderson,38,3,CA


In [136]:
import altair as alt
from vega_datasets import data

source = source_guns

alt.Chart(source).mark_circle(size=60).encode(
    x='day',
    y='age',
    color='state',
    tooltip=['name', 'state', 'day', 'age']
).interactive()

In [129]:
import altair as alt
from vega_datasets import data

source = data.cars()

alt.Chart(source).mark_circle(size=60).encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
    tooltip=['Name', 'Origin', 'Horsepower', 'Miles_per_Gallon']
).interactive()

-------------------------------------------------------------

## Visualisierung: Jahre

-------------------------------------------------------------

## (Visualisierungen, die es nicht geschafft haben haha)

In [106]:

# df['armed'] = df['armed'].astype("category")

var4_list = ['p_income', 'age']

sourceV4 = df[var4_list]

sourceV4



Unnamed: 0,p_income,age
0,28375,16
1,14678,27
2,25286,26
3,17194,25
4,33954,29
...,...,...
462,25262,18
463,18470,28
464,21175,52
465,26971,38


In [107]:
new=alt.Chart(sourceV4).mark_point().encode(
    x='age',
    y='p_income',
).properties( 
    title= 'What is the most common cause of death among police killings?',
    width= 1500,
    height= 1000
).interactive()

new

In [108]:
# Hoppala anscheinend keine Relation