# Rapport final - Analyse des décès liés à la police aux États-Unis (Washington Post et PoliceDeaths)
> Mohamedlamine Amimi <mohamedlamine.amimi@supinfo.com>
> Théo Bezin <theo.bezin@supinfo.com>
> Alexis Lecomte <alexis.lecomte@supinfo.com>
> Maxence Pawlowski <maxence.pawlowski@supinfo.com>

In [1]:
# Calculs mathématiques
import math
import numpy

# Chargement et prise en charge des données
import pandas

# Prise en charge des données géographique
import geopandas
import folium
from folium import plugins

# Création de statistiques et de graphiques
import scipy.stats
import matplotlib.pyplot as pyplot
from matplotlib.ticker import NullFormatter
import plotly.express as px

# Autres
import os
import re
import time
from datetime import datetime
from tabulate import tabulate
import webbrowser
from colorama import Fore, Style

# Render matplotlib figures with the interactive "notebook" backend.
%matplotlib notebook
# Apply the "ggplot" style sheet to the figures created from here on.
pyplot.style.use("ggplot")
# Default figure size as [width, height] (matplotlib expresses it in inches).
pyplot.rcParams["figure.figsize"] = [17, 7]

## Analyse de "WashingtonPostDatabase.csv"

In [2]:
# Typo in the CSV's file name ("Wahsington"): presumably the file on disk is
# really spelled this way, so the path is kept as-is.
# `parse_dates` alone is enough; the former `date_parser=` argument is
# deprecated since pandas 2.0 and did the same conversion.
data_wp = pandas.read_csv("../data/WahsingtonPostDatabase.csv", parse_dates=["date"])

# Guarantee a datetime64 "date" column: read_csv leaves the column as plain
# strings when it cannot parse it, whereas to_datetime raises on bad values.
data_wp["date"] = pandas.to_datetime(data_wp["date"])

# Text columns: a missing value becomes the explicit label "Unknown".
text_columns = ["name", "armed", "gender", "race", "threat_level", "flee"]
data_wp[text_columns] = data_wp[text_columns].fillna("Unknown")

# Coordinates must stay numeric. Latitude used to be filled with the string
# "Unknown", which turned the column into a mixed str/float object column and
# was inconsistent with longitude; both now use 0 as the placeholder.
# NOTE(review): a row at (0, 0) has no real location - filter such rows out
# before mapping, and update any later cell that tested latitude == "Unknown".
coordinate_columns = ["latitude", "longitude"]
data_wp[coordinate_columns] = data_wp[coordinate_columns].fillna(0)

# Missing ages are replaced by the mean age, rounded to the nearest integer.
age_mean = data_wp["age"].mean()
data_wp["age"] = data_wp["age"].fillna(round(age_mean))

data_wp

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera,longitude,latitude,is_geocoding_exact
0,3,Tim Elliot,2015-01-02,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False,-123.122,47.247,True
1,4,Lewis Lee Lembke,2015-01-02,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False,-122.892,45.487,True
2,5,John Paul Quintero,2015-01-03,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False,-97.281,37.695,True
3,8,Matthew Hoffman,2015-01-04,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False,-122.422,37.763,True
4,9,Michael Rodriguez,2015-01-04,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False,-104.692,40.384,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7267,7915,Unknown,2022-04-11,shot,gun,37.0,M,Unknown,Manitou Springs,CO,False,attack,Unknown,False,-104.917,38.858,True
7268,7916,Dwayne Jackson,2022-04-11,shot,gun,37.0,M,Unknown,Tulsa,OK,False,attack,Not fleeing,False,-95.904,36.05,True
7269,7931,Unknown,2022-04-12,shot,sharp object,37.0,M,Unknown,Edison,NJ,False,undetermined,Unknown,False,-74.331,40.548,True
7270,7929,Trevan Bonner,2022-04-13,shot,gun,18.0,M,Unknown,Lafayette,LA,False,attack,Unknown,False,-92.032,30.25,True


## Analyse de "PoliceDeaths.csv"

In [3]:
# Load the police-deaths dataset. `parse_dates` converts "date" to datetime64;
# the former `date_parser=` argument is deprecated since pandas 2.0 and
# performed the same conversion.
data_pd = pandas.read_csv("../data/PoliceDeaths.csv", index_col=None, parse_dates=["date"])
# Remove leading/trailing whitespace around the state codes (vectorised, and
# tolerant of missing values, unlike a Python-level lambda).
data_pd["state"] = data_pd["state"].str.strip()

# Number of deaths per state, sorted alphabetically by state code.
# Built straight from value_counts() so "Number of deaths" keeps its integer
# dtype; going through `.values` produced an object-dtype column.
pd_state_data = (
	data_pd["state"]
	.value_counts()
	.rename_axis("State")
	.reset_index(name="Number of deaths")
	.sort_values("State", ascending=True)
)

data_pd

Unnamed: 0,person,dept,eow,cause,cause_short,date,year,canine,dept_name,state
0,Constable Darius Quimby,"Albany County Constable's Office, NY","EOW: Monday, January 3, 1791",Cause of Death: Gunfire,Gunfire,1791-01-03,1791,False,Albany County Constable's Office,NY
1,Sheriff Cornelius Hogeboom,"Columbia County Sheriff's Office, NY","EOW: Saturday, October 22, 1791",Cause of Death: Gunfire,Gunfire,1791-10-22,1791,False,Columbia County Sheriff's Office,NY
2,Deputy Sheriff Isaac Smith,"Westchester County Sheriff's Department, NY","EOW: Thursday, May 17, 1792",Cause of Death: Gunfire,Gunfire,1792-05-17,1792,False,Westchester County Sheriff's Department,NY
3,Marshal Robert Forsyth,United States Department of Justice - United S...,"EOW: Saturday, January 11, 1794",Cause of Death: Gunfire,Gunfire,1794-01-11,1794,False,United States Department of Justice - United S...,US
4,Sheriff Robert Maxwell,"Greenville County Sheriff's Office, SC","EOW: Sunday, November 12, 1797",Cause of Death: Gunfire,Gunfire,1797-11-12,1797,False,Greenville County Sheriff's Office,SC
...,...,...,...,...,...,...,...,...,...,...
22795,K9 Bruno,"Amarillo Police Department, TX","EOW: Sunday, June 12, 2016",Cause of Death: Accidental,Accidental,2016-06-12,2016,True,Amarillo Police Department,TX
22796,K9 Lazer,United States Department of Homeland Security ...,"EOW: Monday, June 20, 2016",Cause of Death: Heat exhaustion,Heat exhaustion,2016-06-20,2016,True,United States Department of Homeland Security ...,US
22797,K9 Tyson,"Fountain County Sheriff's Office, IN","EOW: Monday, June 27, 2016",Cause of Death: Heat exhaustion,Heat exhaustion,2016-06-27,2016,True,Fountain County Sheriff's Office,IN
22798,K9 Credo,"Long Beach Police Department, CA","EOW: Tuesday, June 28, 2016",Cause of Death: Gunfire (Accidental),Gunfire (Accidental),2016-06-28,2016,True,Long Beach Police Department,CA


In [4]:
# Preview the first five rows of the police-deaths table.
data_pd.head()

Unnamed: 0,person,dept,eow,cause,cause_short,date,year,canine,dept_name,state
0,Constable Darius Quimby,"Albany County Constable's Office, NY","EOW: Monday, January 3, 1791",Cause of Death: Gunfire,Gunfire,1791-01-03,1791,False,Albany County Constable's Office,NY
1,Sheriff Cornelius Hogeboom,"Columbia County Sheriff's Office, NY","EOW: Saturday, October 22, 1791",Cause of Death: Gunfire,Gunfire,1791-10-22,1791,False,Columbia County Sheriff's Office,NY
2,Deputy Sheriff Isaac Smith,"Westchester County Sheriff's Department, NY","EOW: Thursday, May 17, 1792",Cause of Death: Gunfire,Gunfire,1792-05-17,1792,False,Westchester County Sheriff's Department,NY
3,Marshal Robert Forsyth,United States Department of Justice - United S...,"EOW: Saturday, January 11, 1794",Cause of Death: Gunfire,Gunfire,1794-01-11,1794,False,United States Department of Justice - United S...,US
4,Sheriff Robert Maxwell,"Greenville County Sheriff's Office, SC","EOW: Sunday, November 12, 1797",Cause of Death: Gunfire,Gunfire,1797-11-12,1797,False,Greenville County Sheriff's Office,SC


#### Infos

In [5]:
# (number of rows, number of columns) of the police-deaths table.
data_pd.shape

(22800, 10)

In [6]:
# Per-column non-null counts and dtypes, plus the memory footprint.
data_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22800 entries, 0 to 22799
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   person       22800 non-null  object        
 1   dept         22800 non-null  object        
 2   eow          22800 non-null  object        
 3   cause        22800 non-null  object        
 4   cause_short  22800 non-null  object        
 5   date         22800 non-null  datetime64[ns]
 6   year         22800 non-null  int64         
 7   canine       22800 non-null  bool          
 8   dept_name    22800 non-null  object        
 9   state        22800 non-null  object        
dtypes: bool(1), datetime64[ns](1), int64(1), object(7)
memory usage: 1.6+ MB


22800 enregistrements en tout (policiers et chiens policiers), avec 10 colonnes d'informations par enregistrement.

#### Données manquantes

In [7]:
# Number of missing values in each column.
data_pd.isna().sum()

person         0
dept           0
eow            0
cause          0
cause_short    0
date           0
year           0
canine         0
dept_name      0
state          0
dtype: int64

Il ne nous manque aucune donnée.

#### Nombre de données totales

In [8]:
# Total number of individual values in the table: rows x columns.
n_rows, n_cols = data_pd.shape
nb_data = n_cols * n_rows
nb_data

228000

#### Données dupliquées

In [9]:
# Number of rows that exactly repeat an earlier row (all columns compared).
data_pd.duplicated().sum()

0

Nous n'avons aucune donnée dupliquée.

#### Moyenne, écart-type, minimum, etc.

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data_pd.describe()

Unnamed: 0,year
count,22800.0
mean,1951.751272
std,38.233234
min,1791.0
25%,1924.0
50%,1952.0
75%,1983.0
max,2016.0


#### Nombre de valeurs canine

In [11]:
# How many entries are police dogs (True) versus human officers (False).
data_pd["canine"].value_counts()

False    22537
True       263
Name: canine, dtype: int64

In [12]:
# Same split as relative frequencies, scaled from proportions to percentages.
100 * data_pd["canine"].value_counts(normalize=True)

False    98.846491
True      1.153509
Name: canine, dtype: float64

In [21]:
# Pie chart of the canine / non-canine split; each slice is labelled with its
# percentage to two decimals. Assigning to `_` hides the Axes repr.
_ = data_pd["canine"].value_counts().plot(kind="pie", autopct="%1.2f%%")

#### Répartition par État

In [20]:
# _ = data_pd.state.value_counts().plot(kind="pie")

L'État comptant le plus de morts de policiers est le Texas, suivi par la Californie et l'État de New York.