# Let's discover our dataset !
### Reading and showing the dataset

In [4]:
import csv

# Parse the whole CSV into memory inside a `with` block so the file handle is
# closed as soon as the rows have been read (the bare open() call never closed it).
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for file objects passed to csv.reader.
with open('guns.csv', encoding='utf-8', newline='') as guns_file:
    file = csv.reader(guns_file)
    # Every row of the file, header row included, as a list of string lists.
    dataset = list(file)

In [6]:
# Preview the first five parsed rows (the header row is still part of `dataset`).
print(dataset[:5])

[['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education'], ['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4']]


Let's extract the header :

In [7]:
# Split the parsed rows: the first row carries the column names, the rest are
# the records. `header` is kept as a slice, i.e. a one-element list of rows.
header, guns = dataset[:1], dataset[1:]

In [8]:
# Show the column names; `header` is a list containing the single header row.
print(header)

[['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education']]


In [9]:
# Preview the first five data records (header row excluded).
print(guns[:5])

[['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4'], ['5', '2012', '02', 'Suicide', '0', 'M', '31', 'White', '100', 'Other specified', '2']]


### Exploring the dataset

### Number of firearm deaths in the United States:

In [24]:
# Year column (index 1) of every record.
years = [gun[1] for gun in guns]

# Each row of `guns` is one death, so the total is simply the number of rows;
# len() replaces the manual counting loop.
death = len(guns)
print(death)

100798


In [12]:
# Tally the number of records per year value found in `years`.
year_counts = {}
for year in years:
    # dict.get supplies 0 the first time a year is seen.
    year_counts[year] = year_counts.get(year, 0) + 1

print(year_counts)

{'2012': 33563, '2013': 33636, '2014': 33599}


As we can see, our dataset covers 2012 to 2014, and over those 3 years approximately 100,000 people died from firearm use.

### Let's define the dates more precisely

In [15]:
import datetime

# One datetime per record, built from the year (column 1) and month (column 2)
# strings; the day is fixed to 1 since only year and month are read from the row.
dates = [datetime.datetime(int(row[1]), int(row[2]), 1) for row in guns]

In [16]:
# Preview the first five generated dates.
print(dates[:5])

[datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 1, 1, 0, 0), datetime.datetime(2012, 2, 1, 0, 0), datetime.datetime(2012, 2, 1, 0, 0)]


In [17]:
# Number of records for each distinct (year, month) date in `dates`.
dates_counts = {}

for date in dates:
    # dict.get supplies 0 the first time a date is seen.
    dates_counts[date] = dates_counts.get(date, 0) + 1

print(dates_counts)

{datetime.datetime(2012, 1, 1, 0, 0): 2758, datetime.datetime(2012, 2, 1, 0, 0): 2357, datetime.datetime(2012, 3, 1, 0, 0): 2743, datetime.datetime(2012, 4, 1, 0, 0): 2795, datetime.datetime(2012, 5, 1, 0, 0): 2999, datetime.datetime(2012, 6, 1, 0, 0): 2826, datetime.datetime(2012, 7, 1, 0, 0): 3026, datetime.datetime(2012, 8, 1, 0, 0): 2954, datetime.datetime(2012, 9, 1, 0, 0): 2852, datetime.datetime(2012, 10, 1, 0, 0): 2733, datetime.datetime(2012, 11, 1, 0, 0): 2729, datetime.datetime(2012, 12, 1, 0, 0): 2791, datetime.datetime(2013, 1, 1, 0, 0): 2864, datetime.datetime(2013, 2, 1, 0, 0): 2375, datetime.datetime(2013, 3, 1, 0, 0): 2862, datetime.datetime(2013, 4, 1, 0, 0): 2798, datetime.datetime(2013, 5, 1, 0, 0): 2806, datetime.datetime(2013, 6, 1, 0, 0): 2920, datetime.datetime(2013, 7, 1, 0, 0): 3079, datetime.datetime(2013, 8, 1, 0, 0): 2859, datetime.datetime(2013, 9, 1, 0, 0): 2742, datetime.datetime(2013, 10, 1, 0, 0): 2808, datetime.datetime(2013, 11, 1, 0, 0): 2758, datet

As we can see in this output, firearm-related deaths seem to be more frequent during the summer season (May/June to September/October).

## Exploring the dataset by sex and race

#### Gender equality ?

In [33]:
# Count the records per value of the sex column (index 5).
gender_counts = {}

for row in guns:
    gender = row[5]
    gender_counts[gender] = gender_counts.get(gender, 0) + 1
print(gender_counts)

# `death` is the total number of records, so death/100 is one percent of it.
# Each share is derived from its own count: computing the female share as
# "100 - male" would silently absorb any other value present in the column and
# would carry over the rounding already applied to the male share.
male = round(gender_counts['M'] / (death/100), 2)
female = round(gender_counts['F'] / (death/100), 2)
print('% of male death : ' + str(male))
print('% of female death : ' + str(female))


{'M': 86349, 'F': 14449}
% of male death : 85.67
% of female death : 14.33


More than 85% of firearm deaths concern men, which doesn't seem very fair... (of course, we should compare this with the male/female ratio in the US).

### Origins

In [39]:
# Count the records per value of the race column (index 7).
origin_counts = {}

for row in guns:
    origin = row[7]
    origin_counts[origin] = origin_counts.get(origin, 0) + 1

print(origin_counts)

# One percent of the total number of records; every share is count / one_percent,
# rounded to two decimals.
one_percent = death/100
white = round(origin_counts['White'] / one_percent, 2)
black = round(origin_counts['Black'] / one_percent, 2)
hispanic = round(origin_counts['Hispanic'] / one_percent, 2)
asian = round(origin_counts['Asian/Pacific Islander'] / one_percent, 2)
native = round(origin_counts['Native American/Native Alaskan'] / one_percent, 2)

# sep='' glues label and value together exactly like string concatenation would.
print('% of white people deaths :', white, sep='')
print('% of black people deaths :', black, sep='')
print('% of hispanic people deaths :', hispanic, sep='')
print('% of asian people deaths :', asian, sep='')
print('% of native people deaths :', native, sep='')

{'Asian/Pacific Islander': 1326, 'White': 66237, 'Native American/Native Alaskan': 917, 'Black': 23296, 'Hispanic': 9022}
% of white people deaths :65.71
% of black people deaths :23.11
% of hispanic people deaths :8.95
% of asian people deaths :1.32
% of native people deaths :0.91


Most firearm-related deaths concern white people, followed by black people. (Native Americans come last, but this dataset only covers 2012 to 2014; things could have been different a few centuries ago.) These numbers need to be compared with each group's share of the population before drawing hasty conclusions...

# Personal bonuses : Causes 

In [41]:
# Count the records per value of the intent column (index 3).
type_counts = {}

for row in guns:
    death_type = row[3]
    type_counts[death_type] = type_counts.get(death_type, 0) + 1

print(type_counts)

# One percent of the total number of records; shares are rounded to two decimals.
# Only three intents are reported as percentages; other values present in
# `type_counts` are printed in the raw tally above but not here.
one_percent = death/100
suicide = round(type_counts['Suicide'] / one_percent, 2)
homicide = round(type_counts['Homicide'] / one_percent, 2)
accidental = round(type_counts['Accidental'] / one_percent, 2)

# sep='' glues label and value together exactly like string concatenation would.
print('% of suicide : ', suicide, sep='')
print('% of homicide : ', homicide, sep='')
print('% of accidents : ', accidental, sep='')

{'Suicide': 63175, 'Undetermined': 807, 'Accidental': 1639, 'Homicide': 35176, 'NA': 1}
% of suicide : 62.67
% of homicide : 34.9
% of accidents : 1.63


The most common type of firearm-related death in the United States is suicide (more than 60%); about a third are homicides and very few are accidents.

### Let's try to find relations between gender, dates, causes and origins.

#### Types of deaths by gender :

In [54]:
# Per-gender tallies of the intent column (index 3), selected through the sex
# column (index 5). Rows whose sex is neither 'M' nor 'F' are not counted.
male_type_counts = {}
female_type_counts = {}
counts_by_gender = {'M': male_type_counts, 'F': female_type_counts}

# Intents for which a men/women ratio is printed; any other intent value is
# tallied above but left out of the ratio loop.
list_type = ['Accidental', 'Homicide', 'Suicide', 'Undetermined']
for row in guns:
    gender = row[5]
    death_type = row[3]
    bucket = counts_by_gender.get(gender)
    if bucket is not None:
        bucket[death_type] = bucket.get(death_type, 0) + 1

print('Male : ' + str(male_type_counts))
print('Female : ' + str(female_type_counts))

print('\nCauses Ratio men/women : \n')
for cause in list_type:
    cause_ratio = round(male_type_counts[cause] / female_type_counts[cause], 2)
    print(cause + ' : ' + str(cause_ratio))

# Overall men/women ratio, from the per-gender totals in `gender_counts`.
overall_ratio = round(gender_counts['M']/gender_counts['F'], 2)
print('Genral ratio : ' + str(overall_ratio))

Male : {'Suicide': 54486, 'Undetermined': 638, 'Accidental': 1421, 'Homicide': 29803, 'NA': 1}
Female : {'Suicide': 8689, 'Homicide': 5373, 'Undetermined': 169, 'Accidental': 218}

Causes Ratio men/women : 

Accidental : 6.52
Homicide : 5.55
Suicide : 6.27
Undetermined : 3.78
Genral ratio : 5.98


In general, about 6 men die from firearms for every woman; the ratio is slightly higher for accidents and suicides and slightly lower for homicides.

### Let's have a look at seasons and types of death

In [68]:
# Monthly tallies for the three intents reported below; rows with any other
# intent are skipped.
suicide_month = {}
homicide_month = {}
accident_month = {}
month_counts_by_intent = {
    'Suicide': suicide_month,
    'Accidental': accident_month,
    'Homicide': homicide_month,
}

# Two-digit month code (as stored in column 2) -> month name.
months = {
    '01': "January", '02': "February", '03': "March", '04': "April",
    '05': "May", '06': "June", '07': "July", '08': "August",
    '09': "September", '10': "October", '11': "November", '12': "December",
}

for row in guns:
    # The month lookup happens for every row, whatever its intent.
    month = months[row[2]]
    death_type = row[3]
    bucket = month_counts_by_intent.get(death_type)
    if bucket is not None:
        bucket[month] = bucket.get(month, 0) + 1

print('Suicides per months : \n' + str(suicide_month))
print('Homicides per months : \n' + str(homicide_month))
print('Accidents per months : \n' + str(accident_month))

Suicides per months : 
{'January': 5220, 'February': 4732, 'March': 5309, 'April': 5438, 'May': 5506, 'June': 5367, 'July': 5514, 'August': 5421, 'September': 5343, 'October': 5256, 'November': 5086, 'December': 4983}
Homicides per months : 
{'March': 2780, 'April': 2845, 'June': 3130, 'July': 3269, 'August': 3125, 'September': 2966, 'October': 2968, 'December': 3191, 'January': 2829, 'February': 2178, 'May': 2976, 'November': 2919}
Accidents per months : 
{'February': 127, 'July': 149, 'August': 164, 'September': 118, 'December': 179, 'January': 152, 'March': 134, 'April': 98, 'May': 115, 'November': 160, 'October': 131, 'June': 112}


I would have expected more suicides in December/January, due to loneliness during the holidays, than our data shows.
Instead, suicides appear to be more frequent in late spring and summer.

The same goes for homicides: except for a spike in December, the summer months are the deadliest.

The trends are different for accidents: they seem more random (accidental indeed), but we can see that fewer accidents occur in April, May, June and September.

# Back to the project Udemy

## Dataset population

In [69]:
# Read the census rows eagerly inside a `with` block so the file handle is closed
# right away (the bare open() call never closed it). `file_2` therefore holds the
# parsed rows as a list instead of a live reader; it can still be iterated or
# passed to list() exactly as before. newline='' is what the csv module
# recommends for file objects handed to csv.reader.
with open('census.csv', 'r', newline='') as census_file:
    file_2 = list(csv.reader(census_file))

In [70]:
# Collect every row yielded by `file_2` into a list of rows.
census = list(file_2)

In [71]:
# Bare expression: the notebook shows the value of `census` as the cell output.
census

[['Id',
  'Year',
  'Id',
  'Sex',
  'Id',
  'Hispanic Origin',
  'Id',
  'Id2',
  'Geography',
  'Total',
  'Race Alone - White',
  'Race Alone - Hispanic',
  'Race Alone - Black or African American',
  'Race Alone - American Indian and Alaska Native',
  'Race Alone - Asian',
  'Race Alone - Native Hawaiian and Other Pacific Islander',
  'Two or More Races'],
 ['cen42010',
  'April 1, 2010 Census',
  'totsex',
  'Both Sexes',
  'tothisp',
  'Total',
  '0100000US',
  '',
  'United States',
  '308745538',
  '197318956',
  '44618105',
  '40250635',
  '3739506',
  '15159516',
  '674625',
  '6984195']]

### Calculate the ratio deaths/origins

In [74]:
# Population per race label used in the guns data. The figures are the
# 'Race Alone - ...' values of the census row displayed above;
# 'Asian/Pacific Islander' adds the 'Asian' and the
# 'Native Hawaiian and Other Pacific Islander' figures together.
mapping = {
    'Asian/Pacific Islander': 15159516 + 674625,
    'Black': 40250635,
    'Native American/Native Alaskan': 3739506,
    'Hispanic': 44618105,
    'White': 197318956,
}

# Deaths per 100,000 people for each race present in `origin_counts`.
ratio = {k: (v / mapping[k]) * 100000 for k, v in origin_counts.items()}
print(ratio)

{'Asian/Pacific Islander': 8.374309664161762, 'White': 33.56849303419181, 'Native American/Native Alaskan': 24.521955573811088, 'Black': 57.8773477735196, 'Hispanic': 20.220491210910907}


### Filter by homicide

In [81]:
# Parallel lists, one entry per record: the intent column (index 3) and the
# race column (index 7). Position i in both lists refers to the same record.
intents = [record[3] for record in guns]
races = [record[7] for record in guns]

In [85]:
# Homicide count per race. Every race encountered gets an entry, even when none
# of its records is a homicide.
homicides_counts = {}
for race, intent in zip(races, intents):
    homicides_counts.setdefault(race, 0)
    if intent == 'Homicide':
        homicides_counts[race] += 1

# Homicides per 100,000 people, using the population figures in `mapping`.
homicide_ratio = {k: (v / mapping[k]) * 100000 for k, v in homicides_counts.items()}

print(homicide_ratio)

{'Asian/Pacific Islander': 3.530346230970155, 'White': 4.6356417981453335, 'Native American/Native Alaskan': 8.717729026240365, 'Black': 48.471284987180944, 'Hispanic': 12.627161104219914}
