# Mass Shooting Analysis: Data Exploration

In [1]:
import json
import math
import random
from io import StringIO
from pathlib import Path

import ipywidgets as widgets
import matplotlib.pyplot as plt
import myst_nb
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
import wikipedia as wp
from IPython import get_ipython
from IPython.display import display, HTML
from ipywidgets import interact, interactive, fixed, interact_manual
from matplotlib_inline.backend_inline import set_matplotlib_formats
from plotly.subplots import make_subplots

In [2]:
# Build the path with pathlib rather than a backslash string: '\m' is an
# invalid escape sequence (recent Python versions warn about it) and a
# backslash separator only resolves on Windows.
vga_dat = pd.read_csv(Path('MSA Data') / 'msd_vga_archive.csv')

In [3]:
# Show the loaded archive frame; the rendered output elides the middle rows.
vga_dat

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,Victims Killed,Victims Injured,Suspects Killed,Suspects Injured,Suspects Arrested,Operations
0,3288844,"August 17, 2025",Virginia,Richmond,1100 block of Hull St,0,3,0,0,0,
1,3288842,"August 17, 2025",Texas,Houston,,1,0,0,0,0,
2,3288830,"August 17, 2025",Delaware,Wilmington,50 block of Vandever Ave,0,1,0,0,0,
3,3288820,"August 17, 2025",Texas,Houston,Harrisburg Blvd,0,1,0,0,0,
4,3288818,"August 17, 2025",Texas,Spring,23700 block of Pennington Hills Dr,0,1,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...
1995,3273181,"July 26, 2025",Kentucky,Whitesburg,11125 KY-160,0,0,1,0,0,
1996,3272773,"July 26, 2025",Illinois,Madison,Washington Ave and Wayne Lanter Ave,1,0,0,0,1,
1997,3272564,"July 26, 2025",Illinois,Chicago,5200 block of S Laflin St,0,1,0,0,0,
1998,3272403,"July 26, 2025",New York,Rochester (Gates),1500 Brooks Ave,1,0,0,0,1,


 Note: the download is cut off at a 2,000-record limit, so it is difficult to obtain the whole dataset.

In [4]:
# Fetch the rendered HTML of the state-population article as text.
html = wp.page("List of U.S. states and territories by population", auto_suggest=False).html()
# Give read_html a file-like object holding decoded text: pandas >= 2.1
# deprecates passing literal HTML str/bytes, and decoded text does not depend
# on the HTML parser guessing the byte encoding.
tables = pd.read_html(StringIO(html))  # parse once, reuse for the fallback
try:
    df = tables[2]  # prefer the third table on the page
except IndexError:
    df = tables[1]  # fewer tables than expected: fall back to the second
df.head(5)

Unnamed: 0,State/federal district/territory/ division/region,#,2020 pop.,#.1,2010 pop.,#.2,2000 pop.,#.3,2010â 2020 change,Geo. sort
0,Massachusetts,15,7029917,14,6547629,13,6349097,21,7.4%,NEng
1,Connecticut,29,3605944,29,3574097,29,3405565,47,0.9%,NEng
2,New Hampshire,41,1377529,42,1316470,41,1235786,30,4.6%,NEng
3,Maine,42,1362359,41,1328361,40,1274923,42,2.6%,NEng
4,Rhode Island,43,1097379,43,1052567,43,1048319,31,4.3%,NEng


In [5]:
# Fetch the rendered HTML of the 2024 mass-shootings list as text.
html = wp.page("List_of_mass_shootings_in_the_United_States_in_2024", auto_suggest=False).html()
# Give read_html a file-like object holding decoded text: pandas >= 2.1
# deprecates passing literal HTML str/bytes, and decoded text does not depend
# on the HTML parser guessing the byte encoding.
tables = pd.read_html(StringIO(html))  # parse once, reuse for the fallback
try:
    w24_df = tables[1]  # try the 2nd table first, as most pages put a contents table first
except IndexError:
    w24_df = tables[0]  # only one table on the page
w24_df.head(5)

Unnamed: 0,2024 date,Location,State or territory,Dead,Injured,Total,Description
0,December 31,Adams County,Mississippi,1,5,6,A man was killed and five others were injured ...
1,December 31,Oakland (4),California,1,3,4,A group of people were fired at in the Jack Lo...
2,December 30,Rochester (3),New York,0,4,4,Three teenagers and a young adult were shot in...
3,December 30,New York City (10),New York,0,6,6,Six people were shot in the Williamsbridge nei...
4,December 28,Signal Hill,California,1,6,7,"Seven teenagers, including a girl who died, we..."


In [6]:
# Fetch the 2025 list the same way as the 2024 one: exact article title with
# auto_suggest=False, so the lookup does not depend on Wikipedia's search
# suggestion resolving an approximate title.
# NOTE(review): the title follows the 2024 article's naming pattern - confirm
# it matches the live page.
html = wp.page("List_of_mass_shootings_in_the_United_States_in_2025", auto_suggest=False).html()
# Give read_html a file-like object holding decoded text: pandas >= 2.1
# deprecates passing literal HTML str/bytes.
tables = pd.read_html(StringIO(html))  # parse once, reuse for the fallback
try:
    w25_df = tables[1]  # try the 2nd table first, as in the 2024 cell
except IndexError:
    w25_df = tables[0]  # only one table on the page
w25_df.head(5)

Unnamed: 0,2025 date,Location,State or territory,Dead,Injured,Total,Description
0,September 12,Chicago (19),Illinois,0,4,4,Four men were injured in a shooting in the Eng...
1,September 11,Tampa (2),Florida,1,5,6,A 17-year-old boy was killed and five men were...
2,September 9,San Francisco (3),California,0,6,6,A shooting during a marijuana event in the Ind...
3,September 8,Santa Ana,California,1,3,4,A gang-related shooting killed a 13-year-old b...
4,September 7,Memphis (8),Tennessee,0,4,4,"Four juveniles were shot, aged 3 to 15, in the..."


In [7]:
# Inspect the column dtypes of the 2024 table. In the recorded output, Dead and
# Injured are reported as object while Total is int64.
w24_df.dtypes

2024 date             object
Location              object
State or territory    object
Dead                  object
Injured               object
Total                  int64
Description           object
dtype: object

In [8]:
def sec_cap(state):
    """Upper-case ``state`` when its second character is lower-case.

    Parameters
    ----------
    state : str
        State label, e.g. ``'Va'`` or ``'VA'``.

    Returns
    -------
    str
        ``state.upper()`` if the second character is a lower-case letter,
        otherwise ``state`` unchanged. Strings shorter than two characters
        have no second character to inspect and are returned unchanged
        instead of raising ``IndexError``.
    """
    if len(state) > 1 and state[1].islower():
        return state.upper()
    return state

In [9]:
def df_col_sel_sort(df):
    """Return ``df`` restricted to the analysis columns, in a fixed order.

    Columns not listed here are dropped; a listed column that is missing
    from ``df`` raises ``KeyError``.
    """
    date_cols = ['date', 'day', 'dow', 'month', 'year', 'week']
    count_cols = ['killed', 'wounded', 'total']
    place_cols = ['city', 'state']
    return df[date_cols + count_cols + place_cols]

In [10]:
def get_json_2_df(start=13, end=25):
    """Load the yearly ``20YY-data.json`` files into one cleaned DataFrame.

    Files are read from ``MSA Data/MST-Data_json`` relative to the current
    working directory, one per two-digit year from ``start`` to ``end``
    inclusive.

    Parameters
    ----------
    start : int
        Two-digit year of the first file to load (``14`` -> ``2014-data.json``).
    end : int
        Two-digit year of the last file to load. If ``end < start`` only the
        ``start`` file is loaded.

    Returns
    -------
    pandas.DataFrame
        All records with a fresh 0..n-1 index. ``killed``/``wounded`` are cast
        to int, ``city``/``state`` to str, ``date`` to ``datetime.date``
        objects, and the columns ``total``, ``dow``, ``month``, ``day``,
        ``week`` and ``year`` are added.

    Raises
    ------
    FileNotFoundError
        If one of the expected yearly files is missing.
    """
    data_dir = Path() / 'MSA Data' / 'MST-Data_json'

    # Always read the `start` file, then start+1..end, so that end < start
    # still loads the single start year.
    frames = []
    for yy in [start, *range(start + 1, end + 1)]:
        # Zero-pad so a single-digit year yields e.g. '2009', not '209'.
        year_file = data_dir / f'20{yy:02d}-data.json'
        # Context manager closes the handle even if the JSON fails to parse.
        with open(year_file, encoding='utf-8') as fh:
            frames.append(pd.json_normalize(json.load(fh)))
    # One concat over all years instead of re-copying the frame per file.
    dataf = pd.concat(frames, ignore_index=True)

    # Type Setting Columns and Adding New Columns
    dataf['killed'] = dataf['killed'].astype(int)
    dataf['wounded'] = dataf['wounded'].astype(int)
    dataf['total'] = dataf['killed'] + dataf['wounded']
    dataf['city'] = dataf['city'].astype(str)
    dataf['state'] = dataf['state'].astype(str)
    dataf['date'] = pd.to_datetime(dataf['date'])
    dataf['dow'] = dataf['date'].dt.day_name().str[:3]      # 'Mon', 'Tue', ...
    dataf['month'] = dataf['date'].dt.month_name().str[:3]  # 'Jan', 'Feb', ...
    dataf['day'] = dataf['date'].dt.day
    dataf['week'] = dataf['date'].dt.isocalendar().week
    dataf['year'] = dataf['date'].dt.year
    # Keep only the calendar date once the datetime parts have been extracted.
    dataf['date'] = dataf['date'].dt.date

    # Correcting Data Input Inaccuracies
    dataf.loc[dataf.state == 'D.C.', 'state'] = 'DC'
    dataf.loc[dataf.state == 'PUERTO RICO', 'state'] = 'PR'
    # Upper-case labels whose second character is lower-case (see sec_cap).
    dataf['state'] = dataf['state'].apply(sec_cap)
    return dataf

In [11]:
# Scope label for the year range loaded below.
scope = 'USA, 2014-24'
# Load the files for two-digit years 14 through 24 into one frame.
mst_df = get_json_2_df(start=14, end=24)
mst_df.head()

Unnamed: 0,date,killed,wounded,city,state,names,sources,total,dow,month,day,week,year
0,2014-01-01,2,2,Norfolk,VA,[Unknown],[http://wtkr.com/2014/01/01/two-men-dead-in-no...,4,Wed,Jan,1,1,2014
1,2014-01-03,1,3,New York (Queens),NY,[Unknown],[http://www.nytimes.com/2014/01/04/nyregion/gu...,4,Fri,Jan,3,1,2014
2,2014-01-04,2,2,Rock Falls,IL,"[Leonard ""Frank"" Harris Jr]",[http://wqad.com/2014/01/04/man-reportedly-kil...,4,Sat,Jan,4,1,2014
3,2014-01-05,1,3,Erie,OH,[Unknown],[http://www.goerie.com/man-shot-dead-at-privat...,4,Sun,Jan,5,1,2014
4,2014-01-05,0,4,Atlanta,GA,[Unknown],[http://www.cbsatlanta.com/story/24366624/4-me...,4,Sun,Jan,5,1,2014
