In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the incident records from the CSV file (path is relative to the working directory)
gun_df = pd.read_csv(filepath_or_buffer='gun-violence-data_01-2013_03-2018.csv')

In [None]:
# Preview the first two rows of the frame
gun_df.iloc[:2]

In [None]:
# (number of rows, number of columns)
len(gun_df.index), len(gun_df.columns)

In [None]:
# Number of missing values in each column
gun_df.isna().sum()

In [6]:
# Discard the columns this notebook treats as unwanted
gun_df = gun_df.drop(
    [
        'gun_stolen', 'gun_type', 'location_description', 'n_guns_involved',
        'participant_name', 'participant_relationship', 'notes', 'sources',
    ],
    axis='columns',
)

In [7]:
# Impute gaps in the retained columns with each column's most frequent value
for col in (
    'congressional_district',
    'participant_age',
    'participant_age_group',
    'participant_gender',
    'participant_status',
    'participant_type',
    'state_house_district',
    'state_senate_district',
):
    # mode() returns a Series; its first entry is used as the fill value
    most_common = gun_df[col].mode()[0]
    gun_df[col] = gun_df[col].fillna(most_common)

In [8]:
# Parse the incident date into a datetime column
gun_df['date'] = pd.to_datetime(gun_df['date'])

# Cast the (already imputed) district columns to integers in one pass
gun_df = gun_df.astype({
    'congressional_district': int,
    'state_house_district': int,
    'state_senate_district': int,
})

In [None]:
# Summary statistics with the default quartile percentiles spelled out
gun_df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Derive the calendar year of every incident from its date
gun_df['year'] = gun_df['date'].dt.year

# Tally incidents per year, ordered chronologically by the year index
incident_count_per_year = gun_df['year'].value_counts(sort=False).sort_index()

print(incident_count_per_year)

In [None]:
# Interactive Plotly bar chart of the yearly incident totals;
# the hover box shows the year and its incident count
fig = px.bar(
    x=incident_count_per_year.index,
    y=incident_count_per_year.values,
    title='Number of Gun Violence Incidents per Year',
    labels=dict(x='Year', y='Number of Incidents'),
).update_traces(hovertemplate='Year: %{x}<br>Number of Incidents: %{y}')
fig.show()

In [None]:
# Tag every incident with its monthly period
gun_df['year_month'] = gun_df['date'].dt.to_period('M')

# Count incidents per month, then cast the period labels to strings
# for Plotly compatibility
incident_count_per_month = (
    gun_df.groupby('year_month')
    .size()
    .reset_index(name='incident_count')
    .assign(year_month=lambda monthly: monthly['year_month'].astype(str))
)

# Line chart of the monthly totals with slanted tick labels on the x-axis
fig = px.line(
    data_frame=incident_count_per_month,
    x='year_month',
    y='incident_count',
    hover_data={'year_month': True, 'incident_count': True},
    labels={'year_month': 'Year-Month', 'incident_count': 'Number of Incidents'},
    title='Gun Violence Incidents Over Time',
)
fig.update_xaxes(tickangle=45)
fig.show()

In [None]:
# The ten states with the most recorded incidents (value_counts sorts descending)
top_states = gun_df['state'].value_counts().iloc[:10]
print(top_states)

In [None]:
# Bar chart of the ten states with the most incidents (states on the y-axis)
states_fig, states_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_states.values, y=top_states.index, ax=states_ax)
states_ax.set_title('Top 10 States with the Highest Number of Gun Violence Incidents')
states_ax.set_xlabel('Number of Incidents')
states_ax.set_ylabel('State')
plt.show()

In [None]:

# Calculate total incidents, killed, and injured per state,
# keeping the ten states with the most incidents
state_summary = (
    gun_df.groupby('state')
    .agg({'incident_id': 'count', 'n_killed': 'sum', 'n_injured': 'sum'})
    .reset_index()
)
state_summary = state_summary.sort_values(by='incident_id', ascending=False).head(10)

# Reshape to long form (one row per state/measure) so seaborn can colour by measure.
# The raw column names are swapped for readable labels so the legend is self-explanatory.
severity_labels = {'incident_id': 'Incidents', 'n_killed': 'Killed', 'n_injured': 'Injured'}
state_summary_melted = (
    state_summary
    .rename(columns=severity_labels)
    .melt(id_vars='state', var_name='severity', value_name='count')
)

# Grouped (side-by-side) bar plot: one bar per measure for every state.
# Note: barplot with `hue` dodges the bars; it does not stack them.
plt.figure(figsize=(12, 8))
sns.barplot(x='count', y='state', hue='severity', data=state_summary_melted,
            palette={'Incidents': 'b', 'Killed': 'r', 'Injured': 'g'},
            edgecolor='black')

# Customize plot
plt.title('Gun Violence Incidents by State and Severity')
# The bars show incident, killed and injured totals, so the axis is a generic count
plt.xlabel('Count')
plt.ylabel('State')
plt.legend(title='Severity', loc='upper right')
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:

# Keep only the incidents that have both coordinates
has_coordinates = gun_df['latitude'].notna() & gun_df['longitude'].notna()
filtered_gun_df = gun_df[has_coordinates]

# Plot every remaining incident as a point on a map;
# hovering shows the state plus the killed/injured counts
fig = px.scatter_mapbox(
    filtered_gun_df,
    lat='latitude',
    lon='longitude',
    hover_name='state',
    hover_data=['n_killed', 'n_injured'],
    color_discrete_sequence=['blue'],
    zoom=3,
)

# Base map style, title and margins applied in a single layout update
fig.update_layout(
    mapbox_style='open-street-map',
    title='Geographical Distribution of Gun Violence Incidents in the US',
    margin=dict(l=0, r=0, t=50, b=0),
)
fig.show()

### Machine learning models to predict the number of people killed (`n_killed`)

In [17]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [None]:
# RandomForest Regressor

# Select features and target
features = ['state', 'n_injured', 'incident_characteristics']
target = 'n_killed'

# Create feature matrix X and target vector y
X = gun_df[features].copy()
y = gun_df[target]

# Encode categorical features as integer codes.
# Neither column is imputed by the cleaning cell, so any missing entry would reach
# the encoder as a float NaN mixed in with strings. Missing entries are therefore
# mapped to an explicit 'Unknown' category and everything is cast to str first.
label_encoders = {}
for column in ['state', 'incident_characteristics']:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column].fillna('Unknown').astype(str))
    label_encoders[column] = le  # kept so the codes can be mapped back to labels

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model; n_jobs=-1 builds the trees on all available cores
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predictions on test set
y_pred = model.predict(X_test)

# Evaluation metrics on the held-out split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

In [None]:
# Linear Regressor

# Select features and target
features = ['state', 'n_injured', 'incident_characteristics']
target = 'n_killed'

# Create feature matrix X and target vector y
X = gun_df[features].copy()
y = gun_df[target]

# Encode categorical features as integer codes.
# Neither column is imputed by the cleaning cell, so any missing entry would reach
# the encoder as a float NaN mixed in with strings. Missing entries are therefore
# mapped to an explicit 'Unknown' category and everything is cast to str first.
# NOTE(review): integer label codes impose an arbitrary ordering on nominal
# categories, which a linear model treats as a numeric scale; consider one-hot
# encoding for this model. Left as-is because the Decision Tree cell below reuses
# X_train / X_test from this cell.
label_encoders = {}
for column in ['state', 'incident_characteristics']:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column].fillna('Unknown').astype(str))
    label_encoders[column] = le  # kept so the codes can be mapped back to labels

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predictions on test set
y_pred_linear = linear_model.predict(X_test)

# Evaluation metrics on the held-out split
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)

print('Linear Regression Results:')
print(f'Mean Squared Error: {mse_linear}')
print(f'R-squared: {r2_linear}')

In [None]:
# Decision Tree Regressor

# Fit a decision tree on the X_train / y_train split already in scope from the
# earlier train_test_split call (this cell does not create its own split)
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out split
y_pred_tree = tree_model.predict(X_test)
mse_tree = mean_squared_error(y_true=y_test, y_pred=y_pred_tree)
r2_tree = r2_score(y_true=y_test, y_pred=y_pred_tree)

print('Decision Tree Regression Results:')
print(f'Mean Squared Error: {mse_tree}')
print(f'R-squared: {r2_tree}')