In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Function to extract data from a single page
def extract_data_from_page(url, timeout=30):
    """Download one listing page and return its first table as ``(headers, rows)``.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : int or float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    tuple
        ``(headers, data)``. ``headers`` is the list of stripped ``<th>`` texts
        from the first table row; ``data`` is a list of rows, each a list of
        stripped ``<td>`` texts. Rows whose cell count differs from the header
        count are skipped with a printed message. ``(None, [])`` is returned
        when the request fails, the status code is not 200, or the page has no
        table rows.
    """
    # Kept under its own name so it is not confused with the column headers
    # parsed from the table further down.
    request_headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Without a timeout a stalled server would block the scrape forever.
        response = requests.get(url, headers=request_headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to retrieve page: {exc}")
        return None, []

    if response.status_code != 200:
        print(f"Failed to retrieve page with status code: {response.status_code}")
        return None, []

    soup = BeautifulSoup(response.text, 'html.parser')

    # find() yields None when the page has no <table>; treat that (and a table
    # without rows) as "no data" instead of raising AttributeError/IndexError.
    table = soup.find('table')
    rows = table.find_all('tr') if table is not None else []
    if not rows:
        print("No table rows found on the page.")
        return None, []

    # The first row carries the column names.
    headers = [header.text.strip() for header in rows[0].find_all('th')]
    print(f"Extracted headers: {headers} (Count: {len(headers)})")

    data = []
    for row in rows[1:]:  # Skip the header row
        columns = row.find_all('td')
        if len(columns) == len(headers):
            data.append([col.text.strip() for col in columns])
        else:
            print(f"Skipped a row with mismatched columns: {len(columns)} columns found")

    return headers, data

# Main function to scrape all pages with a limit
def scrape_all_pages(base_url, limit=5000):
    """Scrape consecutive ``?page=N`` pages until one yields no rows or ``limit`` is hit.

    Parameters
    ----------
    base_url : str
        URL to which ``?page=<n>`` is appended, starting at page 1.
    limit : int, optional
        Maximum number of rows to return (default 5000).

    Returns
    -------
    tuple
        ``(headers, rows)`` where ``headers`` are the column names from the
        first page that supplied any (``None`` if no page did) and ``rows`` is
        the accumulated row list truncated to ``limit`` entries.
    """
    all_data = []
    column_headers = None
    page = 1
    while True:
        url = f"{base_url}?page={page}"
        print(f"Scraping page {page}...")
        page_headers, data = extract_data_from_page(url)

        # Remember the first headers seen. The page that ends the loop may
        # return None (failed request) and must not replace them.
        if column_headers is None and page_headers:
            column_headers = page_headers

        if not data:
            break

        all_data.extend(data)
        if len(all_data) >= limit:
            print(f"Reached limit of {limit} rows. Stopping extraction.")
            break
        page += 1

    # Limit the data to the specified number of rows
    return column_headers, all_data[:limit]

# Base URL of the mass shooting data
base_url = "https://www.gunviolencearchive.org/mass-shooting"

# Scrape all pages with a limit of 5000 rows
headers, limited_data = scrape_all_pages(base_url, limit=5000)

df = pd.DataFrame(limited_data, columns=headers)

if df.empty:
    # A failed scrape must not overwrite a previously saved dataset with an
    # empty file that later cells would then fail to read.
    print("No rows were scraped; incident_dataset.csv was not written.")
else:
    print(df)

    df.to_csv("incident_dataset.csv", index=False)
    print("Data successfully saved to incident_dataset.csv")


Scraping page 1...
Extracted headers: ['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address', 'Victims Killed', 'Victims Injured', 'Suspects Killed', 'Suspects Injured', 'Suspects Arrested', 'Operations'] (Count: 11)
Scraping page 2...
Extracted headers: ['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address', 'Victims Killed', 'Victims Injured', 'Suspects Killed', 'Suspects Injured', 'Suspects Arrested', 'Operations'] (Count: 11)
Scraping page 3...
Extracted headers: ['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address', 'Victims Killed', 'Victims Injured', 'Suspects Killed', 'Suspects Injured', 'Suspects Arrested', 'Operations'] (Count: 11)
Scraping page 4...
Extracted headers: ['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address', 'Victims Killed', 'Victims Injured', 'Suspects Killed', 'Suspects Injured', 'Suspects Arrested', 'Operations'] (Count: 11)
Scraping page 5...
Extracted headers: ['Incident ID', 'Incident Date', '

In [None]:
import requests
import pandas as pd

# Endpoint queried for the crime records, returned as JSON.
url = "https://data.cityofchicago.org/resource/ijzp-q8t2.json"

# Query parameters: at most 10000 rows, newest first, dated after 2023-01-01.
params = {
    "$limit": 10000,
    "$offset": 0,
    "$where": "date > '2023-01-01T00:00:00'",
    "$order": "date DESC"
}

# An explicit timeout keeps the cell from hanging indefinitely if the
# server stops responding.
response = requests.get(url, params=params, timeout=60)

if response.status_code == 200:
    data = response.json()
    df = pd.DataFrame(data)
    print(df.head())
    # Persisted so later cells can reload the data without calling the API again.
    df.to_csv("chicago_crime_data.csv", index=False)
    print("Data saved to chicago_crime_data.csv")
else:
    print(f"Error: {response.status_code}, {response.text}")


         id case_number                     date                 block  iucr  \
0  13679950    JH527821  2024-11-30T00:00:00.000     021XX N TRIPP AVE  0910   
1  13678262    JH525798  2024-11-30T00:00:00.000   089XX S BRANDON AVE  2825   
2  13678841    JH526472  2024-11-30T00:00:00.000   019XX W CORTLAND ST  0460   
3  13684035    JH532783  2024-11-30T00:00:00.000       071XX W 60TH ST  0610   
4  13678560    JH526207  2024-11-30T00:00:00.000  054XX W BERENICE AVE  2820   

          primary_type              description location_description  arrest  \
0  MOTOR VEHICLE THEFT               AUTOMOBILE               STREET   False   
1        OTHER OFFENSE  HARASSMENT BY TELEPHONE            APARTMENT   False   
2              BATTERY                   SIMPLE        BAR OR TAVERN    True   
3             BURGLARY           FORCIBLE ENTRY      OTHER (SPECIFY)   False   
4        OTHER OFFENSE         TELEPHONE THREAT            RESIDENCE   False   

   domestic  ... ward community_area f

In [None]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet
import plotly.graph_objects as go

# Load the scraped incidents written by the scraping cell.
data = pd.read_csv('incident_dataset.csv')

# Data Preprocessing: parse the dates, keep only the columns the forecast
# uses, and drop incomplete rows in one chained step (no in-place mutation).
data['Incident Date'] = pd.to_datetime(data['Incident Date'])
data = data.loc[
    :, ['Incident Date', 'Victims Killed', 'Victims Injured', 'City Or County']
].dropna()

# Function to forecast incidents based on user input for city and time frame
def forecast_incidents(city='', forecast_days=30):
    """Forecast and plot daily victim counts, optionally restricted to one city.

    Victims killed are forecast with an ARIMA(5, 1, 0) model and victims
    injured with Prophet. Each forecast is shown in its own Plotly figure next
    to the historical series.

    Reads the module-level ``data`` frame, whose 'Incident Date' column must
    already be datetime-typed (the preprocessing above converts it).

    Parameters
    ----------
    city : str, optional
        Case-insensitive text matched literally (not as a regular expression)
        against 'City Or County'. An empty string uses every row.
    forecast_days : int, optional
        Number of days to forecast after the last observed date (default 30).

    Returns
    -------
    None
        Figures are displayed as a side effect. A message is printed and the
        function returns early when no rows match or fewer than two daily
        observations are available.
    """
    if city:
        # regex=False: the value comes from user input, so characters such as
        # "." or "(" must be matched literally instead of parsed as a pattern.
        city_data = data[data['City Or County'].str.contains(city, case=False, regex=False)]
    else:
        city_data = data

    if city_data.empty:
        print(f"No data available for {city}.")
        return

    # Sum victims per calendar day on a regular daily DatetimeIndex. Days
    # without incidents become 0, so the series really is evenly spaced; both
    # models and the forecast dates below assume one step equals one day.
    daily_city_incidents = (
        city_data.set_index('Incident Date')[['Victims Killed', 'Victims Injured']]
        .sort_index()
        .resample('D')
        .sum()
    )

    if len(daily_city_incidents) < 2:
        # Prophet refuses to fit on fewer than two rows.
        print(f"Not enough data to forecast for {city if city else 'all cities'}.")
        return

    # Time-Series Analysis using ARIMA for Victims Killed
    model_arima_killed = ARIMA(daily_city_incidents['Victims Killed'], order=(5, 1, 0))
    model_arima_fit_killed = model_arima_killed.fit()

    # Forecasting future incidents for Victims Killed
    forecast_arima_killed = model_arima_fit_killed.forecast(steps=forecast_days)

    # Round up to whole victims and floor at zero, matching the Prophet
    # forecast below; a count cannot be negative.
    forecast_arima_killed = np.maximum(np.ceil(forecast_arima_killed), 0)

    # Prepare dates for forecasted values
    forecast_dates = pd.date_range(start=daily_city_incidents.index[-1] + pd.Timedelta(days=1), periods=forecast_days)

    # Time-Series Analysis using Prophet for Victims Injured.
    # Prophet expects the columns to be called 'ds' (date) and 'y' (value).
    prophet_data_injured = daily_city_incidents.reset_index().rename(
        columns={'Incident Date': 'ds', 'Victims Killed': 'killed', 'Victims Injured': 'y'}
    )

    # Initialize and fit the Prophet model for Injured victims
    model_prophet_injured = Prophet()
    model_prophet_injured.fit(prophet_data_injured[['ds', 'y']])

    # Create a dataframe for future dates to predict injuries
    future_dates_injured = model_prophet_injured.make_future_dataframe(periods=forecast_days)
    forecast_prophet_injured = model_prophet_injured.predict(future_dates_injured)

    forecast_prophet_injured['yhat'] = np.ceil(forecast_prophet_injured['yhat']).clip(lower=0)

    arima_fig = go.Figure()

    # Add historical data trace for Victims Killed
    arima_fig.add_trace(go.Scatter(x=daily_city_incidents.index,
                                     y=daily_city_incidents['Victims Killed'],
                                     mode='lines',
                                     name='Historical Victims Killed',
                                     line=dict(color='blue', width=2)))

    # Add forecasted data trace from ARIMA for Victims Killed
    arima_fig.add_trace(go.Scatter(x=forecast_dates,
                                     y=forecast_arima_killed,
                                     mode='lines',
                                     name='ARIMA Forecast (Killed)',
                                     line=dict(color='red', width=2)))

    # Update layout for ARIMA figure
    arima_fig.update_layout(title=f'ARIMA Forecast of Victims Killed in {city if city else "All Cities"}',
                             xaxis_title='Date',
                             yaxis_title='Number of Victims',
                             xaxis=dict(showgrid=True),
                             yaxis=dict(showgrid=True))

    arima_fig.show()

    prophet_fig = go.Figure()

    # Add historical data trace for Victims Injured
    prophet_fig.add_trace(go.Scatter(x=prophet_data_injured['ds'],
                                      y=prophet_data_injured['y'],
                                      mode='lines',
                                      name='Historical Victims Injured',
                                      line=dict(color='green', width=2)))

    # Add forecasted data trace from Prophet for Victims Injured
    prophet_fig.add_trace(go.Scatter(x=forecast_prophet_injured['ds'],
                                      y=forecast_prophet_injured['yhat'],
                                      mode='lines',
                                      name='Prophet Forecast (Injured)',
                                      line=dict(color='orange', width=2)))

    # Update layout for Prophet figure
    prophet_fig.update_layout(title=f'Prophet Forecast of Victims Injured in {city if city else "All Cities"}',
                              xaxis_title='Date',
                              yaxis_title='Number of Victims',
                              xaxis=dict(showgrid=True),
                              yaxis=dict(showgrid=True))

    prophet_fig.show()

user_city = input("Enter the city name (leave blank for all cities): ").strip()
user_forecast_days = input("Enter the number of days to forecast (default is 30): ").strip()

# Parse with int() directly: str.isdigit() accepts characters such as "²"
# that int() rejects. Anything unparsable or below 1 falls back to the
# default, since a zero-step forecast cannot be produced.
try:
    user_forecast_days = int(user_forecast_days)
except ValueError:
    user_forecast_days = 30
if user_forecast_days < 1:
    user_forecast_days = 30

forecast_incidents(user_city, user_forecast_days)

Enter the city name (leave blank for all cities): 
Enter the number of days to forecast (default is 30): 


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmp71_icxxq/roxmp3et.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmp71_icxxq/u0mcx08i.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=86748', 'data', 'file=/tmp/tmp71_icxxq/roxmp3et.json', 'init=/tmp/tmp71_icxxq/u0mcx08i.json', 'output', 'file=/tmp/tmp71_icxxq/prophet_modelplwppkp1/prophet_model-20241208132259.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
13:22:59 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
13:22:59 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] 

In [None]:
pip install pandas matplotlib seaborn scikit-learn xgboost



In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import folium
from folium.plugins import MarkerCluster
import plotly.express as px
import matplotlib.pyplot as plt

# Load the crime data
data_file = "chicago_crime_data.csv"
df = pd.read_csv(data_file)

# Filter for gun-related incidents.
# Raise instead of calling exit(): exit() ends the interpreter session rather
# than just stopping this cell.
if 'description' not in df.columns:
    raise ValueError("The 'description' column is missing from the dataset.")
gun_related = df[df['description'].str.contains("GUN", case=False, na=False)]

# Ensure latitude and longitude columns are present
if not {'latitude', 'longitude'}.issubset(gun_related.columns):
    raise ValueError("The dataset does not contain latitude and longitude columns.")

# dropna/astype return a new frame, so the 'cluster' column added below is
# written to an independent object rather than to a slice of `df`.
gun_related = (
    gun_related
    .dropna(subset=['latitude', 'longitude'])
    .astype({'latitude': float, 'longitude': float})
)

# Prepare data for DBSCAN
coordinates = gun_related[['latitude', 'longitude']].values

# Apply DBSCAN
# With metric='haversine' the inputs are radians and so is eps: a distance on
# the ground must be divided by the Earth's radius. 0.01 would be about 64 km.
EARTH_RADIUS_KM = 6371.0088
eps = 1.0 / EARTH_RADIUS_KM  # 1 km neighbourhood radius, in radians
min_samples = 10  # Minimum points to form a cluster
db = DBSCAN(eps=eps, min_samples=min_samples, metric='haversine', algorithm='ball_tree').fit(np.radians(coordinates))

# Add cluster labels to the data (-1 marks noise points)
gun_related['cluster'] = db.labels_

# Save the clustered data to a new CSV
output_file = "gun_related_hotspots.csv"
gun_related.to_csv(output_file, index=False)
print(f"Hotspot detection results saved to {output_file}")

# Plotly Cluster Scatter Plot
scatter_plot = px.scatter(
    gun_related,
    x="longitude",
    y="latitude",
    color="cluster",
    title="Crime Hotspots for Gun-Related Incidents (DBSCAN)",
    color_continuous_scale="Viridis",
    hover_data=["description", "cluster"],
    labels={"cluster": "Cluster ID"}
)
scatter_plot.update_layout(
    xaxis_title="Longitude",
    yaxis_title="Latitude",
    template="plotly_dark",
    margin={"r": 10, "t": 50, "l": 10, "b": 10}
)
scatter_plot.show()

# Create a base map centered at Chicago
chicago_map = folium.Map(location=[41.8781, -87.6298], zoom_start=11)

# Add clustered points to the map; noise points (label -1) are left out.
marker_cluster = MarkerCluster()
for _, row in gun_related.iterrows():
    cluster_label = row['cluster']
    if cluster_label != -1:
        folium.Marker(
            location=[row['latitude'], row['longitude']],
            popup=f"Cluster: {cluster_label}",
            icon=folium.Icon(color='blue')
        ).add_to(marker_cluster)

marker_cluster.add_to(chicago_map)

# Display the map inline
from IPython.display import display, HTML

map_html = chicago_map._repr_html_()
display(HTML(map_html))


Hotspot detection results saved to gun_related_hotspots.csv


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Encode the target variable
# NOTE(review): this cell reads a 'Severity' column (and later 'State',
# 'Operations' and 'Address') from `data`; confirm which cell creates
# 'Severity' and that `data` still holds those columns on a fresh
# Restart & Run All.
labelEncoder = LabelEncoder()
data['Severity'] = labelEncoder.fit_transform(data['Severity'])

# Exclude 'Operations' column in addition to other columns
X = data.drop(columns=['Severity', 'Incident Date', 'State', 'City Or County', 'Operations'])
y = data['Severity']

# Identify categorical and numerical features
# Every remaining column except 'Address' is treated as numeric.
catFeatures = ['Address']
numFeatures = X.drop(columns=catFeatures).columns

# Create a ColumnTransformer to handle categorical and numerical features separately
# Numeric columns pass through unchanged (no scaling); 'Address' is one-hot
# encoded into a dense array, with unseen categories ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numFeatures),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), catFeatures),
    ])

# The encoder is fitted on the full dataset, i.e. before the train/test split below.
X = preprocessor.fit_transform(X)

# 70/30 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import plotly.express as px
import pandas as pd

# Train three classifiers on the same split and collect a per-model
# classification report (as a dict) for the comparison charts below.

# Logistic Regression
logRegModel = LogisticRegression(max_iter=1000, random_state=42)  # Increase max_iter for convergence
logRegModel.fit(X_train, y_train)
logRegPreds = logRegModel.predict(X_test)
logRegReport = classification_report(y_test, logRegPreds, output_dict=True, zero_division=0)

# Decision Tree
dtModel = DecisionTreeClassifier(random_state=42)
dtModel.fit(X_train, y_train)
dtPreds = dtModel.predict(X_test)
dtReport = classification_report(y_test, dtPreds, output_dict=True, zero_division=0)

# SVM
svmModel = SVC(kernel='rbf', gamma=0.1, C=1.0, random_state=42)
svmModel.fit(X_train, y_train)
svmPreds = svmModel.predict(X_test)
svmReport = classification_report(y_test, svmPreds, output_dict=True, zero_division=0)

# Accuracy plus macro-averaged precision/recall/F1, one entry per model and
# in the same order as the "Model" list.
model_predictions = [logRegPreds, dtPreds, svmPreds]
model_reports = [logRegReport, dtReport, svmReport]
accuracy_scores = {
    "Model": ["Logistic Regression", "Decision Tree", "SVM"],
    "Accuracy": [accuracy_score(y_test, preds) for preds in model_predictions],
    "Precision": [report["macro avg"]['precision'] for report in model_reports],
    "Recall": [report["macro avg"]['recall'] for report in model_reports],
    "F1-Score": [report["macro avg"]['f1-score'] for report in model_reports]
}

# Convert to DataFrame for Plotly
scores_df = pd.DataFrame(accuracy_scores)

# Plotting Accuracy Comparison
fig_accuracy = px.bar(
    scores_df,
    x="Model",
    y="Accuracy",
    color="Accuracy",
    title="Model Accuracy Comparison",
    labels={"Accuracy": "Accuracy Score"},
    template="plotly_dark",
    text="Accuracy"
)

fig_accuracy.show()

# Plotting Precision, Recall, and F1-Score Comparison
# barmode="group" draws the three metrics side by side; the default stacks
# them, which adds unrelated scores together and hides the comparison.
fig_metrics = px.bar(
    scores_df,
    x="Model",
    y=["Precision", "Recall", "F1-Score"],
    title="Model Performance Comparison (Precision, Recall, F1-Score)",
    labels={"value": "Score", "variable": "Metric"},
    template="plotly_dark",
    text="value",
    barmode="group"
)

fig_metrics.show()


In [None]:
import plotly.graph_objects as go

# Coefficients of the first row of coef_, restricted to the leading columns,
# which belong to the numerical ('num') transformer.
numeric_feature_count = len(numFeatures)
feature_importance = logRegModel.coef_[0][:numeric_feature_count]

# Matching names, taken from the fitted 'num' transformer.
numeric_transformer = preprocessor.named_transformers_['num']
feature_names = list(numeric_transformer.get_feature_names_out())

print("Feature Importance (Logistic Regression - Numerical Features):")
for name, coef in zip(feature_names, feature_importance):
    print(f"{name}: {coef:.4f}")

# Bar chart for feature importance using Plotly: build the trace and the
# layout separately, then combine them into the figure.
importance_bar = go.Bar(
    x=feature_names,
    y=feature_importance,
    marker=dict(color=feature_importance, colorscale="Viridis"),
    text=feature_importance.round(4),
    textposition="outside"
)
importance_layout = go.Layout(
    title="Feature Importance (Logistic Regression - Numerical Features)",
    xaxis=dict(title="Feature", tickangle=-45),
    yaxis=dict(title="Importance"),
    template="plotly_dark",
    margin=dict(b=150)
)
fig_feature_importance = go.Figure(data=[importance_bar], layout=importance_layout)

fig_feature_importance.show()


Feature Importance (Logistic Regression - Numerical Features):
Incident ID: -0.0000
Victims Killed: 3.9687
Victims Injured: 3.5983
Suspects Killed: 0.3062
Suspects Injured: 0.4254
Suspects Arrested: -0.8272


In [None]:
import pandas as pd
import plotly.express as px

# Reload the full scraped dataset for the exploratory charts.
data = pd.read_csv("incident_dataset.csv")

# Number of incidents per state, most frequent first, as a two-column frame.
state_counts = (
    data['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='Incident Count')
)

# Bar chart coloured by the same count it displays.
fig = px.bar(
    state_counts,
    x='State',
    y='Incident Count',
    color='Incident Count',
    color_continuous_scale='Viridis',
    labels={'Incident Count': 'Number of Incidents'},
    title='Incidents per State'
)
fig.show()


In [None]:

# Histogram of deaths per incident, using the `data` frame loaded earlier.
fig = px.histogram(
    data,
    x='Victims Killed',
    nbins=20,
    color_discrete_sequence=['crimson'],
    labels={'Victims Killed': 'Number of Victims Killed'},
    title='Distribution of Victims Killed'
)
fig.show()


In [None]:

# Parse the dates so they sort and plot chronologically.
data['Incident Date'] = pd.to_datetime(data['Incident Date'])

# One row per date with the number of incidents recorded on it.
date_counts = (
    data.groupby('Incident Date')
    .size()
    .rename('Incident Count')
    .reset_index()
)

fig = px.line(
    date_counts,
    x='Incident Date',
    y='Incident Count',
    labels={'Incident Date': 'Date', 'Incident Count': 'Number of Incidents'},
    title='Incidents Over Time'
)
fig.show()


In [None]:
# Victims injured against suspects injured, one point per incident,
# coloured by state with the city shown on hover.
fig = px.scatter(
    data,
    x='Victims Injured',
    y='Suspects Injured',
    color='State',
    hover_data=['City Or County'],
    labels={'Victims Injured': 'Victims Injured', 'Suspects Injured': 'Suspects Injured'},
    title='Victims Injured vs. Suspects Injured'
)
fig.show()


In [None]:

# Total victim and suspect deaths per state.
state_summary = (
    data.groupby('State')[['Victims Killed', 'Suspects Killed']]
    .sum()
    .rename(columns={
        'Victims Killed': 'Victims_Killed',
        'Suspects Killed': 'Suspects_Killed',
    })
    .reset_index()
)

# Grouped bars: the two death totals side by side for each state.
fig = px.bar(
    state_summary,
    x='State',
    y=['Victims_Killed', 'Suspects_Killed'],
    barmode='group',
    labels={'value': 'Number of Deaths', 'variable': 'Category'},
    title='Incident Severity by State'
)
fig.show()


In [None]:
import plotly.figure_factory as ff

# Pairwise correlations between the four casualty counts.
corr_matrix = data[['Victims Killed', 'Victims Injured', 'Suspects Killed', 'Suspects Injured']].corr()

# Without annotation_text the cells are labelled with the raw z values at
# full float precision; two decimals keep the labels readable.
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns.tolist(),
    y=corr_matrix.columns.tolist(),
    annotation_text=corr_matrix.round(2).values.tolist(),
    colorscale='Viridis',
    showscale=True
)
fig.update_layout(title='Correlation Matrix of Incident Metrics')
fig.show()


In [None]:

# Share of incidents per state, drawn from the `state_counts` frame.
fig = px.pie(
    state_counts,
    names='State',
    values='Incident Count',
    color_discrete_sequence=px.colors.sequential.RdBu,
    title='Incident Distribution by State'
)
fig.show()


In [None]:
import plotly.graph_objects as go

# Marker appearance: colour encodes the number of victims injured.
marker_style = dict(size=5, color=data['Victims Injured'], colorscale='Viridis', opacity=0.8)

# One point per incident: killed (x), injured (y), suspects arrested (z),
# with the city as hover text.
incident_trace = go.Scatter3d(
    x=data['Victims Killed'],
    y=data['Victims Injured'],
    z=data['Suspects Arrested'],
    mode='markers',
    marker=marker_style,
    text=data['City Or County']
)
fig = go.Figure(data=[incident_trace])

# Axis titles mirror the columns plotted on each axis.
scene_axes = dict(
    xaxis_title='Victims Killed',
    yaxis_title='Victims Injured',
    zaxis_title='Suspects Arrested'
)
fig.update_layout(title='3D Scatter Plot of Incident Metrics', scene=scene_axes)
fig.show()
