In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw accidental drug-related deaths CSV into the untouched source frame.
RAW_DATA_PATH = "Accidental_Drug_Related_Deaths_2012-2024.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [None]:
# Work on a copy so the raw frame `df` stays untouched while cleaning.
df_cld = df.copy()

In [None]:
# Standardizing the column names: start by listing the current headers.
print(df_cld.columns) # see what columns we have

In [None]:
# The raw headers mix upper and lower case and some contain spaces.
# Normalize every header to lowercase snake_case ("x_y"), one step at a time.
snake_headers = df_cld.columns.str.strip()                              # drop leading/trailing whitespace
snake_headers = snake_headers.str.lower()                               # lowercase everything
snake_headers = snake_headers.str.replace(" ", "_")                     # "x y" -> "x_y"
snake_headers = snake_headers.str.replace(r"[^\w\s]", "", regex=True)   # drop anything that is not a letter, digit, underscore or whitespace
df_cld.columns = snake_headers

In [None]:
# Preview the first two rows to check the renamed headers.
print(df_cld.head(2)) # confirm that the column standardization worked

In [None]:
# Several place columns arrive fully upper-cased; residence_city, for example,
# holds "BRIDGEPORT", "WATERBURY", "NORWICH", "BROOKFIELD", "NEW HAVEN".
# Trim surrounding whitespace and title-case each of them.

col_to_standardize = ["residence_city", "residence_county", "injury_city", "injury_county"]

for place_col in col_to_standardize:
    df_cld[place_col] = df_cld[place_col].str.strip().str.title()

print(df_cld.head(2))




In [None]:
# Pull latitude and longitude out of the "(lat, lon)" part of residencecitygeo.
# The regex captures the two comma-separated values inside the parentheses.
rc_coords = df_cld["residencecitygeo"].str.extract(r"\((.*?),\s(.*?)\)")

# The captures are strings; cast each to float (rows without a match stay NaN).
df_cld["rc_latitude"] = rc_coords[0].astype(float)
df_cld["rc_longitude"] = rc_coords[1].astype(float)

In [None]:
# Same "(lat, lon)" extraction, this time from injurycitygeo.
ic_coords = df_cld["injurycitygeo"].str.extract(r"\((.*?),\s(.*?)\)")

# Cast the captured strings to float (rows without a match stay NaN).
df_cld["ic_latitude"] = ic_coords[0].astype(float)
df_cld["ic_longitude"] = ic_coords[1].astype(float)


In [None]:
# Same "(lat, lon)" extraction, this time from deathcitygeo.
dc_coords = df_cld["deathcitygeo"].str.extract(r"\((.*?),\s(.*?)\)")

# Cast the captured strings to float (rows without a match stay NaN).
df_cld["dc_latitude"] = dc_coords[0].astype(float)
df_cld["dc_longitude"] = dc_coords[1].astype(float)

In [None]:
# residence_state has some nulls. The geo columns (deathcitygeo, and likewise
# injurycitygeo / residencecitygeo) embed a two-letter state code, so take the
# code found in deathcitygeo and use it wherever residence_state is missing.
# NOTE(review): this assumes the death-city state equals the residence state — confirm.

state_from_death_geo = df_cld["deathcitygeo"].str.extract(r"(?:^|,)\s*([A-Z]{2})\n", expand=False)
df_cld["residence_state"] = df_cld["residence_state"].fillna(state_from_death_geo)

In [None]:
# Confirm the result: count the residence_state values that are still null.
print(df_cld["residence_state"].isnull().sum())

In [None]:
# residence_city and residence_county have no corresponding data in the geo
# columns, but their rows carry other fields needed for the analysis, so label
# the gaps "Unknown" instead of dropping the rows.
# Assign the filled Series back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is deprecated and does not
# update the parent frame under pandas Copy-on-Write.
df_cld["residence_city"] = df_cld["residence_city"].fillna("Unknown")
df_cld["residence_county"] = df_cld["residence_county"].fillna("Unknown")


print(df_cld["residence_city"].isnull().sum())

In [None]:
# Standardize the race column.
print(df_cld["race"].unique())

# Ordered (regex, label) rules, matched against the lower-cased, trimmed raw
# value; the first rule that matches a row wins. Labels are based on the
# official US race categories (White, Black or African American, American
# Indian or Alaska Native, Asian, Native Hawaiian or Other Pacific Islander).
race_rules = [
    ("black", "Black or African American"),
    ("white", "White"),
    ("asian", "Asian"),
    ("american indian|native american|lenni lenape", "American Indian"),
    ("haitian", "Black or African American"),
    ("portugese", "White"),  # spelling kept as written; presumably mirrors the data — TODO confirm
    ("puerto rican", "Other"),
    ("hawaiian", "Native Hawaiian"),
    ("chinese", "Asian"),
    ("korean", "Asian"),
    ("japanese", "Asian"),
]

race_raw = df_cld["race"].str.lower().str.strip()

# Default for values no rule matches: the title-cased raw text. Only these
# leftovers are title-cased, so the canonical labels keep their exact spelling
# (title-casing everything would yield "Black Or African American").
race_clean = race_raw.str.title()

unassigned = pd.Series(True, index=race_raw.index)
for pattern, label in race_rules:
    hits = unassigned & race_raw.str.contains(pattern, na=False)
    race_clean.loc[hits] = label
    unassigned &= ~hits

# One row had "Other (Specify)". Compare against the title-cased form: the raw
# text was lower-cased above, so matching the original capitalisation before
# title-casing can never succeed.
race_clean = race_clean.replace({"Other (Specify)": "Other"})

# Fill remaining empty fields with "Unknown".
df_cld["race"] = race_clean.fillna("Unknown")

# Check unique values
print(df_cld["race"].unique())

#now we have a clean output


In [None]:
# Clean up the ethnicity column, starting again from the raw "Ethnicity" values.
# Missing entries become "Unknown"; everything is trimmed and title-cased.
ethnicity_clean = df["Ethnicity"].fillna("Unknown").str.strip().str.title()

# The column holds several spellings of Hispanic / non-Hispanic. Collapse them
# to the two standard values "Hispanic" and "Non-Hispanic".

# Exact (case-insensitive) matches of the "not Hispanic" answers.
is_non_hispanic = ethnicity_clean.str.match(
    r"^No, Not Spanish/Hispanic/Latino$|^Not Spanish/Hispanic/Latino$|^n$",
    case=False, na=False
)
ethnicity_clean.loc[is_non_hispanic] = "Non-Hispanic"

# Anything else that mentions a Hispanic origin. Rows already labelled
# "Non-Hispanic" or "Unknown" are excluded first, since "Non-Hispanic" itself
# contains the word "Hispanic".
already_final = ethnicity_clean.isin(["Non-Hispanic", "Unknown"])
mentions_hispanic = ethnicity_clean.str.contains(
    r"Yes|Hispanic|Latino|Puerto Rican|Mexican|Cuban|Other Spanish",
    case=False, na=False
)
ethnicity_clean.loc[~already_final & mentions_hispanic] = "Hispanic"

df_cld["ethnicity"] = ethnicity_clean

print(df_cld["ethnicity"].unique())


In [None]:
# The drug columns hold "Y" for a positive finding and NaN otherwise, plus a
# few free-text variants. For forecasting, encode them as 1 (present) and
# 0 (absent).

drug_columns = [
    'heroin', 'cocaine', 'fentanyl', 'fentanyl_analogue', 'oxycodone',
    'oxymorphone', 'ethanol', 'hydrocodone', 'benzodiazepine', 'methadone',
    'methamphetamine', 'amphet', 'tramad', 'hydromorphone',
    'morphine_not_heroin', 'xylazine', 'gabapentin', 'opiate_nos',
    'heroinmorphcodeine', 'any_opioid'
]

# Free-text variants found while exploring the data:
#   fentanyl            -> 'Y POPS', 'Y (PTCH)'  (likely "yes", so 1)
#   morphine_not_heroin -> 'NO RX BUT STRAWS', 'STOLE MEDS', 'PCP NEG'  (likely "no", so 0)
# "P" is mapped to NaN and then zero-filled together with the other missing values.
drug_flag_map = {
    "Y": 1, "y": 1,
    "Y POPS": 1,
    "Y (PTCH)": 1,
    "N": 0, "n": 0,
    "P": np.nan,
    "NO RX BUT STRAWS": 0,
    "STOLE MEDS": 0,
    "PCP NEG": 0,
}

drug_flags = df_cld[drug_columns].replace(drug_flag_map).fillna(0)

# Surface any value the mapping does not cover before casting, so the failure
# names the offending column and values instead of a bare int() parse error.
unmapped = {}
for col in drug_columns:
    leftovers = drug_flags.loc[~drug_flags[col].isin([0, 1]), col].unique()
    if len(leftovers) > 0:
        unmapped[col] = list(leftovers)
if unmapped:
    raise ValueError(f"Unmapped values in drug columns: {unmapped}")

df_cld[drug_columns] = drug_flags.astype('int8')

# Confirm that only 0 and 1 remain. A DataFrame has no .unique() method, so
# flatten the values to one array first.
print(pd.unique(df_cld[drug_columns].to_numpy().ravel()))



In [None]:
# Check which dtype each of the drug columns currently has.
print(df_cld[drug_columns].dtypes)

In [None]:
# Clean up the location column.
# (print(df_cld["location"].unique()) shows the raw values.)

# Canonical location -> every raw spelling that should collapse into it.
location_variants = {
    "Home": ["Decedent’s Home", "Decedent's Home", "Residence"],
    "Hospital": [
        "Hospital",
        "Hiospital",
        "Hospital - ER/Outpatient",
        "Hospital - Inpatient",
        "Hospital - Dead On Arrival",
    ],
    "Hospice": ["Hospice", "Hospice Facility"],
    "Care Facility": ["Nursing Home", "Convalescent Home", "Assisted Living"],
    "Shelter": ["Shelter"],
    "Other": ["Other", "Other (Specify)"],
}

# Invert into the raw -> canonical lookup that Series.replace expects.
locatn_map = {
    variant: canonical
    for canonical, variants in location_variants.items()
    for variant in variants
}

location_clean = df_cld["location"].str.strip()
location_clean = location_clean.replace(locatn_map)
df_cld["location"] = location_clean.fillna("Unknown")

print(df_cld["location"].unique()) #cleaned


In [None]:
# Convert the age column to the 'Int64' integer dtype.
df_cld["age"] = df_cld["age"].astype('Int64')

# Print the converted column to check it.
print(df_cld["age"])

In [None]:

# Count the missing values in the sex column (9 nulls were seen here).
missing_sex_count = df_cld["sex"].isnull().sum()
print(missing_sex_count)

# Those rows still carry data we need, so label the gaps "Unknown" rather than dropping them.
sex_filled = df_cld["sex"].fillna("Unknown")
df_cld["sex"] = sex_filled

print(df_cld["sex"].unique()) # no nulls should remain

In [None]:
# Double-check how the dataframe currently looks (first 100 rows).
print(df_cld.head(100))

In [None]:
# Parse the date column into pandas datetimes.

df_cld["date"] = pd.to_datetime(df_cld["date"])

# Print every column's dtype to confirm the conversion.
print(df_cld.dtypes)

In [None]:
# Clean up the manner_of_death column.

print(df_cld["manner_of_death"].unique())

# Every misspelled or differently-cased variant collapses to "Accident".
manner_fixes = dict.fromkeys(["accident", "Acciddent", "ACCIDENT"], "Accident")

manner_clean = df_cld["manner_of_death"].str.strip()
manner_clean = manner_clean.replace(manner_fixes)
df_cld["manner_of_death"] = manner_clean.fillna("Unknown")

print(df_cld["manner_of_death"].unique())

In [None]:


# Drop the columns that are not needed for the analysis.

# Start from a copy so df_cld keeps every column.
df_cleaned = df_cld.copy()

print(df_cleaned.columns)

columns_to_drop = [
    "location_if_other",
    "heroin_death_certificate_dc",
    "other", "other_opioid",
    "description_of_injury",
    "death_state",
    "death_county",
    "residencecitygeo",
    "injurycitygeo",
    "deathcitygeo",
    "injury_place",
    "other_significant_conditions",
]

df_cleaned = df_cleaned.drop(columns=columns_to_drop)


In [None]:
# List the distinct death_city values.
print(df_cleaned["death_city"].unique())

In [None]:
#Lets categorize the cause_of_death column as it has values like "Acute Intoxication by the Combined Effects of Fentanyl and Cocaine"

#Lets create a function we would apply to that column
def categorize_cause_of_death(cause):
    """Map a free-text cause of death to a substance category label.

    The text is searched, case-insensitively, for Fentanyl, Cocaine, Heroin
    and Ethanol (in that order). Every substance found is joined with "+" and
    suffixed with "-related", e.g. "Fentanyl+Cocaine-related".

    Parameters
    ----------
    cause : str or missing value
        The raw cause-of-death text. Anything that is not a string (NaN,
        None, ...) is treated as missing.

    Returns
    -------
    str
        "Unknown" for a missing value, "Other" when none of the tracked
        substances is mentioned, otherwise the joined category label.
    """
    # Missing cells arrive as float NaN / None; `in` on those raises TypeError.
    if not isinstance(cause, str):
        return 'Unknown'

    # Compare case-insensitively so "FENTANYL" and "fentanyl" are both caught.
    text = cause.casefold()
    tracked_substances = ('Fentanyl', 'Cocaine', 'Heroin', 'Ethanol')
    categories = [name for name in tracked_substances if name.casefold() in text]

    if not categories:
        return 'Other'
    return '+'.join(categories) + '-related'


# Replace each cause_of_death text with the label returned by categorize_cause_of_death.
df_cleaned['cause_of_death'] = df_cleaned['cause_of_death'].apply(categorize_cause_of_death)

# Show the distinct category labels that resulted.
print(df_cleaned["cause_of_death"].unique())

In [None]:
# Clean up the death_city column: trim, title-case, fix one bad entry, fill gaps.

death_city_clean = df_cleaned["death_city"].str.strip().str.title()
# "6430" is replaced by "Groton" (presumably a ZIP code typed as the city — TODO confirm).
death_city_clean = death_city_clean.replace({"6430": "Groton"})
df_cleaned["death_city"] = death_city_clean.fillna("Unknown")

print(df_cleaned["death_city"].unique())

In [None]:
# Clean up the injury_county column: trim, title-case, fix a typo, fill gaps.

df_cleaned["injury_county"] = (
    df_cleaned["injury_county"]
    .str.strip()
    .str.title()
    .replace({"Mnew London": "New London"})
    .fillna("Unknown")
)

print(df_cleaned["injury_county"].unique())

# Clean up injury_state: upper-case the values, collapse spelled-out or
# misspelled names to their two-letter code, and use a single "Unknown" label.
# Filling NaN with "Unknown" while mapping the typo "UKNOWN" to "UNKNOWN" would
# leave two different spellings of the same category, so both map to "Unknown".
df_cleaned["injury_state"] = (
    df_cleaned["injury_state"]
    .str.strip()
    .str.upper()
    .replace({
        "MASSACHUSSETS": "MA",
        "CONNECTICUT": "CT",
        "UKNOWN": "Unknown",
        "UNKNOWN": "Unknown",
    })
    .fillna("Unknown")
)
print(df_cleaned["injury_state"].unique())


In [None]:
# Re-check the distinct death_city values.
print(df_cleaned["death_city"].unique())

# Keep a copy of the cleaned frame as a backup.
df_cleaned_backup = df_cleaned.copy()

In [None]:
# List the distinct race values in the cleaned frame.
print(df_cleaned["race"].unique())

In [None]:
# List the columns that remain in the cleaned frame.
print(df_cleaned.columns)

In [None]:
# Forecasting section.
# Prophet is used for the forecasts (install once with: pip install prophet cmdstanpy).

# 1. Forecast total overdose deaths over time.
# First count the deaths recorded in each calendar month.
month_of_death = df_cleaned["date"].dt.to_period("M")
monthly_counts = df_cleaned.groupby(month_of_death).size()
death_per_month = monthly_counts.reset_index(name="death_count")

# Convert the monthly periods back to timestamps.
death_per_month["date"] = death_per_month["date"].dt.to_timestamp()




In [None]:
# Print the monthly death counts to check them.
print(death_per_month)

In [None]:
from prophet import Prophet

# Prophet expects the time column to be named "ds" and the target "y".
df_cleaned_forecast = death_per_month.rename(columns={"date": "ds", "death_count": "y"})


model = Prophet()
model.fit(df_cleaned_forecast)

# Predict the next 12 months. The monthly history is stamped at month start
# (Period.to_timestamp defaults to the start of the period), so extend it with
# the month-start alias "MS". The month-end alias would put the first "future"
# point inside the last observed month.
future = model.make_future_dataframe(periods=12, freq="MS")
forecast = model.predict(future)


# Keep only the columns we need, under friendlier names. rename() returns a new
# frame, so no column labels are assigned on a slice of `forecast`.
forecasted_monthly_death = forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].rename(
    columns={
        "ds": "date",
        "yhat": "forecast",
        "yhat_lower": "lower_bound",
        "yhat_upper": "upper_bound",
    }
)



In [None]:
# Preview the first rows of the monthly forecast.
print(forecasted_monthly_death.head())

In [None]:
# 2. Yearly trends for two prevalent drugs: fentanyl and cocaine.

# Add a year column derived from the date.
df_cleaned["year"] = df_cleaned["date"].dt.year

# Flag the rows where each drug is marked 1, then count the flags per year.
drug_positive = df_cleaned[["fentanyl", "cocaine"]].eq(1)
yearly_trends = (
    drug_positive
    .groupby(df_cleaned["year"])
    .sum()
    .rename(columns={"fentanyl": "fentanyl_cases", "cocaine": "cocaine_cases"})
    .reset_index()
)


print(yearly_trends)

In [None]:
from prophet import Prophet


def _forecast_yearly_cases(trends, cases_col, substance, periods=3):
    """Fit Prophet on log1p-transformed yearly counts and return a tidy forecast.

    Parameters
    ----------
    trends : pandas.DataFrame
        One row per year with a "year" column and the count column `cases_col`.
    cases_col : str
        Name of the column in `trends` holding the yearly case counts.
    substance : str
        Label written to the "substance" column of the result.
    periods : int, default 3
        Number of future years to forecast.

    Returns
    -------
    pandas.DataFrame
        Columns year, forecast, lower_bound, upper_bound, substance, covering
        the historical years plus `periods` future years, on the original
        (count) scale and clipped at 0.
    """
    history = pd.DataFrame({
        "ds": pd.to_datetime(trends["year"], format="%Y"),
        # Log-transform y to handle the large range of the counts.
        "y": np.log1p(trends[cases_col]),
    })

    substance_model = Prophet(
        growth="linear",
        changepoint_prior_scale=0.3,  # flexible to capture surges
        yearly_seasonality=False
    )
    substance_model.fit(history)

    # The history is stamped on 1 January of each year, so extend it with the
    # year-start alias "YS"; the year-end alias would place each forecast on
    # 31 December of the previous year.
    future = substance_model.make_future_dataframe(periods=periods, freq="YS")
    prediction = substance_model.predict(future)

    # Copy the slice so the column edits below do not write into `prediction`.
    tidy = prediction[["ds", "yhat", "yhat_lower", "yhat_upper"]].copy()

    # Convert back from the log scale, then clip negatives to 0.
    value_cols = ["yhat", "yhat_lower", "yhat_upper"]
    tidy[value_cols] = np.expm1(tidy[value_cols]).clip(lower=0)

    tidy = tidy.rename(columns={
        "ds": "year",
        "yhat": "forecast",
        "yhat_lower": "lower_bound",
        "yhat_upper": "upper_bound",
    })
    tidy["substance"] = substance
    return tidy


# Fentanyl forecast
fentanyl_forecast = _forecast_yearly_cases(yearly_trends, "fentanyl_cases", "Fentanyl")

# Cocaine forecast
cocaine_forecast = _forecast_yearly_cases(yearly_trends, "cocaine_cases", "Cocaine")

# Combine both substances into one frame.
yearly_forecast = pd.concat([fentanyl_forecast, cocaine_forecast], ignore_index=True)

print(yearly_forecast)


In [None]:
# Combine the forecasts and the actual yearly counts into one dataframe.

# Actual counts per substance, reshaped to columns (year, actual, substance).
fentanyl_actuals = (
    yearly_trends[['year', 'fentanyl_cases']]
    .rename(columns={'fentanyl_cases': 'actual'})
    .assign(substance='Fentanyl')
)
cocaine_actuals = (
    yearly_trends[['year', 'cocaine_cases']]
    .rename(columns={'cocaine_cases': 'actual'})
    .assign(substance='Cocaine')
)

# Stack the two actuals frames.
actuals_combined = pd.concat([fentanyl_actuals, cocaine_actuals], ignore_index=True)

# Make the year columns the same type (datetime) on both sides of the merge.
actuals_combined['year'] = pd.to_datetime(actuals_combined['year'], format='%Y')
yearly_forecast['year'] = pd.to_datetime(yearly_forecast['year'])

# Left-merge so every forecast row is kept, then sort by substance and year.
merged_table = (
    yearly_forecast
    .merge(actuals_combined, how='left', on=['year', 'substance'])
    .sort_values(by=['substance', 'year'])
    .reset_index(drop=True)
)


print(merged_table)

In [None]:
# Forecast the average age of victims for the next 3 years.

# Aggregate the average age per year.
avg_age_yearly = df_cleaned.groupby("year")["age"].mean().reset_index()
avg_age_yearly = avg_age_yearly.rename(columns={"age": "y"})
# age is stored as nullable Int64, so its mean is a nullable float; hand
# Prophet a plain float column instead (assumes Prophet expects NumPy floats — TODO confirm).
avg_age_yearly["y"] = avg_age_yearly["y"].astype(float)
avg_age_yearly["ds"] = pd.to_datetime(avg_age_yearly["year"], format="%Y")

# Prophet model
model_age = Prophet(growth="linear", changepoint_prior_scale=0.3, yearly_seasonality=False)
model_age.fit(avg_age_yearly[["ds","y"]])

# Forecast the next 3 years. The history is stamped on 1 January of each year,
# so use the year-start alias "YS"; the year-end alias would place each
# forecast on 31 December of the previous year.
future_age = model_age.make_future_dataframe(periods=3, freq="YS")
forecast_age = model_age.predict(future_age)

# Build the tidy result frame.
age_forecast = forecast_age[["ds","yhat","yhat_lower","yhat_upper"]].copy()
age_forecast.columns = ["year","forecast","lower_bound","upper_bound"]
age_forecast["metric"] = "Average Age"



In [None]:
# Print the yearly average ages and the age forecast to check them.

print(avg_age_yearly)
print(age_forecast)

In [None]:
# Forecast yearly deaths by race.


# Count how many deaths happened per year and race.
count_race_year = df_cleaned.groupby(["year", "race"]).size().reset_index(name="deaths")

forecasts = []

for race, group in count_race_year.groupby("race"):
    if group["year"].nunique() < 2:
        print(f"Skipping {race} — not enough data for forecast.")
        continue

    df_race = group.rename(columns={"year": "ds", "deaths": "y"})
    df_race["ds"] = pd.to_datetime(df_race["ds"], format="%Y")

    # Loop-specific names, so the notebook-level `model`, `future` and
    # `forecast` from the monthly forecast are not overwritten.
    race_model = Prophet(
        changepoint_prior_scale=0.3,
        yearly_seasonality=False,
        growth="linear"
    )
    race_model.fit(df_race)

    # The history is stamped on 1 January of each year, so extend it with the
    # year-start alias "YS"; the year-end alias would place each forecast on
    # 31 December of the previous year.
    race_future = race_model.make_future_dataframe(periods=3, freq="YS")
    race_prediction = race_model.predict(race_future)

    # Copy the slice before editing it, so nothing is written into `race_prediction`.
    results = race_prediction[["ds", "yhat", "yhat_lower", "yhat_upper"]].copy()

    # Death counts cannot be negative; clip as the substance forecasts do.
    value_cols = ["yhat", "yhat_lower", "yhat_upper"]
    results[value_cols] = results[value_cols].clip(lower=0)

    results["race"] = race
    forecasts.append(results)

# pd.concat raises an opaque error on an empty list; say what actually happened.
if not forecasts:
    raise ValueError("No race group has at least two years of data to forecast.")

# Combine everything
race_forecasts = pd.concat(forecasts, ignore_index=True)
race_forecasts = race_forecasts.rename(columns={
    "ds": "year",
    "yhat": "forecast",
    "yhat_lower": "lower_bound",
    "yhat_upper": "upper_bound"
})

# Report the point forecast as whole deaths.
race_forecasts["forecast"] = race_forecasts["forecast"].round().astype(int)


In [None]:
# Final clean-up of the date columns, to avoid errors or wrong type inference
# when the data is pushed downstream (Snowflake / Tableau).


# Rename the columns whose names are also keywords in Snowflake.
df_cleaned = df_cleaned.rename(columns={'date': 'event_date', 'year': 'event_year'})

# Re-standardize event_date: parse it (unparseable values become NaT), drop any
# timezone, then format it as a 'YYYY-MM-DD HH:MM:SS' string.
event_timestamps = pd.to_datetime(df_cleaned['event_date'], errors='coerce')
event_timestamps = event_timestamps.dt.tz_localize(None)
df_cleaned['event_date'] = event_timestamps.dt.strftime('%Y-%m-%d %H:%M:%S')

print(df_cleaned['event_date'].dtype)


print(df_cleaned.columns)

In [None]:
# Push the data to the Snowflake data warehouse for staging and production.
# One-time setup (run in a terminal or in its own cell):
#   pip install --user --upgrade "snowflake-connector-python[pandas]" pandas numpy

import os

import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas


# Credentials are read from the environment so they never live in the
# notebook. A missing variable raises KeyError before any connection attempt.
# NOTE(review): ACCOUNTADMIN is a very broad role for a data load — consider a narrower one.
conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ["SNOWFLAKE_PASSWORD"],
    account=os.environ["SNOWFLAKE_ACCOUNT"],
    warehouse='COMPUTE_WH',
    database='LEARNSQL',
    schema='PUBLIC',
    role="ACCOUNTADMIN"
)





In [None]:
# Confirm that we are connected to Snowflake.
cur = conn.cursor()
try:
    cur.execute("SELECT CURRENT_USER(), CURRENT_ACCOUNT(), CURRENT_REGION();")
    result = cur.fetchone()
    print("Connected as:", result)
finally:
    # Always release the cursor, even if the query fails.
    cur.close()
# Do not close `conn` here: the upload cell that follows still needs the open
# connection and closes it once the upload is done.

In [None]:
# Move the cleaned data (and, optionally, the forecasts) to Snowflake.


try:
    success_accident, chunks_accident, rows_accident, _ = write_pandas(
        conn, df_cleaned, "accident_deaths_data", database='LEARNSQL', schema='PUBLIC'
    )
    #success_forecast, chunks_forecast, rows_forecast, _ = write_pandas(conn, yearly_forecast, "FORECAST_AD_DEATHS", auto_create_table=True)

    print(f"ACTUALS_TABLE upload: success={success_accident}, rows={rows_accident}")
    #print(f"FORECAST_TABLE upload: success={success_forecast}, rows={rows_forecast}")
finally:
    # Close the connection even when the upload raises, so it is never leaked.
    conn.close()


In [None]:
# Also write the results to an Excel workbook, one sheet per frame.

excel_sheets = {
    "Raw_Cleaned_Data": df_cleaned,
    "Forecasted_Data": merged_table,
}

with pd.ExcelWriter("overdose_analysis.xlsx") as writer:
    for sheet_name, sheet_frame in excel_sheets.items():
        sheet_frame.to_excel(writer, sheet_name=sheet_name, index=False)

print("Excel file created: overdose_analysis.xlsx")