In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
np.random.seed(76)

In [3]:
years = np.arange(2012, 2023)

In [4]:
target_communities = [
    'Alpine', 'Borrego Springs', 'Camp Pendleton', 'Fallbrook', 'Jamul',
    'Laguna-Pine Valley', 'Mountain Empire', 'Oceanside-Escondido',
    'Palomar-Julian CCD', 'Pauma Valley CCD', 'Ramona CCD', 'San Diego CCD',
    'Valley Center CCD'
]

In [5]:
def example_data(years, communities):
    data = []
    for year in years:
        for community in communities:
            row = {
                'Community': community,
                'Year': year,
                'Proportion of families below the poverty level': np.random.uniform(0.05, 0.25),
                'Employment status': np.random.uniform(0.6, 0.95),
                'Recorded overdose incidents': np.random.randint(5, 150),
                'Proportion of population unhoused': np.random.uniform(0.01, 0.1),
                'Number of healthcare facilities': np.random.randint(1, 20),
                'Urban or rural': np.random.choice(['Urban', 'Rural']),
                'Opioid prescription rates': np.random.uniform(0.05, 0.3),
                'Drug-related arrests': np.random.randint(10, 300),
                'Total population': np.random.randint(10000, 100000),
                'Total population Male': np.random.randint(5000, 50000),
                'Total population Female': np.random.randint(5000, 50000),
                'Population 0 to 14 years': np.random.randint(1000, 20000),
                'Population 15 to 19 years': np.random.randint(500, 10000),
                'Population 20 to 24 years': np.random.randint(500, 10000),
                'Population 25 to 34 years': np.random.randint(2000, 15000),
                'Population 35 to 44 years': np.random.randint(2000, 15000),
                'Population 45 to 54 years': np.random.randint(2000, 15000),
                'Population 55 to 64 years': np.random.randint(2000, 15000),
                'Population 65 years and over': np.random.randint(1000, 15000),
            }
            data.append(row)
    return pd.DataFrame(data)

In [6]:
df = example_data(years, target_communities)

In [7]:
df['Urban or rural'] = df['Urban or rural'].apply(lambda x: 1 if x == 'Urban' else 0)

In [8]:
print(df.head())

         Community  Year  Proportion of families below the poverty level  \
0           Alpine  2012                                        0.112166   
1  Borrego Springs  2012                                        0.061367   
2   Camp Pendleton  2012                                        0.090575   
3        Fallbrook  2012                                        0.169965   
4            Jamul  2012                                        0.230340   

   Employment status  Recorded overdose incidents  \
0           0.886646                           31   
1           0.828941                          135   
2           0.911075                           63   
3           0.877483                          142   
4           0.926356                          116   

   Proportion of population unhoused  Number of healthcare facilities  \
0                           0.074082                               16   
1                           0.084891                               13   
2    

In [9]:
target_variable = 'Recorded overdose incidents'

In [10]:
features = df.drop(columns=['Community', 'Year', target_variable])

In [24]:
X = features
y = df[target_variable]

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

In [14]:
rf_model.fit(X_train, y_train)

In [15]:
y_pred = rf_model.predict(X_test)

In [16]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 1646.5431689655175
R-squared: -0.06486403743788438


In [17]:
results_df = pd.DataFrame({
    'Actual Overdose Incidents': y_test,
    'Predicted Overdose Incidents': y_pred
})
print("\nTest Set Predictions:")
print(results_df.head())



Test Set Predictions:
     Actual Overdose Incidents  Predicted Overdose Incidents
117                         19                         91.38
19                          59                         96.94
82                         120                         87.98
97                          25                         56.57
56                          22                         54.35


In [33]:
future_years = np.arange(2023, 2025)  # Predict for 2023, 2024, and 2025
future_data = pd.DataFrame({
    'Proportion of families below the poverty level': np.random.uniform(0.05, 0.25, size=len(future_years)),
    'Employment status': np.random.uniform(0.6, 0.95, size=len(future_years)),
    'Proportion of population unhoused': np.random.uniform(0.01, 0.1, size=len(future_years)),
    'Number of healthcare facilities': np.random.randint(1, 20, size=len(future_years)),
    'Urban or rural': np.random.choice([1, 0], size=len(future_years)),  # 1 for Urban, 0 for Rural
    'Opioid prescription rates': np.random.uniform(0.05, 0.3, size=len(future_years)),
    'Drug-related arrests': np.random.randint(10, 300, size=len(future_years)),
    'Total population': np.random.randint(10000, 100000, size=len(future_years)),  # Include Total population
    'Total population Male': np.random.randint(5000, 50000, size=len(future_years)),
    'Total population Female': np.random.randint(5000, 50000, size=len(future_years)),
    'Population 0 to 14 years': np.random.randint(1000, 20000, size=len(future_years)),
    'Population 15 to 19 years': np.random.randint(500, 10000, size=len(future_years)),
    'Population 20 to 24 years': np.random.randint(500, 10000, size=len(future_years)),
    'Population 25 to 34 years': np.random.randint(2000, 15000, size=len(future_years)),
    'Population 35 to 44 years': np.random.randint(2000, 15000, size=len(future_years)),
    'Population 45 to 54 years': np.random.randint(2000, 15000, size=len(future_years)),
    'Population 55 to 64 years': np.random.randint(2000, 15000, size=len(future_years)),
    'Population 65 years and over': np.random.randint(1000, 15000, size=len(future_years))
})
future_predictions = rf_model.predict(future_data)

In [32]:
future_results_df = pd.DataFrame({
    'Year': future_years,
    'Predicted Overdose Incidents': future_predictions
})
print("\nFuture Predictions (2023-2025):")
print(future_results_df)


Future Predictions (2023-2025):
   Year  Predicted Overdose Incidents
0  2025                         70.46
1  2026                         80.12


Now, for specific communities

In [34]:
future_years = np.arange(2023, 2025)

In [35]:
specific_communities = [
    'San Diego County, California', 'Alpine', 'Borrego Springs', 'Camp Pendleton',
    'Fallbrook', 'Jamul', 'Laguna-Pine Valley', 'Mountain Empire',
    'Oceanside-Escondido', 'Palomar-Julian CCD', 'Pauma Valley CCD',
    'Ramona CCD', 'San Diego CCD', 'Valley Center CCD'
]

In [36]:
future_data = []
for year in future_years:
    for community in specific_communities:
        row = {
            'Community': community,
            'Year': year,
            'Proportion of families below the poverty level': np.random.uniform(0.05, 0.25),
            'Employment status': np.random.uniform(0.6, 0.95),
            'Proportion of population unhoused': np.random.uniform(0.01, 0.1),
            'Number of healthcare facilities': np.random.randint(1, 20),
            'Urban or rural': np.random.choice([1, 0]),  # 1 for Urban, 0 for Rural
            'Opioid prescription rates': np.random.uniform(0.05, 0.3),
            'Drug-related arrests': np.random.randint(10, 300),
            'Total population': np.random.randint(10000, 100000),
            'Total population Male': np.random.randint(5000, 50000),
            'Total population Female': np.random.randint(5000, 50000),
            'Population 0 to 14 years': np.random.randint(1000, 20000),
            'Population 15 to 19 years': np.random.randint(500, 10000),
            'Population 20 to 24 years': np.random.randint(500, 10000),
            'Population 25 to 34 years': np.random.randint(2000, 15000),
            'Population 35 to 44 years': np.random.randint(2000, 15000),
            'Population 45 to 54 years': np.random.randint(2000, 15000),
            'Population 55 to 64 years': np.random.randint(2000, 15000),
            'Population 65 years and over': np.random.randint(1000, 15000)
        }
        future_data.append(row)

In [37]:
future_data_df = pd.DataFrame(future_data)

In [38]:
future_X = future_data_df.drop(columns=['Community', 'Year'])

In [39]:
future_predictions = rf_model.predict(future_X)

In [40]:
future_data_df['Predicted Overdose Incidents'] = future_predictions

In [41]:
print("\nFuture Predictions for Specific Communities (2023-2025):")
print(future_data_df[['Community', 'Year', 'Predicted Overdose Incidents']])


Future Predictions for Specific Communities (2023-2025):
                       Community  Year  Predicted Overdose Incidents
0   San Diego County, California  2023                         59.29
1                         Alpine  2023                         72.42
2                Borrego Springs  2023                         51.47
3                 Camp Pendleton  2023                         67.55
4                      Fallbrook  2023                         65.22
5                          Jamul  2023                         69.42
6             Laguna-Pine Valley  2023                         85.10
7                Mountain Empire  2023                         73.86
8            Oceanside-Escondido  2023                         84.00
9             Palomar-Julian CCD  2023                         72.97
10              Pauma Valley CCD  2023                         72.95
11                    Ramona CCD  2023                         55.98
12                 San Diego CCD  2023       

In [47]:
predictions_2023 = future_data_df[future_data_df['Year'] == 2023]

total_predicted_overdoses_2023 = predictions_2023['Predicted Overdose Incidents'].sum()

print(f"Total predicted overdose incidents for 2023: {total_predicted_overdoses_2023}")

Total predicted overdose incidents for 2023: 1000.1
