In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

import pickle

The problem is a regression problem that's why I chose regression algorithms

In [14]:
#import the datasets
train_features= pd.read_csv('dengue_features_train.csv')
train_labels=pd.read_csv('dengue_labels_train.csv')
test_data = pd.read_csv('dengue_features_test.csv')

In [3]:
df1= train_features.copy()
df2= train_labels.copy()

In [15]:
df3=test_data.copy()

In [4]:
merged_df=pd.merge(df1, df2, on=['city', 'year', 'weekofyear'])

In [20]:
merged_df.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,...,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases,month
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,...,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4,4
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,...,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5,5
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,...,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4,5
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,...,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,3,5
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,...,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,6,5


use san juan and iquiots data seperately 

In [87]:
sj_data=merged_df[merged_df['city'] == 'sj']
iq_data=merged_df[merged_df['city'] == 'iq']

In [88]:
sj_data.shape

(936, 26)

In [90]:
df=sj_data.copy()

In [89]:
iq_data.shape

(520, 26)

In [26]:
def wrangle(data,target_column=None):
    # convert the date column to the correct format
    data['week_start_date'] = pd.to_datetime(data['week_start_date'])
    # add the month column
    data['month'] = data['week_start_date'].dt.month
    # fill the nulls
    data.fillna(method='ffill', inplace=True)
    # drop columns
    data = data.drop(columns=(["city","year","weekofyear","week_start_date"]), axis=1)
    
    return data

    

In [31]:
def preprocessing(data,target_column=None):
    # Separate features and target
    if target_column is not None:
        X = data.drop(target_column, axis=1)
        y = data[target_column]
        
        scaler=MinMaxScaler(feature_range=(0, 1)) 
        X_scaled=scaler.fit_transform(X)

        return X_scaled, y
    else:
        # No target column
        X = data
        scaler=MinMaxScaler(feature_range=(0, 1)) 
        X_scaled=scaler.fit_transform(X)

        return X_scaled

### San Juan Data

In [91]:
# San Juan Data
san_data=wrangle(df,'total_cases')

In [93]:
X_scaled_san, y_san=preprocessing(san_data,'total_cases')

In [96]:
X_scaled_san.shape

(936, 21)

Split the data set into Train and test sets\
time series data is split based on a cutoff. not randomized data

In [97]:
# I will use 20% on the test data
cut_off=int(len(X_scaled_san)*0.8)

In [118]:
X_train_san=X_scaled_san[0: cut_off]

In [119]:
X_test_san=X_scaled_san[cut_off:]

In [120]:
y_train_san= y_san[0: cut_off]

In [121]:
y_test_san= y_san[cut_off:]

In [124]:
### Baseline accuracy

In [125]:
#Linear regression
lr=LinearRegression()
lr.fit(X_train_san, y_train_san) # Train the model on the training data
 # Evaluate the model on the validation set
y_pred = lr.predict(X_test_san)

In [127]:
# calaculate the metrics
# R-squared
lr_r2 = r2_score(y_test_san, y_pred)

# Mean Squared Error
lr_mse = mean_squared_error(y_test_san, y_pred)

# Mean Absolute Error
lr_mae = mean_absolute_error(y_test_san, y_pred)

In [128]:

print("R-squared:", lr_r2)
print("Mean Squared Error:", lr_mse)
print("Mean Absolute Error:", lr_mae)

R-squared: -0.2597561012788663
Mean Squared Error: 1224.1913373957357
Mean Absolute Error: 26.89028649271969


In [130]:
df = pd.DataFrame(data={"Actual Values": y_test_san, "Predictions": y_pred})
pd.options.display.float_format = '{:.2f}'.format
df.head()

Unnamed: 0,Actual Values,Predictions
748,13,70.44
749,27,57.1
750,13,15.14
751,18,40.78
752,16,40.08


In [131]:

# KNeighbors regressor
knr=KNeighborsRegressor()
knr.fit(X_train_san, y_train_san) # Train the model on the training data
 # Evaluate the model on the validation set
y_pred_knr = knr.predict(X_test_san)

In [132]:
# R-squared
knr_r2 = r2_score(y_test_san, y_pred_knr)

# Mean Squared Error
knr_mse = mean_squared_error(y_test_san, y_pred_knr)

# Mean Absolute Error
knr_mae = mean_absolute_error(y_test_san, y_pred_knr)

print("R-squared:", knr_r2)
print("Mean Squared Error:", knr_mse)
print("Mean Absolute Error:", knr_mae)

R-squared: -0.4475229311480782
Mean Squared Error: 1406.6572340425535
Mean Absolute Error: 26.273404255319146


In [133]:
df = pd.DataFrame(data={"Actual Values": y_test_san, "Predictions": y_pred_knr})
pd.options.display.float_format = '{:.2f}'.format
df.head()

Unnamed: 0,Actual Values,Predictions
748,13,50.4
749,27,93.2
750,13,35.8
751,18,68.4
752,16,37.2


In [134]:
# random fores regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

rf.fit(X_train_san, y_train_san) # Train the model on the training data
 # Evaluate the model on the validation set
y_pred_rf = rf.predict(X_test_san)
     

In [135]:
# R-squared
rf_r2 = r2_score(y_test_san, y_pred_rf)

# Mean Squared Error
rf_mse = mean_squared_error(y_test_san, y_pred_rf)

# Mean Absolute Error
rf_mae = mean_absolute_error(y_test_san, y_pred_rf)

print("R-squared:", rf_r2)
print("Mean Squared Error:", rf_mse)
print("Mean Absolute Error:", rf_mae)

R-squared: -0.4793870339901194
Mean Squared Error: 1437.6217664893613
Mean Absolute Error: 26.9243085106383


In [136]:
df = pd.DataFrame(data={"Actual Values": y_test_san, "Predictions": y_pred_rf})
pd.options.display.float_format = '{:.2f}'.format
df.head()

Unnamed: 0,Actual Values,Predictions
748,13,51.25
749,27,54.98
750,13,39.58
751,18,46.25
752,16,51.8


In [137]:

# Let me try ridge regression
rr=Ridge()
rr.fit(X_train_san, y_train_san) # Train the model on the training data
 # Evaluate the model on the validation set
y_pred_rr = rr.predict(X_test_san)

In [138]:

# R-squared
rr_r2 = r2_score(y_test_san, y_pred_rr)

# Mean Squared Error
rr_mse = mean_squared_error(y_test_san, y_pred_rr)

# Mean Absolute Error
rr_mae = mean_absolute_error(y_test_san, y_pred_rr)

print("R-squared:", rr_r2)
print("Mean Squared Error:", rr_mse)
print("Mean Absolute Error:", rr_mae)

R-squared: -0.13692842846654507
Mean Squared Error: 1104.8312700805816
Mean Absolute Error: 25.446879043736086


In [139]:
df = pd.DataFrame(data={"Actual Values": y_test_san, "Predictions": y_pred_rr})
pd.options.display.float_format = '{:.2f}'.format
df.head()

Unnamed: 0,Actual Values,Predictions
748,13,64.75
749,27,54.91
750,13,24.11
751,18,41.53
752,16,40.21


In [140]:
# Check how the models are all performing in a visual manner

models = ['LinearRegression','Ridge','RandomForestRegressor','KNeighborsRegressor']
r2_values = [lr_r2,rr_r2, rf_r2,knr_r2]
mse_values = [lr_mse,rr_mse, rf_mse,knr_mse]
mae_values = [lr_mae, rr_mae, rf_mae, knr_mae]

# Print or visualize the comparison
for model, r2, mse, mae in zip(models, r2_values, mse_values, mae_values):
    print(f'Model: {model}')
    print(f'R-squared: {r2}')
    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print('\n')

Model: LinearRegression
R-squared: -0.2597561012788663
Mean Squared Error: 1224.1913373957357
Mean Absolute Error: 26.89028649271969


Model: Ridge
R-squared: -0.13692842846654507
Mean Squared Error: 1104.8312700805816
Mean Absolute Error: 25.446879043736086


Model: RandomForestRegressor
R-squared: -0.4793870339901194
Mean Squared Error: 1437.6217664893613
Mean Absolute Error: 26.9243085106383


Model: KNeighborsRegressor
R-squared: -0.4475229311480782
Mean Squared Error: 1406.6572340425535
Mean Absolute Error: 26.273404255319146




## Iquitos Data

In [143]:
df_2=iq_data.copy()

In [145]:
df2_data=wrangle(df_2,'total_cases')

In [146]:
X_scaled_iq, y_iq=preprocessing(df2_data,'total_cases')

In [151]:
# I will use 20% on the test data
cut_off_2=int(len(X_scaled_iq)*0.8)

In [153]:
X_train_iq=X_scaled_iq[0: cut_off_2]

In [156]:
X_test_iq=X_scaled_iq[cut_off_2:]

In [158]:
y_train_iq= y_iq[0: cut_off_2]

In [159]:
y_test_iq= y_iq[cut_off_2:]

In [162]:
#Linear regression
lr_2=LinearRegression()
lr_2.fit(X_train_iq, y_train_iq) # Train the model on the training data
 # Evaluate the model on the validation set
y_pred_iq = lr_2.predict(X_test_iq)

In [163]:
# calaculate the metrics
# R-squared
lr_r2_2 = r2_score(y_test_iq, y_pred_iq)

# Mean Squared Error
lr_mse_2 = mean_squared_error(y_test_iq, y_pred_iq)

# Mean Absolute Error
lr_mae_2 = mean_absolute_error(y_test_iq, y_pred_iq)

print("R-squared:", lr_r2_2)
print("Mean Squared Error:", lr_mse_2)
print("Mean Absolute Error:", lr_mae_2)


R-squared: -0.07975856353471689
Mean Squared Error: 140.8042702834837
Mean Absolute Error: 7.344065367909114


In [164]:
df = pd.DataFrame(data={"Actual Values": y_test_iq, "Predictions": y_pred_iq})
pd.options.display.float_format = '{:.2f}'.format
df.head()

Unnamed: 0,Actual Values,Predictions
1352,0,5.21
1353,3,1.76
1354,3,4.52
1355,1,6.21
1356,5,6.53


In [165]:

# KNeighbors regressor
knr_2=KNeighborsRegressor()
knr_2.fit(X_train_iq, y_train_iq) # Train the model on the training data
 # Evaluate the model on the validation set
y_pred_knr_iq = knr_2.predict(X_test_iq)

In [166]:
# R-squared
knr_r2_2 = r2_score(y_test_iq, y_pred_knr_iq)

# Mean Squared Error
knr_mse_2 = mean_squared_error(y_test_iq, y_pred_knr_iq)

# Mean Absolute Error
knr_mae_2 = mean_absolute_error(y_test_iq, y_pred_knr_iq)

print("R-squared:", knr_r2_2)
print("Mean Squared Error:", knr_mse_2)
print("Mean Absolute Error:", knr_mae_2)

R-squared: -0.14793423914738946
Mean Squared Error: 149.6946153846154
Mean Absolute Error: 7.38076923076923


In [167]:
df = pd.DataFrame(data={"Actual Values": y_test_iq, "Predictions": y_pred_knr_iq})
pd.options.display.float_format = '{:.2f}'.format
df.head()

Unnamed: 0,Actual Values,Predictions
1352,0,2.4
1353,3,5.2
1354,3,4.6
1355,1,3.8
1356,5,3.0


In [168]:
# random fores regressor
rf_2 = RandomForestRegressor(n_estimators=100, random_state=42)

rf_2.fit(X_train_iq, y_train_iq) # Train the model on the training data
 # Evaluate the model on the validation set
y_pred_rf_iq = rf_2.predict(X_test_iq)

In [169]:
# R-squared
rf_r2_2 = r2_score(y_test_iq, y_pred_rf_iq)

# Mean Squared Error
rf_mse_2 = mean_squared_error(y_test_iq, y_pred_rf_iq)

# Mean Absolute Error
rf_mae_2 = mean_absolute_error(y_test_iq, y_pred_rf_iq)

print("R-squared:", rf_r2_2)
print("Mean Squared Error:", rf_mse_2)
print("Mean Absolute Error:", rf_mae_2)

R-squared: -0.13411560388076404
Mean Squared Error: 147.89261730769232
Mean Absolute Error: 7.517692307692308


In [172]:
df = pd.DataFrame(data={"Actual Values": y_test_iq, "Predictions": y_pred_rf_iq})
pd.options.display.float_format = '{:.2f}'.format
df.head()

Unnamed: 0,Actual Values,Predictions
1352,0,3.13
1353,3,3.55
1354,3,2.65
1355,1,3.41
1356,5,2.41


In [173]:

# Let me try ridge regression
rr_2=Ridge()
rr_2.fit(X_train_iq, y_train_iq) # Train the model on the training data
 # Evaluate the model on the validation set
y_pred_rr_iq = rr_2.predict(X_test_iq)

In [174]:

# R-squared
rr_r2_2 = r2_score(y_test_iq, y_pred_rr_iq)

# Mean Squared Error
rr_mse_2 = mean_squared_error(y_test_iq, y_pred_rr_iq)

# Mean Absolute Error
rr_mae_2 = mean_absolute_error(y_test_iq, y_pred_rr_iq)

print("R-squared:", rr_r2_2)
print("Mean Squared Error:", rr_mse_2)
print("Mean Absolute Error:", rr_mae_2)

R-squared: -0.05858965293156082
Mean Squared Error: 138.04377075068436
Mean Absolute Error: 7.2447106518776385


In [175]:
df = pd.DataFrame(data={"Actual Values": y_test_iq, "Predictions": y_pred_rr_iq})
pd.options.display.float_format = '{:.2f}'.format
df.head()

Unnamed: 0,Actual Values,Predictions
1352,0,6.54
1353,3,3.36
1354,3,6.1
1355,1,5.09
1356,5,6.57


In [176]:
# Check how the models are all performing in a visual manner

models = ['LinearRegression','Ridge','RandomForestRegressor','KNeighborsRegressor']
r2_values = [lr_r2_2,rr_r2_2, rf_r2_2,knr_r2_2]
mse_values = [lr_mse_2,rr_mse_2, rf_mse_2,knr_mse_2]
mae_values = [lr_mae_2, rr_mae_2, rf_mae_2, knr_mae_2]

# Print or visualize the comparison
for model, r2, mse, mae in zip(models, r2_values, mse_values, mae_values):
    print(f'Model: {model}')
    print(f'R-squared: {r2}')
    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print('\n')

Model: LinearRegression
R-squared: -0.07975856353471689
Mean Squared Error: 140.8042702834837
Mean Absolute Error: 7.344065367909114


Model: Ridge
R-squared: -0.05858965293156082
Mean Squared Error: 138.04377075068436
Mean Absolute Error: 7.2447106518776385


Model: RandomForestRegressor
R-squared: -0.13411560388076404
Mean Squared Error: 147.89261730769232
Mean Absolute Error: 7.517692307692308


Model: KNeighborsRegressor
R-squared: -0.14793423914738946
Mean Squared Error: 149.6946153846154
Mean Absolute Error: 7.38076923076923




**Observations**\
Ridge Regrerssion is the best performing model in the two towns


In [None]:
# let me try combining the two MAE- Ridge and see how it performs.

In [177]:
# MAEs from the two datasets
MAE1 = 25.446879043736086
MAE2 = 7.2447106518776385

# Number of observations in the two datasets
n1 = 936
n2 = 520

# Calculate the weighted average MAE
weighted_avg_MAE = ((n1 * MAE1) + (n2 * MAE2)) / (n1 + n2)

print(f"Weighted Average MAE: {weighted_avg_MAE}")


Weighted Average MAE: 18.946104618072354


After training the models, you can work introduce a test dataset and work use the models to predict the total_cases