In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the training data
train_features = pd.read_csv('dengue_features_train.csv')
train_labels = pd.read_csv('dengue_labels_train.csv')
print(train_labels.head())

  city  year  weekofyear  total_cases
0   sj  1990          18            4
1   sj  1990          19            5
2   sj  1990          20            4
3   sj  1990          21            3
4   sj  1990          22            6


In [None]:
# Merge the features and labels into one DataFrame
train_data = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])


In [None]:
# Preprocess the data
def preprocess_data(data):
    # Drop unnecessary columns
    data = data.drop(['week_start_date', 'city', 'year'], axis=1)

    # Fill missing values with the mean of the column
    data = data.fillna(data.mean())

    return data

In [None]:
# Apply preprocessing to the training data
train_data = preprocess_data(train_data)

In [None]:
# Split the data into features and labels
X = train_data.drop('total_cases', axis=1)
y = train_data['total_cases']

In [None]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Initialize and train a Random Forest regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Make predictions on the validation set
val_preds = rf.predict(X_val)


In [None]:
# Calculate the Mean Absolute Error on the validation set
mae = mean_absolute_error(y_val, val_preds)
print(f'Mean Absolute Error on Validation Set: {mae}')

Mean Absolute Error on Validation Set: 19.495431058382255


In [None]:
# Load the test data
test_data = pd.read_csv('dengue_features_test.csv')

# Apply preprocessing to the test data
test_data = preprocess_data(test_data)

In [None]:
# Make predictions on the test data
test_preds = rf.predict(test_data)

In [None]:
# Create a submission DataFrame
submission = pd.read_csv('submission_format.csv')
submission['total_cases'] = np.round(test_preds).astype(int)

In [None]:
# Save the submission DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)

In [None]:
submission.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,6
1,sj,2008,19,6
2,sj,2008,20,16
3,sj,2008,21,21
4,sj,2008,22,14


###**Deep Learning with 4 Hidden Layers**

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

In [28]:
# Merge the features and labels into one DataFrame
train_data = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])


TypeError: ignored

In [11]:
# Preprocess the data
def preprocess_data(data):
    # Drop unnecessary columns

    data = data.drop(['week_start_date', 'year'], axis=1)

    # Fill missing values using SimpleImputer (median strategy)
    # Encode the 'city' column using One-Hot Encoding
    one_hot_encoder = OneHotEncoder(sparse=False)
    city_encoded = one_hot_encoder.fit_transform(data[['city']])
    city_encoded_df = pd.DataFrame(city_encoded, columns=['city_sj', 'city_iq'])
    data = pd.concat([data, city_encoded_df], axis=1)
    data = data.drop('city', axis=1)
        # Fill missing values using SimpleImputer (median strategy)
    imputer = SimpleImputer(strategy='median')
    data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

    return data

In [12]:
# Apply preprocessing to the training data
train_data = preprocess_data(train_data)

# Split the data into features and labels
X = train_data.drop('total_cases', axis=1)
y = train_data['total_cases']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, random_state=42)




In [13]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


In [29]:
# Build the deep learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler

# Define a learning rate schedule
def lr_schedule(epoch):
    """
    Learning rate schedule that drops the learning rate by 10% every 10 epochs.
    """
    initial_lr = 0.001
    drop_every = 10
    lr = initial_lr * np.power(0.9, epoch // drop_every)
    return lr

"""model = keras.Sequential([
    layers.Input(shape=(X_train.shape[1],)),  # Input layer

    # Four hidden layers with dropout for regularization
    layers.Dense(128, activation=LeakyReLU(alpha=0.01)),
    layers.Dropout(0.3),
    layers.Dense(64, activation=LeakyReLU(alpha=0.01)),
    layers.Dropout(0.2),
    layers.Dense(32, activation=LeakyReLU(alpha=0.01)),
    layers.Dropout(0.1),
    layers.Dense(16, activation=LeakyReLU(alpha=0.01)),

    layers.Dense(1, activation='linear')  # Output layer with linear activation for regression
])"""

# Initialize a Sequential model
model = Sequential()

# Add input layer
model.add(Dense(units=64, activation=LeakyReLU(alpha=0.01), input_dim=X_train.shape[1]))
model.add(Dropout(0.2))
# Add 5 hidden layers with L1 and L2 regularization
for _ in range(5):
    model.add(Dense(units=64, activation=LeakyReLU(alpha=0.01), kernel_regularizer=l1_l2(0.01, 0.01)))
    model.add(Dropout(0.1))
# Add output layer
model.add(Dense(units=1, activation='linear'))
#model.add(RandomForestRegressor(n_estimators=100, random_state=42))
# Define the optimizer with an initial learning rate
opt = Adam(learning_rate=0.0)  # The actual learning rate will be set by the scheduler


In [30]:
# Compile the model
model.compile(optimizer=opt, loss='mean_absolute_error')

# Define the learning rate scheduler
lr_scheduler = LearningRateScheduler(lr_schedule)

In [31]:
# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=200, batch_size=16, verbose=2, callbacks=[lr_scheduler])


Epoch 1/200
64/64 - 2s - loss: 40.0010 - val_loss: 36.0741 - lr: 0.0010 - 2s/epoch - 33ms/step
Epoch 2/200
64/64 - 0s - loss: 29.0104 - val_loss: 28.8731 - lr: 0.0010 - 236ms/epoch - 4ms/step
Epoch 3/200
64/64 - 0s - loss: 23.7997 - val_loss: 25.1475 - lr: 0.0010 - 194ms/epoch - 3ms/step
Epoch 4/200
64/64 - 0s - loss: 21.3408 - val_loss: 23.5789 - lr: 0.0010 - 224ms/epoch - 3ms/step
Epoch 5/200
64/64 - 0s - loss: 20.3308 - val_loss: 23.1772 - lr: 0.0010 - 229ms/epoch - 4ms/step
Epoch 6/200
64/64 - 0s - loss: 19.6747 - val_loss: 22.4464 - lr: 0.0010 - 227ms/epoch - 4ms/step
Epoch 7/200
64/64 - 0s - loss: 19.2257 - val_loss: 22.0303 - lr: 0.0010 - 224ms/epoch - 3ms/step
Epoch 8/200
64/64 - 0s - loss: 19.1695 - val_loss: 21.6976 - lr: 0.0010 - 189ms/epoch - 3ms/step
Epoch 9/200
64/64 - 0s - loss: 18.5332 - val_loss: 21.4237 - lr: 0.0010 - 205ms/epoch - 3ms/step
Epoch 10/200
64/64 - 0s - loss: 18.0362 - val_loss: 21.1964 - lr: 0.0010 - 235ms/epoch - 4ms/step
Epoch 11/200
64/64 - 0s - loss:

<keras.src.callbacks.History at 0x78987c1d7880>

In [32]:
# Extract features from the trained deep neural network
train_features = model.predict(X_train)
val_features = model.predict(X_val)



In [33]:
# Train a Random Forest Regressor using the extracted features
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(train_features, y_train)

# Make predictions on the validation set
val_preds = rf_model.predict(val_features)

In [34]:
# Calculate the Mean Absolute Error on the validation set
mae = mean_absolute_error(y_val, val_preds)
print(f'Mean Absolute Error on Validation Set: {mae}')

Mean Absolute Error on Validation Set: 21.78792511169227


In [35]:
# Load the test data
test_data = pd.read_csv('dengue_features_test.csv')

# Apply preprocessing to the test data
test_data = preprocess_data(test_data)

# Feature scaling for test data
test_data_scaled = scaler.transform(test_data)

# Extract features from the trained deep neural network for the test data
test_features = model.predict(test_data_scaled)

# Make predictions on the test data using the Random Forest Regressor
test_preds = rf_model.predict(test_features)

# Create a submission DataFrame
submission = pd.read_csv('submission_format.csv')
submission['total_cases'] = np.round(test_preds).astype(int)

# Save the submission DataFrame to a CSV file
submission.to_csv('submissiondlRFr1.csv', index=False)

print(submission.head())

  city  year  weekofyear  total_cases
0   sj  2008          18            4
1   sj  2008          19           27
2   sj  2008          20            2
3   sj  2008          21           18
4   sj  2008          22           20




In [36]:
print(submission.head(40))

   city  year  weekofyear  total_cases
0    sj  2008          18            4
1    sj  2008          19           27
2    sj  2008          20            2
3    sj  2008          21           18
4    sj  2008          22           20
5    sj  2008          23            6
6    sj  2008          24            4
7    sj  2008          25           25
8    sj  2008          26           20
9    sj  2008          27           10
10   sj  2008          28           24
11   sj  2008          29           74
12   sj  2008          30           13
13   sj  2008          31           28
14   sj  2008          32           52
15   sj  2008          33           19
16   sj  2008          34           21
17   sj  2008          35           38
18   sj  2008          36           45
19   sj  2008          37           19
20   sj  2008          38           40
21   sj  2008          39           15
22   sj  2008          40           38
23   sj  2008          41           24
24   sj  2008          42

In [None]:
#For Deep L model alone

# Evaluate the model on the validation set
val_preds = model.predict(X_val)
mae = mean_absolute_error(y_val, val_preds)
print(f'Mean Absolute Error on Validation Set: {mae}')

Mean Absolute Error on Validation Set: 18.056874186823514


In [None]:
# Load the test data
test_data = pd.read_csv('dengue_features_test.csv')

# Apply preprocessing to the test data
test_data = preprocess_data(test_data)

# Standardize test features
test_data = scaler.transform(test_data)

In [None]:
# Make predictions on the test data
test_preds = model.predict(test_data)



In [None]:
# Create a submission DataFrame
submission = pd.read_csv('submission_format.csv')
submission['total_cases'] = np.round(test_preds).astype(int)

In [None]:
# Save the submission DataFrame to a CSV file
submission.to_csv('submission_dl5.csv', index=False)

In [None]:
submission.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,6
1,sj,2008,19,5
2,sj,2008,20,9
3,sj,2008,21,13
4,sj,2008,22,12


###**Dynamically altering hyperparameters**

In [None]:
# Merge the features and labels into one DataFrame
train_data = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])

# Preprocess the data
def preprocess_data(data):
    # Drop unnecessary columns
    data = data.drop(['week_start_date', 'city', 'year'], axis=1)

    # Fill missing values with the mean of the column
    data = data.fillna(data.mean())

    return data

# Apply preprocessing to the training data
train_data = preprocess_data(train_data)

# Split the data into features and labels
X = train_data.drop('total_cases', axis=1)
y = train_data['total_cases']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


In [None]:
# Define a function to create and compile the model with dynamic hyperparameters
def create_model(trial):
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dropout(rate=trial.suggest_uniform('dropout_input', 0.0, 0.5)))

    for i in range(5):
        model.add(Dense(units=64, activation='relu', kernel_regularizer=l1_l2(0.01, 0.01)))
        model.add(Dropout(rate=trial.suggest_uniform(f'dropout_hidden_{i}', 0.0, 0.5)))

    model.add(Dense(units=1, activation='linear'))

    lr = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
    optimizer = Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='mean_absolute_error')

    return model


In [None]:
!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-3.3.0-py3-none-any.whl (404 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.2/404.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.0-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.10.0 (from optuna)
  Downloading cmaes-0.10.0-py3-none-any.whl (29 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.12.0 cmaes-0.10.0 colorlog-6.7.0 optuna-3.3.0


In [None]:
# Define an Optuna objective function to optimize hyperparameters
def objective(trial):
    model = create_model(trial)
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=200, batch_size=16, verbose=2)
    val_loss = history.history['val_loss'][-1]
    return val_loss

In [99]:
# Create an Optuna study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)  # You can adjust the number of trials

# Get the best hyperparameters
best_params = study.best_params

[I 2023-09-22 16:41:30,253] A new study created in memory with name: no-name-c60f085b-270d-408b-bffe-30054cb04f1e
  model.add(Dropout(rate=trial.suggest_uniform('dropout_input', 0.0, 0.5)))
  model.add(Dropout(rate=trial.suggest_uniform(f'dropout_hidden_{i}', 0.0, 0.5)))
  lr = trial.suggest_loguniform('learning_rate', 1e-5, 1e-1)
[I 2023-09-22 16:42:55,437] Trial 0 finished with value: 28.26535987854004 and parameters: {'dropout_input': 0.30790662919035255, 'dropout_hidden_0': 0.06612442925401324, 'dropout_hidden_1': 0.09568250643814191, 'dropout_hidden_2': 0.47950053343615107, 'dropout_hidden_3': 0.3149219140946801, 'dropout_hidden_4': 0.02772282105728352, 'learning_rate': 0.08710882363709695}. Best is trial 0 with value: 28.26535987854004.
[I 2023-09-22 16:44:19,253] Trial 1 finished with value: 23.060861587524414 and parameters: {'dropout_input': 0.1898128440346475, 'dropout_hidden_0': 0.47141650438447213, 'dropout_hidden_1': 0.09892313472010666, 'dropout_hidden_2': 0.2888451052442

KeyboardInterrupt: ignored

In [None]:
# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)

# Create and train the final model with the best hyperparameters
best_model = create_model(study.best_trial)
best_model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32, verbose=2)


###**XGBoost**

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SelectKBest, f_regression

In [12]:
# Load the training data
train_features = pd.read_csv('dengue_features_train.csv')
train_labels = pd.read_csv('dengue_labels_train.csv')

# Merge the features and labels into one DataFrame
train_data = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])


In [13]:
# Preprocess the data
def preprocess_data(data):
    # Drop unnecessary columns
    data = data.drop(['week_start_date', 'year'], axis=1)

    # Fill missing values with the mean of the column
    data = data.fillna(data.mean())

    # Encode the 'city' column using Label Encoding
    #label_encoder = LabelEncoder()
    #data['city'] = label_encoder.fit_transform(data['city'])

    # Encode the 'city' column using One-Hot Encoding
    one_hot_encoder = OneHotEncoder(sparse=False)
    city_encoded = one_hot_encoder.fit_transform(data[['city']])
    city_encoded_df = pd.DataFrame(city_encoded, columns=['city_sj', 'city_iq'])
    data = pd.concat([data, city_encoded_df], axis=1)
    data = data.drop('city', axis=1)

    return data

In [14]:
# Apply preprocessing to the training data
train_data = preprocess_data(train_data)

# Split the data into features and labels
X = train_data.drop('total_cases', axis=1)
y = train_data['total_cases']


  data = data.fillna(data.mean())


In [15]:
# Feature selection using SelectKBest and f_regression
k_best = SelectKBest(score_func=f_regression, k='all')
X_selected = k_best.fit_transform(X, y)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.31, random_state=42)


In [16]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [17]:
# Initialize and train an XGBoost Regressor
xgb = XGBRegressor(n_estimators=115, random_state=42)
xgb.fit(X_train, y_train)

# Make predictions on the validation set
val_preds = xgb.predict(X_val)

In [18]:
# Calculate the Mean Absolute Error on the validation set
mae = mean_absolute_error(y_val, val_preds)
print(f'Mean Absolute Error on Validation Set: {mae}')

# Load the test data
test_data = pd.read_csv('dengue_features_test.csv')


Mean Absolute Error on Validation Set: 17.91332539919335


In [19]:
# Apply preprocessing to the test data
test_data = preprocess_data(test_data)

# Feature selection for test data
test_selected = k_best.transform(test_data)

# Feature scaling for test data
test_data_scaled = scaler.transform(test_selected)

# Make predictions on the test data
test_preds = xgb.predict(test_data_scaled)


  data = data.fillna(data.mean())


In [20]:
# Create a submission DataFrame
submission = pd.read_csv('submission_format.csv')
submission['total_cases'] = np.round(test_preds).astype(int)

# Save the submission DataFrame to a CSV file
submission.to_csv('submission_xgb2_feat_eng.csv', index=False)

In [21]:
submission.head()

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,4
1,sj,2008,19,4
2,sj,2008,20,8
3,sj,2008,21,14
4,sj,2008,22,11


###**Gradient Boosting Regressor**

In [41]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Load the training data
train_features = pd.read_csv('dengue_features_train.csv')
train_labels = pd.read_csv('dengue_labels_train.csv')

# Merge the features and labels into one DataFrame
train_data = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])

# Preprocess the data
def preprocess_data(data):
    # Drop unnecessary columns
    data = data.drop(['week_start_date'], axis=1)


    # Encode the 'city' column using One-Hot Encoding
    one_hot_encoder = OneHotEncoder(sparse=False)
    city_encoded = one_hot_encoder.fit_transform(data[['city']])
    city_encoded_df = pd.DataFrame(city_encoded, columns=['city_sj', 'city_iq'])
    data = pd.concat([data, city_encoded_df], axis=1)
    data = data.drop('city', axis=1)
        # Fill missing values using SimpleImputer (median strategy)
    imputer = SimpleImputer(strategy='median')
    data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
    return data

In [42]:
# Apply preprocessing to the training data
train_data = preprocess_data(train_data)

# Split the data into features and labels
X = train_data.drop('total_cases', axis=1)
y = train_data['total_cases']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)




In [43]:
# Initialize a Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model
gbr.fit(X_train, y_train)

# Make predictions on the validation set
val_preds = gbr.predict(X_val)

# Calculate the Mean Absolute Error on the validation set
mae = mean_absolute_error(y_val, val_preds)
print(f'Mean Absolute Error on Validation Set: {mae}')

Mean Absolute Error on Validation Set: 13.400370921256629


In [44]:
# Load the test data
test_data = pd.read_csv('dengue_features_test.csv')

# Apply preprocessing to the test data
test_data = preprocess_data(test_data)

# Feature scaling for test data
test_data_scaled = scaler.transform(test_data)

# Make predictions on the test data
test_preds = gbr.predict(test_data_scaled)



In [45]:
# Create a submission DataFrame
submission = pd.read_csv('submission_format.csv')
submission['total_cases'] = np.round(test_preds).astype(int)

# Save the submission DataFrame to a CSV file
submission.to_csv('submissiongbr.csv', index=False)

print(submission.head())

  city  year  weekofyear  total_cases
0   sj  2008          18            8
1   sj  2008          19            4
2   sj  2008          20            9
3   sj  2008          21            7
4   sj  2008          22           12


###**Ridge Regression**

In [4]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Load the training data
train_features = pd.read_csv('dengue_features_train.csv')
train_labels = pd.read_csv('dengue_labels_train.csv')

# Merge the features and labels into one DataFrame
train_data = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])

# Preprocess the data
def preprocess_data(data):
    # Drop unnecessary columns
    #data = data.drop(['week_start_date'], axis=1)
    data = data.drop(['week_start_date', 'year'], axis=1)

    # Fill missing values using SimpleImputer (median strategy)
    # Encode the 'city' column using One-Hot Encoding
    one_hot_encoder = OneHotEncoder(sparse=False)
    city_encoded = one_hot_encoder.fit_transform(data[['city']])
    city_encoded_df = pd.DataFrame(city_encoded, columns=['city_sj', 'city_iq'])
    data = pd.concat([data, city_encoded_df], axis=1)
    data = data.drop('city', axis=1)
        # Fill missing values using SimpleImputer (median strategy)
    imputer = SimpleImputer(strategy='median')
    data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

    return data

# Apply preprocessing to the training data
train_data = preprocess_data(train_data)



In [5]:
# Split the data into features and labels
X = train_data.drop('total_cases', axis=1)
y = train_data['total_cases']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.28, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [6]:
# Initialize a Ridge regression model with alpha (regularization strength) hyperparameter
ridge = Ridge(alpha=1.0)

# Train the model
ridge.fit(X_train, y_train)

# Make predictions on the validation set
val_preds = ridge.predict(X_val)

# Calculate the Mean Absolute Error on the validation set
mae = mean_absolute_error(y_val, val_preds)
print(f'Mean Absolute Error on Validation Set: {mae}')

Mean Absolute Error on Validation Set: 22.82958940992133


In [7]:
# Load the test data
test_data = pd.read_csv('dengue_features_test.csv')

# Apply preprocessing to the test data
test_data = preprocess_data(test_data)

# Feature scaling for test data
test_data_scaled = scaler.transform(test_data)

# Make predictions on the test data
test_preds = ridge.predict(test_data_scaled)




In [8]:
# Create a submission DataFrame
submission = pd.read_csv('submission_format.csv')
submission['total_cases'] = np.round(test_preds).astype(int)

# Save the submission DataFrame to a CSV file
submission.to_csv('submissionridgeR.csv', index=False)
print(submission.head())

  city  year  weekofyear  total_cases
0   sj  2008          18           32
1   sj  2008          19           22
2   sj  2008          20           29
3   sj  2008          21           31
4   sj  2008          22           35


###**RF Regressor Testing**

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Build the deep learning model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler

In [2]:
#import random

# Load the training data
train_features = pd.read_csv('dengue_features_train.csv')
train_labels = pd.read_csv('dengue_labels_train.csv')
print(train_labels.head())

# Merge the features and labels into one DataFrame
train_data = pd.merge(train_features, train_labels, on=['city', 'year', 'weekofyear'])


# Preprocess the data
def preprocess_data(data):

    # Shuffle the data rows
    #data = data.sample(frac=1).reset_index(drop=True)

    # Drop unnecessary columns
    data = data.drop(['week_start_date', 'city', 'year'], axis=1)

    # Fill missing values with the mean of the column
    #data = data.fillna(data.mean())

    # Fill missing values using SimpleImputer (median strategy)
    imputer = SimpleImputer(strategy='median')
    data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
    # Shuffle the dataset

    return data

  # Apply preprocessing to the training data
train_data = preprocess_data(train_data)

  city  year  weekofyear  total_cases
0   sj  1990          18            4
1   sj  1990          19            5
2   sj  1990          20            4
3   sj  1990          21            3
4   sj  1990          22            6


In [5]:
from sklearn.preprocessing import StandardScaler

# Split the data into features and labels
X = train_data.drop('total_cases', axis=1)
y = train_data['total_cases']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Initialize a Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
#rf_model = RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, random_state=42)


# Train the Random Forest Regressor
rf_model.fit(X_train, y_train)

# Extract features using Random Forest Regressor
train_features_rf = rf_model.predict(X_train).reshape(-1, 1)  # Reshape to maintain 2D shape for concatenation
val_features_rf = rf_model.predict(X_val).reshape(-1, 1)

In [7]:
#delete this alone
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler



# Define a learning rate schedule
def lr_schedule(epoch):
    """
    Learning rate schedule that drops the learning rate by 10% every 10 epochs.
    """
    initial_lr = 0.00105
    drop_every = 10
    lr = initial_lr * np.power(0.9, epoch // drop_every)
    return lr

# Initialize a Sequential model
model = tf.keras.models.Sequential()

# Add input layer for features extracted by Random Forest
model.add(tf.keras.layers.Input(shape=(train_features_rf.shape[1],)))
model.add(Dropout(0.12))

# Add hidden layers with ReLU activation
for _ in range(5):
    model.add(tf.keras.layers.Dense(units=64, activation=LeakyReLU(alpha=0.01), kernel_regularizer=tf.keras.regularizers.l1_l2(0.01, 0.01)))
    model.add(Dropout(0.1))
####
model.add(tf.keras.layers.Dense(units=32, activation='relu', kernel_regularizer=tf.keras.regularizers.l1_l2(0.01, 0.01)))
####
model.add(tf.keras.layers.BatchNormalization())

#### Output Layer
model.add(tf.keras.layers.Dense(units=1, activation='linear'))

opt = Adam(learning_rate=0.0)
# Compile the model
# Compile the model
model.compile(optimizer=opt, loss='mean_absolute_error')

# Define the learning rate scheduler
lr_scheduler = LearningRateScheduler(lr_schedule)

###model.compile(optimizer='adam', loss='mean_absolute_error')

# Train the model
model.fit(train_features_rf, y_train, validation_data=(val_features_rf, y_val), epochs=200, batch_size=16, verbose=2, callbacks=[lr_scheduler])

# Train the model using Random Forest features
###model.fit(train_features_rf, y_train, validation_data=(val_features_rf, y_val), epochs=200, batch_size=16, verbose=2)

# Extract features from validation data using Random Forest
val_features_rf = rf_model.predict(X_val).reshape(-1, 1)

# Make predictions on the validation set using the neural network
val_preds = model.predict(val_features_rf)

# Calculate the Mean Absolute Error on the validation set
#mae = mean_absolute_error(y_val, val_preds)
#print(f'Mean Absolute Error on Validation Set: {mae}')

# Load the test data
test_data = pd.read_csv('dengue_features_test.csv')

# Apply preprocessing to the test data
test_data = preprocess_data(test_data)
test_data_scaled = scaler.transform(test_data)

# Extract features from test data using Random Forest
test_features_rf = rf_model.predict(test_data_scaled).reshape(-1, 1)

# Make predictions on the test data using the neural network
test_preds = model.predict(test_features_rf)

# Create a submission DataFrame
submission = pd.read_csv('submission_format.csv')
submission['total_cases'] = np.round(test_preds).astype(int)

# Save the submission DataFrame to a CSV file
submission.to_csv('submissionrftodl6bn.csv', index=False)

print(submission.head())

"""print(train_features[0])
# Initialize a Sequential model
model = Sequential()

# Add input layer
model.add(Dense(units=64, activation=LeakyReLU(alpha=0.01), input_dim=train_features.shape[1]))
model.add(Dropout(0.2))
# Add 5 hidden layers with L1 and L2 regularization
for _ in range(5):
    model.add(Dense(units=64, activation=LeakyReLU(alpha=0.01), kernel_regularizer=l1_l2(0.01, 0.01)))
    model.add(Dropout(0.1))
# Add output layer
model.add(Dense(units=1, activation='linear'))
#model.add(RandomForestRegressor(n_estimators=100, random_state=42))
# Define the optimizer with an initial learning rate
opt = Adam(learning_rate=0.0)  # The actual learning rate will be set by the scheduler

# Compile the model
model.compile(optimizer=opt, loss='mean_absolute_error')

# Define the learning rate scheduler
lr_scheduler = LearningRateScheduler(lr_schedule)

# Train the model
model.fit(train_features, y_train, validation_data=(val_features, y_val), epochs=200, batch_size=16, verbose=2, callbacks=[lr_scheduler])

# Make predictions on the validation set
val_preds = model.predict(val_features)

# Calculate the Mean Absolute Error on the validation set
mae = mean_absolute_error(y_val, val_preds)
print(f'Mean Absolute Error on Validation Set: {mae}')


# Make predictions on the test features
test_preds = model.predict(test_features)

# Create a submission DataFrame
submission = pd.read_csv('submission_format.csv')
submission['total_cases'] = np.round(test_preds).astype(int)

# Save the submission DataFrame to a CSV file
submission.to_csv('submissionrfrtodl.csv', index=False)

print(submission.head())"""

Epoch 1/200
73/73 - 2s - loss: 45.1714 - val_loss: 50.6773 - lr: 0.0010 - 2s/epoch - 31ms/step
Epoch 2/200
73/73 - 0s - loss: 40.5679 - val_loss: 49.2776 - lr: 0.0010 - 250ms/epoch - 3ms/step
Epoch 3/200
73/73 - 0s - loss: 35.3794 - val_loss: 48.1889 - lr: 0.0010 - 206ms/epoch - 3ms/step
Epoch 4/200
73/73 - 0s - loss: 34.0130 - val_loss: 47.4797 - lr: 0.0010 - 211ms/epoch - 3ms/step
Epoch 5/200
73/73 - 0s - loss: 33.0798 - val_loss: 46.8376 - lr: 0.0010 - 241ms/epoch - 3ms/step
Epoch 6/200
73/73 - 0s - loss: 33.0323 - val_loss: 46.2968 - lr: 0.0010 - 249ms/epoch - 3ms/step
Epoch 7/200
73/73 - 0s - loss: 31.6063 - val_loss: 45.7320 - lr: 0.0010 - 280ms/epoch - 4ms/step
Epoch 8/200
73/73 - 0s - loss: 31.7758 - val_loss: 45.2032 - lr: 0.0010 - 254ms/epoch - 3ms/step
Epoch 9/200
73/73 - 0s - loss: 30.9615 - val_loss: 44.8661 - lr: 0.0010 - 232ms/epoch - 3ms/step
Epoch 10/200
73/73 - 0s - loss: 30.4319 - val_loss: 44.5106 - lr: 0.0010 - 256ms/epoch - 4ms/step
Epoch 11/200
73/73 - 0s - loss:

"print(train_features[0])\n# Initialize a Sequential model\nmodel = Sequential()\n\n# Add input layer\nmodel.add(Dense(units=64, activation=LeakyReLU(alpha=0.01), input_dim=train_features.shape[1]))\nmodel.add(Dropout(0.2))\n# Add 5 hidden layers with L1 and L2 regularization\nfor _ in range(5):\n    model.add(Dense(units=64, activation=LeakyReLU(alpha=0.01), kernel_regularizer=l1_l2(0.01, 0.01)))\n    model.add(Dropout(0.1))\n# Add output layer\nmodel.add(Dense(units=1, activation='linear'))\n#model.add(RandomForestRegressor(n_estimators=100, random_state=42))\n# Define the optimizer with an initial learning rate\nopt = Adam(learning_rate=0.0)  # The actual learning rate will be set by the scheduler\n\n# Compile the model\nmodel.compile(optimizer=opt, loss='mean_absolute_error')\n\n# Define the learning rate scheduler\nlr_scheduler = LearningRateScheduler(lr_schedule)\n\n# Train the model\nmodel.fit(train_features, y_train, validation_data=(val_features, y_val), epochs=200, batch_size