In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Load the dataset
# Adjust file paths as necessary for your environment
train_data = pd.read_csv("/content/train_data_covid (1).csv")
test_data = pd.read_csv("/content/test_data_covid.csv")

# For Google Colab: Uncomment and use the following code to upload files manually
# from google.colab import files
# uploaded = files.upload()
# train_data = pd.read_csv("train_data_covid.csv")
# test_data = pd.read_csv("test_data_covid.csv")

# Convert 'Date' column to datetime format
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

# Extract features from the 'Date' column
train_data['Year'] = train_data['Date'].dt.year
train_data['Month'] = train_data['Date'].dt.month
train_data['Day'] = train_data['Date'].dt.day

test_data['Year'] = test_data['Date'].dt.year
test_data['Month'] = test_data['Date'].dt.month
test_data['Day'] = test_data['Date'].dt.day

# Drop the original 'Date' column
train_data.drop(columns=['Date'], inplace=True)
test_data.drop(columns=['Date'], inplace=True)

# Define categorical columns to be one-hot encoded
categorical_cols = ['State/UnionTerritory', 'Time', 'ConfirmedIndianNational', 'ConfirmedForeignNational']

# Define preprocessing pipeline for categorical features
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Handle unknown categories by ignoring them
])

# Combine preprocessing steps for all features using ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols)
])

# Split the dataset into features (X) and target variable (y)
X = train_data.drop(columns=["Deaths"])
y = train_data["Deaths"]

# Split the dataset into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the RandomForestRegressor model
model = RandomForestRegressor(random_state=42)

# Define a pipeline that first preprocesses the data and then applies the model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply the preprocessor to the data
    ('model', model)                 # Train the model
])

# Define hyperparameters for random search
param_dist = {
    'model__n_estimators': [50, 100, 200],  # Number of trees in the forest
    'model__max_depth': [None, 10, 20],     # Maximum depth of the tree
    'model__min_samples_split': [2, 5, 10], # Minimum number of samples required to split an internal node
    'model__min_samples_leaf': [1, 2, 4]    # Minimum number of samples required to be at a leaf node
}

# Perform RandomizedSearchCV to find the best hyperparameters
random_search = RandomizedSearchCV(pipeline, param_dist, n_iter=10, cv=5, scoring='neg_mean_absolute_error', random_state=42)
random_search.fit(X_train, y_train)

# Get the best model from the random search
best_model = random_search.best_estimator_

# Evaluate the model on the validation set
predictions = best_model.predict(X_valid)
mape = np.mean(np.abs((y_valid - predictions) / (y_valid + 1e-10))) * 100  # Calculate MAPE, adding a small epsilon to avoid division by zero
print("Mean Absolute Percentage Error (MAPE):", mape)

# Make predictions on the test dataset
test_predictions = best_model.predict(test_data)

# Prepare the submission file
submission = pd.DataFrame({"Sno": test_data["Sno"], "Deaths": test_predictions})
submission.to_csv("submission.csv", index=False)

# For Google Colab: Download the file to your local machine
# from google.colab import files
# files.download("submission.csv")


  test_data['Date'] = pd.to_datetime(test_data['Date'])


Mean Absolute Percentage Error (MAPE): 8763579892301.567
