In [None]:
import numpy as np  #For Linear Algebra
import csv  #For creation of the final CSV file
import pandas as pd  #Data Processing and I/O Operations done on the CSV File

# XGBoost's random forest regressor (XGBRFRegressor), used for making predictions
from xgboost import XGBRFRegressor


⚛: The reasons I chose XGBoost's Random Forest regressor as my preferred model are:


---






1.   **Ensemble Power** : Training a Random Forest with XGBoost combines the strengths of both: the variance reduction that comes from averaging many trees and XGBoost's efficient, regularized tree building. This can improve regression accuracy.


---



2.   **Robust to Overfitting** : Both approaches have built-in ways to combat overfitting: XGBoost offers regularization and row/column subsampling, while Random Forest averages many bagged trees.


---





3.   **Non-Linearity Handling** : Tree-based models suit a wide range of problems because they capture non-linear relationships effectively. They also handle many kinds of data, and XGBoost can train even when some values are missing.



---


4.   **Hyperparameter Tuning** : Both approaches expose hyperparameters that can be tuned to optimize the model further and tailor it to the task at hand.



---







5.   **Parallelization** : Both approaches support parallel training, which speeds up model fitting on multi-core CPUs and GPUs.








---

Overall, a Random Forest implemented with XGBoost can be very effective on a wide range of regression problems and leaves plenty of room to optimize the model.




In [None]:
# Load the training data from its CSV file into a DataFrame
file_path = "/content/train.csv"
weather_data = pd.read_csv(filepath_or_buffer=file_path)


# Defining the **features and target variables** for the model

In [None]:
# Columns the model learns from
features = [
    "YEAR",
    "T2M_RANGE",
    "T2M_MAX",
    "T2M_MIN",
    "RH2M",
    "PS",
    "WS10M",
]

# Columns the model is trained to predict (all three are fitted at once)
targets = ["T2M", "QV2M", "VACATION_RATE"]

# Split the training frame into the feature frame and the target frame
X = weather_data[features]
y = weather_data[targets]


# Defining the **test dataset** and the **features** for the model to analyze and predict from

In [None]:
# Load the data the model will make predictions for
test_file_path = '/content/test.csv'
weather_data_test = pd.read_csv(test_file_path)

# Reuse the training feature list instead of a hand-copied duplicate, so the
# columns used for prediction can never drift from the ones used for fitting.
# list(...) makes a copy so the two names do not alias the same list object.
test_features = list(features)
X_test = weather_data_test[test_features]


# Defining the Model
**HyperParameters**

1. n_estimators :  The number of trees in the random forest. (For XGBRFRegressor this is the size of the forest, not a number of boosting rounds.) More trees usually give more stable predictions, at the cost of longer training time.

2. subsample : The fraction of training rows randomly sampled to fit each tree. It helps control overfitting.

3. colsample_bynode :  The fraction of features randomly sampled as split candidates each time a tree node is split.

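4. learning_rate :  The shrinkage factor applied to the trees' output. In gradient boosting it sets the step size of each round; a random forest is fitted in a single round, so the XGBoost documentation says it should be 1 for random forest regression (which is also the XGBRFRegressor default).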

5. max_depth :  It specifies the maximum depth or level that a decision tree can grow to during the construction process.

6. random_state : The seed for the random number generator. Row and feature sampling are random, so fixing the seed makes the model's behavior reproducible from run to run.


In [None]:
# Random forest trained by XGBoost.
# learning_rate is 1.0 because a random forest is fitted in a single round:
# a smaller value does not regularize anything, it only shrinks every
# prediction. XGBoost's random-forest documentation requires 1 for regression.
model = XGBRFRegressor(
    n_estimators=1000,      # number of trees in the forest
    subsample=0.9,          # fraction of rows sampled per tree
    colsample_bynode=0.1,   # fraction of features sampled per split
    learning_rate=1.0,
    max_depth=7,
    random_state=0,         # fixed seed so the run is reproducible
)


# Fit the model on the whole dataset
model.fit(X, y)


# Start predictions
yhat = model.predict(X_test)


# Using List Comprehension to take out VACATION_RATE values

In [None]:
# Keep only the last value of every prediction row; VACATION_RATE is the last
# entry of the `targets` list the model was fitted on.
vacation_rate = [prediction[-1] for prediction in yhat]

# One consecutive ID per prediction, counting up from 1440
# (presumably the first ID of the test set; confirm against the test file).
ids = list(range(1440, 1440 + len(vacation_rate)))

# Pair every ID with its VACATION_RATE value
data = [(row_id, rate) for row_id, rate in zip(ids, vacation_rate)]


# CREATING A NEW CSV FILE CONTAINING ID AND VACATION_RATE ONLY

In [None]:
# Write the submission file: a header line followed by one (ID, VACATION_RATE)
# line per prediction. newline="" lets the csv module control line endings.
csv_file_path = "/content/submission.csv"
with open(csv_file_path, mode="w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    # Header and data rows are emitted in a single call
    csv_writer.writerows([("ID", "VACATION_RATE"), *data])


print("CSV file has been created.")