In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
# #########Q1#########
# Load oil.csv. This file contains years worth of data of the daily
# oil price. However, the data is missing for a few days. 
# Make sure that every day contains
# a value using any data imputation technique 
# that you learned during the data preparation
# week or during the missing values imputation week

# 1. Load the data
oil_df = pd.read_csv('oil.csv')

# 2. Convert the 'date' column to datetime and use it as the index
oil_df['date'] = pd.to_datetime(oil_df['date'])
oil_df = oil_df.set_index('date')

# 3. Linear interpolation fills gaps that have a valid value before them.
#    Leading NaNs (no earlier value to interpolate from) are left untouched by this step.
oil_df_interpolated = oil_df.interpolate(method='linear')

# 4. Mean of the observed (non-NaN) oil prices, taken from the un-interpolated data
mean_oil_price = oil_df['dcoilwtico'].mean()

# 5. Fill the remaining (leading) NaNs with that mean.
#    Assign the result back instead of calling fillna(inplace=True) on a column selection:
#    the chained in-place form emits a FutureWarning on pandas 2.x and is a silent no-op
#    once Copy-on-Write is enabled, which would leave the NaNs in place.
oil_df_interpolated['dcoilwtico'] = oil_df_interpolated['dcoilwtico'].fillna(mean_oil_price)

# Sanity check: every row must now carry a price (fails e.g. if the column was entirely NaN)
assert not oil_df_interpolated['dcoilwtico'].isna().any(), "oil price still has missing values"

# Write the imputed series (with its 'date' index) to 'oil2.csv'
oil_df_interpolated.to_csv('oil2.csv')

# oil_df_interpolated.head()



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn import datasets, linear_model
from sklearn.datasets import load_iris
#Note: imports are optional (I used VSCode to do the assignment)
#########Q2#########
# Augment the data in test.csv and train.csv with the oil price data.
# Re-loading the train.csv and test.csv files
# Load the imputed oil prices from 'oil2.csv' together with the raw train/test sets
oil_df_interpolated = pd.read_csv('oil2.csv')
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Parse 'date' in all three frames so the merge key has the same dtype everywhere
for frame in (oil_df_interpolated, train_df, test_df):
    frame['date'] = pd.to_datetime(frame['date'])

# Left-merge on 'date': every train/test row is kept; dates without an oil price get NaN
train_augmented = train_df.merge(oil_df_interpolated, on='date', how='left')
test_augmented = test_df.merge(oil_df_interpolated, on='date', how='left')

# index=False: the RangeIndex carries no information, and writing it would add a
# spurious 'Unnamed: 0' column when the files are read back
train_augmented.to_csv('train_augmented.csv', index=False)
test_augmented.to_csv('test_augmented.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn import datasets, linear_model
from sklearn.datasets import load_iris
########Question 3########
# Load the augmented datasets and drop 'date', which is not used as a feature
train_augmented = pd.read_csv('train_augmented.csv').drop(columns=['date'])
test_augmented = pd.read_csv('test_augmented.csv').drop(columns=['date'])

# frac=1 keeps every row (shuffled); lower it (e.g. 0.3, 0.7) to iterate on a subset
sampled_train_augmented = train_augmented.sample(frac=1, random_state=42)

# Encode 'family' as integers. The encoder is fitted on the training data only,
# so transform() raises if the test set contains a family it has not seen.
label_encoder = LabelEncoder()
print(sampled_train_augmented.tail())
sampled_train_augmented['family'] = label_encoder.fit_transform(sampled_train_augmented['family'])
test_augmented['family'] = label_encoder.transform(test_augmented['family'])

# Fill missing values with the training-set column means.
# The means are computed once and reused for the test set, and numeric_only=True keeps
# this working on pandas >= 2 even if a non-numeric column is present.
train_means = sampled_train_augmented.mean(numeric_only=True)
sampled_train_augmented = sampled_train_augmented.fillna(train_means)
test_augmented = test_augmented.fillna(train_means)

# Preparing the data for training
features = ['family', 'dcoilwtico']
X = sampled_train_augmented[features]
y = sampled_train_augmented['sales']

# Hold out 20% of the training data for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
lr_model = LinearRegression()
rf_model = RandomForestRegressor(n_estimators=10, max_depth=5, min_samples_split=4, min_samples_leaf=2, random_state=42)

# Train models
lr_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# Predict on validation set
lr_pred_val = lr_model.predict(X_val)
rf_pred_val = rf_model.predict(X_val)

# Calculate and compare MSE
lr_mse = mean_squared_error(y_val, lr_pred_val)
rf_mse = mean_squared_error(y_val, rf_pred_val)

# Convert MSE to DataFrames
lr_mse_df = pd.DataFrame({'Linear Regression MSE': [lr_mse]})
rf_mse_df = pd.DataFrame({'Random Forest MSE': [rf_mse]})

# Save MSE to CSV files (optional)
lr_mse_df.to_csv('lr_mse.csv', index=False)
rf_mse_df.to_csv('rf_mse.csv', index=False)

# Print the values to console, plus the range of the target for scale
print("Linear Regression MSE: ", lr_mse)
print("Random Forest MSE: ", rf_mse)
print(train_augmented.sales.max())
print(train_augmented.sales.min())

# Predict on test set
X_test = test_augmented[features]
test_augmented['lr_sales_prediction'] = lr_model.predict(X_test)
test_augmented['rf_sales_prediction'] = rf_model.predict(X_test)

# Save predictions to a CSV file (one row per test row, in the same order as test_augmented)
test_augmented[['lr_sales_prediction', 'rf_sales_prediction']].to_csv('predictions_test_augmented.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

########Question 4########
# Rebuild the 'id' column on the predictions so they can be merged with the submission.
# This relies on predictions_test_augmented.csv and test_augmented.csv having the same
# rows in the same order; the row counts are checked below before ids are copied across.
test_augmented = pd.read_csv('test_augmented.csv')
predictions_test_augmented = pd.read_csv('predictions_test_augmented.csv')
if len(predictions_test_augmented) != len(test_augmented):
    # Without this guard a length mismatch would silently produce NaN or truncated ids
    raise ValueError(
        "predictions_test_augmented.csv and test_augmented.csv have different row counts "
        f"({len(predictions_test_augmented)} vs {len(test_augmented)}); cannot align ids by position"
    )
predictions_test_augmented['id'] = test_augmented['id']

submission = pd.read_csv('submission.csv')

# Merge predictions with submission (default inner join: only ids present in both are kept)
comparison_df = predictions_test_augmented.merge(submission, on='id')

print(comparison_df.head())

# Function to calculate RMSE
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Function to calculate MAPE
# def mape(y_true, y_pred): 
#     epsilon = 1e-10
#     return np.mean(np.abs((y_true - y_pred) / (y_true + epsilon))) * 100
# EDIT: MAPE is not a good metric for this problem because the data contains many zeros.
# I tried adding a small offset (epsilon) to the denominator to avoid division by zero, but wherever the actual sales (y_true) are zero or very close to zero the percentage error explodes, which makes the overall MAPE extremely large and meaningless.
# So I use the Median Absolute Deviation (MedAD) as a third metric instead (see below).

def medad(y_true, y_pred):
    """Return the median of the absolute errors ``|y_true - y_pred|``.

    This is the quantity often called the median absolute error. Inputs are
    compared element by element in positional order, so lists, NumPy arrays
    and pandas Series are all accepted and Series index labels are ignored.

    Parameters:
        y_true: array-like of actual values.
        y_pred: array-like of predicted values, same shape as ``y_true``.

    Returns:
        The median absolute error as a float.

    Raises:
        ValueError: if ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # dtype=float also avoids wrap-around when subtracting unsigned integers
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    if true_arr.shape != pred_arr.shape:
        # Without this check NumPy could silently broadcast mismatched inputs
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {true_arr.shape} and {pred_arr.shape}"
        )
    return np.median(np.abs(true_arr - pred_arr))

# Pull the three columns out once so each metric call below reads clearly
actual_sales = comparison_df['sales']
lr_predicted = comparison_df['lr_sales_prediction']
rf_predicted = comparison_df['rf_sales_prediction']

# Linear Regression errors (MAPE was dropped as a metric, so it is not computed)
lr_rmse = rmse(actual_sales, lr_predicted)
lr_mad = mean_absolute_error(actual_sales, lr_predicted)
lr_medad = medad(actual_sales, lr_predicted)

# Random Forest errors (same three metrics)
rf_rmse = rmse(actual_sales, rf_predicted)
rf_mad = mean_absolute_error(actual_sales, rf_predicted)
rf_medad = medad(actual_sales, rf_predicted)

# Report: RMSE and MAD per model first, then MedAD per model
print(f"Linear Regression - RMSE: {lr_rmse}, MAD: {lr_mad}")
print(f"Random Forest - RMSE: {rf_rmse}, MAD: {rf_mad}")
print(f"Linear Regression - MedAD: {lr_medad}")
print(f"Random Forest - MedAD: {rf_medad}")

# Question 4 Analysis


## Question 4, c) 

### "Compare the three errors. Are they in agreement? Do you think any of the methods is objectively better than the others in this case?"

I chose MedAD as the third metric for comparing the models' performance. As used here, MedAD is the median of the absolute differences between the predicted and actual values (also known as the median absolute error). It is useful because it is robust to outliers and, unlike MAPE, it does not blow up when the actual sales are zero or close to zero (see the comments in the Question 4 code).

Analysis:

All three metrics show a **consistent** trend: the Random Forest model has significantly lower errors compared to the Linear Regression model. This consistency across different types of error metrics indicates a strong agreement in their assessment of the models' performance.

**The Random Forest model is the better model** for this case because it has lower errors across all three metrics (RMSE, MAD, and MedAD), indicating that it predicts the actual values of the data points more accurately. 
- A lower RMSE indicates that it is better at handling larger deviations in predictions. 
- A lower MAD indicates that, on average, it makes smaller errors than the Linear Regression model. 
- A lower MedAD indicates that the median size of its errors is much smaller, i.e. its predictions are more consistent and accurate for the majority of the data points.
_____

Interpretation of the errors:

- RMSE: It emphasizes larger errors more due to the squaring of the residuals. The fact that RF has a substantially lower RMSE suggests it's better at handling larger deviations in predictions.
- MAD: This metric gives an average level of error and is less sensitive to outliers than RMSE. The lower MAD for the Random Forest model indicates that on average, it makes smaller errors than the Linear Regression model.
- MedAD: This metric gives the typical or median error in the predictions and is robust to outliers. The significantly lower MedAD for the Random Forest model suggests that the median size of errors is much smaller, indicating more consistent and accurate predictions for the majority of the data points.