In [None]:
# NOTE: This entire cell is commented out and does not run. It is an XGBoost
# baseline that splits the data with a random train_test_split (random_state=42).
# Because nothing here executes, the names it would assign (e.g. `mse`, `r2`)
# are NOT defined by this cell at runtime.
# import pandas as pd
# from xgboost import XGBRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, r2_score

# # Load data
# df = pd.read_excel("Feature_engineering_DS.xlsx")

# # Define features (X) and target (y)
# X = df.drop(columns=["L8","date"])
# y = df["L8"]

# # Split into train/test
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )

# # Initialize and train XGBoost model
# model = XGBRegressor(
#     n_estimators=500,
#     learning_rate=0.05,
#     max_depth=6,
#     subsample=0.8,
#     colsample_bytree=0.8,
#     random_state=42,
#     objective='reg:squarederror'
# )

# model.fit(X_train, y_train)

# # Predictions
# y_pred = model.predict(X_test)

# # Evaluation
# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"Mean Squared Error: {mse:.4f}")
# print(f"R² Score: {r2:.4f}")



In [None]:
# X_train.head()


In [None]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
import numpy as np

# Load the engineered feature table.
df = pd.read_excel("Feature_engineering_DS.xlsx")


def _parse_timestamps(raw):
    """Return `raw` converted to datetimes so that sorting is chronological.

    The `date` column is stored as text such as "01-01-2022 12:15:00 AM".
    Sorting that text directly is lexicographic ("01:00:00 AM" sorts before
    "12:00:00 AM", and every year of "01-01-..." sorts before "02-01-..."),
    which silently breaks a time-based split.

    Parameters
    ----------
    raw : pandas.Series
        The date column, either already datetime-typed or as strings.

    Returns
    -------
    pandas.Series
        Datetime values aligned with `raw`.

    Notes
    -----
    The day/month order cannot be determined from the first rows alone, so
    day-first is tried before month-first. A wrong guess raises as soon as a
    value above 12 appears in the month position, so the fallback only
    triggers on a genuine mismatch. TODO: confirm the real format and pin it.
    """
    if pd.api.types.is_datetime64_any_dtype(raw):
        return raw
    for fmt in ("%d-%m-%Y %I:%M:%S %p", "%m-%d-%Y %I:%M:%S %p"):
        try:
            return pd.to_datetime(raw, format=fmt)
        except (ValueError, TypeError):
            continue
    # Last resort: let pandas infer the format (raises if it cannot).
    return pd.to_datetime(raw)


# Order rows chronologically. The sort key is the parsed timestamp, not the raw
# string; the stored `date` column itself is left unchanged.
df = df.sort_values(by="date", key=_parse_timestamps, kind="stable").reset_index(drop=True)

# Define features (X) and target (y)
X = df.drop(columns=["L8", "date"])
y = df["L8"]

# Time-based split (no randomization): first 80% of rows train, last 20% test.
split_index = int(len(df) * 0.8)

X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# Initialize and train XGBoost model
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    objective='reg:squarederror'
)

model.fit(X_train, y_train)

# Predictions on the held-out (latest) 20% of rows
y_pred = model.predict(X_test)

# Evaluation; the xgb_ prefix keeps these distinct from other models' metrics.
xgb_mse = mean_squared_error(y_test, y_pred)
xgb_r2 = r2_score(y_test, y_pred)
xgb_rmse = np.sqrt(xgb_mse)
xgb_mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {xgb_mse:.2f}")
print(f"Root Mean Squared Error: {xgb_rmse:.2f}")
print(f"R² Score: {xgb_r2:.4f}")
print(f"MAE Score: {xgb_mae:.4f}")


Training samples: 84172
Testing samples: 21044
Mean Squared Error: 73114.46
Root Mean Squared Error: 270.40
R² Score: 0.8192
MAE Score: 191.0665


In [None]:
#mo

# Task
Rewrite the provided code using a CNN model.

## Data preparation

### Subtask:
Reshape the data to be suitable for a CNN model, which typically expects a 3D input (samples, timesteps, features).


**Reasoning**:
Reshape the training and testing data into the 3D layout a CNN expects, using a single timestep per sample (shape: samples, 1, features), and verify the resulting shapes.



In [None]:
# Number of feature columns. The reshape below uses a single timestep per
# sample, so this value is not consumed in this cell.
n_steps = X_train.shape[1]

# Insert a length-1 "timesteps" axis: (samples, features) -> (samples, 1, features).
train_rows, train_cols = X_train.shape
test_rows, test_cols = X_test.shape
X_train_reshaped = X_train.values.reshape(train_rows, 1, train_cols)
X_test_reshaped = X_test.values.reshape(test_rows, 1, test_cols)

# Sanity-check the resulting 3D shapes.
print("Reshaped X_train shape:", X_train_reshaped.shape)
print("Reshaped X_test shape:", X_test_reshaped.shape)

Reshaped X_train shape: (84172, 1, 11)
Reshaped X_test shape: (21044, 1, 11)


## Model building

### Subtask:
Define and compile a CNN model architecture using a deep learning framework like TensorFlow or PyTorch.


**Reasoning**:
Define and compile a CNN model architecture using TensorFlow/Keras as instructed.



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, Input, MaxPooling1D
from tensorflow.keras.utils import set_random_seed

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs.
set_random_seed(42)

# Define the CNN model. The input shape is declared with an explicit Input
# layer; passing `input_shape=` to the first Conv1D layer instead makes Keras
# emit a warning when the layer is constructed.
model_cnn = Sequential([
    # (timesteps, features) of one sample, taken from the reshaped training array.
    Input(shape=X_train_reshaped.shape[1:]),
    # Convolution with a window of one timestep.
    Conv1D(filters=64, kernel_size=1, activation='relu'),
    # pool_size=1 leaves the sequence length unchanged.
    MaxPooling1D(pool_size=1),
    # Collapse (timesteps, filters) into a flat vector for the dense head.
    Flatten(),
    Dense(100, activation='relu'),
    # Single linear unit: one regression target per sample.
    Dense(1),
])

# MSE is the training loss; 'mae' and 'mse' are tracked as metrics, in that
# order, which is the order evaluate() reports them after the loss.
model_cnn.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])

# Print model summary
model_cnn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


## Model training

### Subtask:
Train the CNN model on the prepared training data.


In [None]:
# Fit the CNN on the reshaped training set. A fifth of the supplied training
# data is held out (validation_split=0.2) so validation loss/metrics are
# reported each epoch. The per-epoch log is returned and kept in `history`.
history = model_cnn.fit(
    x=X_train_reshaped,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_split=0.2,
)

Epoch 1/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - loss: 491275.6562 - mae: 318.9229 - mse: 491275.6562 - val_loss: 88986.8047 - val_mae: 196.4779 - val_mse: 88986.8047
Epoch 2/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 62963.7891 - mae: 174.6901 - mse: 62963.7891 - val_loss: 86846.8594 - val_mae: 199.6904 - val_mse: 86846.8594
Epoch 3/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 63420.5430 - mae: 176.0598 - mse: 63420.5430 - val_loss: 87862.4219 - val_mae: 197.7157 - val_mse: 87862.4219
Epoch 4/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 62053.6445 - mae: 173.1850 - mse: 62053.6445 - val_loss: 92817.1719 - val_mae: 200.6893 - val_mse: 92817.1719
Epoch 5/50
[1m2105/2105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 63517.2422 - mae: 175.0953 - mse: 63517.2422 - val_loss: 90815.2500 - val_mae:

**Reasoning**:
The task is to summarize the data. I will load the data and display the first few rows, along with the data types and descriptive statistics to get an overview of the dataset.



**Reasoning**:
The previous attempt to load the data failed because the file 'data.csv' was not found. I need to load the data from the correct file path, which is '/data/data.csv'.



**Reasoning**:
The previous attempts to load the data failed because the file was not found at either 'data.csv' or '/data/data.csv'. The dataset used throughout this notebook is the Excel workbook 'Feature_engineering_DS.xlsx', so I will load that file and display the first rows, the column types, and the descriptive statistics.



In [None]:
# Reload the workbook and show a quick overview of its contents.
df = pd.read_excel('Feature_engineering_DS.xlsx')

# First rows, rendered as a rich table.
display(df.head())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would additionally render a stray "None".
df.info()

# Summary statistics for the numeric columns.
display(df.describe())

Unnamed: 0,L1,L2,L3,L4,L5,L6,L7,Weekends,holiday,local/national,Week,L8,date
0,2870,2845,2832,2937,1898,2323,2630,1,0,0,6,2526,01-01-2022 12:00:00 AM
1,2864,2850,2814,2923,1931,2345,2580,1,0,0,6,2505,01-01-2022 12:15:00 AM
2,2846,2867,2791,2884,1924,2347,2523,1,0,0,6,2560,01-01-2022 12:30:00 AM
3,2809,2826,2850,2904,1916,2288,2549,1,0,0,6,2551,01-01-2022 12:45:00 AM
4,2788,2818,2815,2851,1929,2225,2507,1,0,0,6,2572,01-01-2022 01:00:00 AM


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105216 entries, 0 to 105215
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   L1              105216 non-null  int64 
 1   L2              105216 non-null  int64 
 2   L3              105216 non-null  int64 
 3   L4              105216 non-null  int64 
 4   L5              105216 non-null  int64 
 5   L6              105216 non-null  int64 
 6   L7              105216 non-null  int64 
 7   Weekends        105216 non-null  int64 
 8   holiday         105216 non-null  int64 
 9   local/national  105216 non-null  int64 
 10  Week            105216 non-null  int64 
 11  L8              105216 non-null  int64 
 12  date            105216 non-null  object
dtypes: int64(12), object(1)
memory usage: 10.4+ MB


None

Unnamed: 0,L1,L2,L3,L4,L5,L6,L7,Weekends,holiday,local/national,Week,L8
count,105216.0,105216.0,105216.0,105216.0,105216.0,105216.0,105216.0,105216.0,105216.0,105216.0,105216.0,105216.0
mean,4382.148399,4383.106438,4384.123413,4385.175848,4386.372405,4387.845774,4389.291382,0.286496,0.072993,0.081204,4.0,4390.572546
std,687.233495,686.569065,685.78859,685.088589,683.774668,681.439406,679.781796,0.452126,0.260126,0.301719,2.002289,678.489133
min,1772.0,1772.0,1772.0,1772.0,1841.0,2192.0,2228.0,0.0,0.0,0.0,1.0,2228.0
25%,3895.0,3896.0,3897.75,3899.0,3900.0,3902.0,3903.0,0.0,0.0,0.0,2.0,3905.0
50%,4396.0,4397.0,4398.0,4399.0,4399.0,4400.0,4401.0,0.0,0.0,0.0,4.0,4402.0
75%,4860.0,4861.0,4861.0,4862.0,4862.0,4862.0,4863.0,1.0,0.0,0.0,6.0,4863.0
max,6455.0,6455.0,6455.0,6455.0,6455.0,6455.0,6455.0,1.0,1.0,2.0,7.0,6455.0


## Model evaluation

### Subtask:
Evaluate the trained CNN model on the test data using appropriate metrics like MSE and R².

**Reasoning**:
Evaluate the performance of the trained CNN model on the test dataset using Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and R² Score.

In [None]:
# Evaluate the CNN on the held-out test set. evaluate() returns the loss
# followed by the compiled metrics, here: loss (MSE), MAE, MSE.
loss, mae, mse = model_cnn.evaluate(X_test_reshaped, y_test, verbose=0)
rmse = np.sqrt(mse)

# R² is not a compiled metric, so compute it from explicit predictions.
# verbose=0 keeps the progress bar out of the metric report below.
y_pred_cnn = model_cnn.predict(X_test_reshaped, verbose=0)
r2_cnn = r2_score(y_test, y_pred_cnn)

print("CNN Model Evaluation:")
print(f"  Mean Squared Error (MSE): {mse:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse:.2f}")
# MAE is reported as well so the CNN can be compared with the XGBoost cell,
# which prints MSE, RMSE, R² and MAE.
print(f"  Mean Absolute Error (MAE): {mae:.2f}")
print(f"  R² Score: {r2_cnn:.4f}")

CNN Model Evaluation:
  Mean Squared Error (MSE): 64744.29
  Root Mean Squared Error (RMSE): 254.45
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
  R² Score: 0.8399


## Finish task

### Subtask:
Present the results and findings from the CNN model.

**Reasoning**:
Summarize the performance of the CNN model by comparing its evaluation metrics (MSE, RMSE, R²) to the previously obtained metrics from the XGBoost model.

In [None]:
# Side-by-side report of both models on the same test set.
# XGBoost metrics live in the xgb_* variables from the XGBoost cell; the CNN
# metrics are `mse`, `rmse`, `mae` and `r2_cnn` from the CNN evaluation cell.
# Each section must print its own model's values.
print("Comparison of Model Performance:")
print("-" * 30)
print("XGBoost Model:")
print(f"  Mean Squared Error (MSE): {xgb_mse:.2f}")
print(f"  Root Mean Squared Error (RMSE): {xgb_rmse:.2f}")
print(f"  Mean Absolute Error (MAE): {xgb_mae:.2f}")
print(f"  R² Score: {xgb_r2:.4f}")
print("-" * 30)
print("CNN Model:")
print(f"  Mean Squared Error (MSE): {mse:.2f}")
print(f"  Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"  Mean Absolute Error (MAE): {mae:.2f}")
print(f"  R² Score: {r2_cnn:.4f}")