In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Load dataset
df = pd.read_excel("AirQualityUCI.xlsx", sheet_name="AirQualityUCI")

# Merge the separate Date and Time columns into a single timestamp column.
# NOTE: uses the .dt accessor, so 'Date' must have been parsed as datetime
# by read_excel.
df['Datetime'] = pd.to_datetime(
    df['Date'].dt.strftime('%Y-%m-%d') + ' ' + df['Time'].astype(str)
)
df.drop(columns=['Date', 'Time'], inplace=True)

# Columns used by the model
target_col = 'CO(GT)'
feature_cols = ['PT08.S1(CO)', 'PT08.S2(NMHC)', 'PT08.S3(NOx)', 'PT08.S4(NO2)',
                'PT08.S5(O3)', 'T', 'RH', 'AH', 'Hour', 'DayOfWeek']

# Replace invalid values (-200) with NaN
df.replace(to_replace=-200, value=np.nan, inplace=True)

# Remember which rows have no real target reading *before* imputing, so that
# forward/back-filled (i.e. fabricated) labels are never used to fit or score
# the model.
target_missing = df[target_col].isna()

# Impute the remaining gaps: forward fill, then back fill any leading NaNs
df.ffill(inplace=True)
df.bfill(inplace=True)

# Keep only rows whose target value was actually measured
df.drop(index=df.index[target_missing], inplace=True)

# Feature engineering
df['Hour'] = df['Datetime'].dt.hour
df['DayOfWeek'] = df['Datetime'].dt.dayofweek

# Define features and target (target stays in its original units)
X = df[feature_cols]
y = df[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize features using statistics learned from the training rows only,
# so nothing about the test set leaks into preprocessing. The arrays are
# wrapped back into DataFrames to keep column names and row labels.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=feature_cols, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=feature_cols, index=X_test.index
)

# Train regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predictions
y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

# R² scores
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)

# Matrix output
r2_matrix = pd.DataFrame({
    'Dataset': ['Train', 'Test'],
    'R2 Score': [r2_train, r2_test]
})
print(r2_matrix)
