In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,mean_squared_error, r2_score

# Read the CSV file, using semicolon as a delimiter and skipping the last empty column
data = pd.read_csv("AirQuality.csv", delimiter=';', usecols=lambda x: 'Unnamed' not in x)

# Clean up column names by removing leading/trailing spaces
data.columns = data.columns.str.strip()

print("Column names after cleaning:")
print(data.columns)

Column names after cleaning:
Index(['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)',
       'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)',
       'PT08.S5(O3)', 'T', 'RH', 'AH'],
      dtype='object')


In [4]:
# Convert columns with commas to numeric, replacing ',' with '.'
cols_to_convert = ['CO(GT)', 'C6H6(GT)', 'T', 'RH', 'AH']
for col in cols_to_convert:
    data[col] = data[col].str.replace(',', '.', regex=False).astype(float)

# Convert 'Date' and 'Time' to datetime objects
data['Date'] = pd.to_datetime(data['Date'], format='%d/%m/%Y', errors='coerce')
data['Time'] = pd.to_datetime(data['Time'], format='%H.%M.%S', errors='coerce').dt.time

# Combine 'Date' and 'Time' into a single datetime column
data['DateTime'] = pd.to_datetime(data['Date'].astype(str) + ' ' + data['Time'].astype(str), errors='coerce')

# Drop the original 'Date' and 'Time' columns if no longer needed
data = data.drop(['Date', 'Time'], axis=1)

# Display the data types to confirm changes
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9471 entries, 0 to 9470
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   CO(GT)         9357 non-null   float64       
 1   PT08.S1(CO)    9357 non-null   float64       
 2   NMHC(GT)       9357 non-null   float64       
 3   C6H6(GT)       9357 non-null   float64       
 4   PT08.S2(NMHC)  9357 non-null   float64       
 5   NOx(GT)        9357 non-null   float64       
 6   PT08.S3(NOx)   9357 non-null   float64       
 7   NO2(GT)        9357 non-null   float64       
 8   PT08.S4(NO2)   9357 non-null   float64       
 9   PT08.S5(O3)    9357 non-null   float64       
 10  T              9357 non-null   float64       
 11  RH             9357 non-null   float64       
 12  AH             9357 non-null   float64       
 13  DateTime       9357 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(13)
memory usage: 1.0 MB
None


In [5]:
# Define features (x) and target variable (y)
features = ['CO(GT)', 'PT08.S1(CO)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']
target = 'NO2(GT)'

x = data[features]
y = data[target]

# Handle missing values by dropping rows with NaNs
# This is a simple approach, more sophisticated methods like imputation can be used
data_cleaned = data.dropna(subset=features + [target])
x_cleaned = data_cleaned[features]
y_cleaned = data_cleaned[target]


# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x_cleaned, y_cleaned, test_size=0.2, random_state=2)

# Initialize and train the Linear Regression model
lrr = LinearRegression()
lrr.fit(x_train, y_train)

# Make predictions
y_lrr_pred = lrr.predict(x_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_lrr_pred)
r2 = r2_score(y_test, y_lrr_pred)

print("Linear Regression Mean Squared Error (MSE):", mse)
print("Linear Regression R-squared (R2):", r2)

Linear Regression Mean Squared Error (MSE): 3070.239529721829
Linear Regression R-squared (R2): 0.8086949817663733
