In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [None]:
# Load the raw air-quality export; fields in this file are separated by ';'.
df = pd.read_csv('AirQuality1.csv', delimiter=';')

In [None]:
# Remove the row whose index label is 0 (the first row of the freshly loaded frame).
df = df.drop(index=0)

In [None]:
# Renumber the rows 0..n-1 and discard the old index labels.
df = df.reset_index(drop=True)

In [None]:
# Renumber the rows 0..n-1 again; this has no effect when the index is already
# the default 0..n-1 range.
df = df.reset_index(drop=True)

In [None]:
# Plain-text preview of the first five rows.
preview = df.head(n=5)
print(preview)


         Date      Time CO(GT)  PT08.S1(CO)  NMHC(GT) C6H6(GT)  PT08.S2(NMHC)  \
0  10/03/2004  19.00.00      2       1292.0     112.0      9,4          955.0   
1  10/03/2004  20.00.00    2,2       1402.0      88.0      9,0          939.0   
2  10/03/2004  21.00.00    2,2       1376.0      80.0      9,2          948.0   
3  10/03/2004  22.00.00    1,6       1272.0      51.0      6,5          836.0   
4  10/03/2004  23.00.00    1,2       1197.0      38.0      4,7          750.0   

   NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)     T    RH  \
0    103.0        1174.0     92.0        1559.0        972.0  13,3  47,7   
1    131.0        1140.0    114.0        1555.0       1074.0  11,9  54,0   
2    172.0        1092.0    122.0        1584.0       1203.0  11,0  60,0   
3    131.0        1205.0    116.0        1490.0       1110.0  11,2  59,6   
4     89.0        1337.0     96.0        1393.0        949.0  11,2  59,2   

       AH  Unnamed: 15 ,,,,,  
0  0,7255          NaN   

In [None]:
# --- Clean the raw air-quality frame ------------------------------------------
# Measurement columns that must end up numeric (the name is kept from the
# original cell even though several of these are already parsed as floats).
non_numeric_columns = ["CO(GT)", "PT08.S1(CO)", "NMHC(GT)", "C6H6(GT)", "PT08.S2(NMHC)",
                       "NOx(GT)", "PT08.S3(NOx)", "NO2(GT)", "PT08.S4(NO2)", "PT08.S5(O3)",
                       "T", "RH", "AH"]

# Keep the timestamp and measurement columns by name. This discards the two
# trailing junk columns, and unlike "drop the last two columns" it is safe to
# re-run: a second execution no longer removes 'RH' and 'AH'.
df = df[["Date", "Time"] + non_numeric_columns].copy()

# Columns read as text because they use a decimal comma (e.g. "9,4").
comma_sep_cols = ['CO(GT)', 'C6H6(GT)', 'T', 'RH', 'AH']
for col in comma_sep_cols:
    # astype(str) keeps this step valid when the column is already float
    # (cell re-run); missing values become the text 'nan', which
    # pd.to_numeric below turns back into NaN.
    df[col] = df[col].astype(str).str.replace(',', '.', regex=False)

# Single numeric conversion pass; anything unparsable becomes NaN.
for col in non_numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')


# Check for duplicate rows
duplicate_rows = df[df.duplicated()]

# Check for missing values (NaN) in each column
missing_values = df.isna().sum()

# Check for missing values (NaN) in the entire DataFrame
total_missing_values = missing_values.sum()

# Print the results
print("Duplicate rows:")
print(duplicate_rows)

print("\nMissing values in each column:")
print(missing_values)

print("\nTotal missing values in the DataFrame:", total_missing_values)

# Discard every row in which at least one column equals -200.
# NOTE(review): -200 is presumably the dataset's missing-value marker; confirm
# against the data source. A single -200 in any column removes the whole row.
df = df[~df.eq(-200).any(axis=1)]

# Drop rows with missing values. The previous how='all' on all-but-the-last
# column only removed completely empty rows, so partially missing rows would
# have reached the estimators, which reject NaN.
df = df.dropna()

# Remove duplicate rows, then renumber. Previously the de-duplicated frame was
# built from `df` and overwrote the re-indexed one, so the index reset was lost.
cleaned_df = df.drop_duplicates().reset_index(drop=True)

# Later cells read `df`, so point it at an independent copy of the cleaned data;
# otherwise the de-duplication above never reaches the models.
df = cleaned_df.copy()

# Print the cleaned DataFrame
print(cleaned_df)

# Check for missing values and duplicates in the cleaned DataFrame
print("Missing values in each column:")
print(cleaned_df.isna().sum())

print("\nDuplicate rows:")
print(cleaned_df[cleaned_df.duplicated()])


# Check data types of the columns
print(df.dtypes)

Duplicate rows:
     Date Time  CO(GT)  PT08.S1(CO)  NMHC(GT)  C6H6(GT)  PT08.S2(NMHC)  \
9357  NaN  NaN     NaN          NaN       NaN       NaN            NaN   
9358  NaN  NaN     NaN          NaN       NaN       NaN            NaN   
9359  NaN  NaN     NaN          NaN       NaN       NaN            NaN   
9360  NaN  NaN     NaN          NaN       NaN       NaN            NaN   
9361  NaN  NaN     NaN          NaN       NaN       NaN            NaN   
...   ...  ...     ...          ...       ...       ...            ...   
9465  NaN  NaN     NaN          NaN       NaN       NaN            NaN   
9466  NaN  NaN     NaN          NaN       NaN       NaN            NaN   
9467  NaN  NaN     NaN          NaN       NaN       NaN            NaN   
9468  NaN  NaN     NaN          NaN       NaN       NaN            NaN   
9469  NaN  NaN     NaN          NaN       NaN       NaN            NaN   

      NOx(GT)  PT08.S3(NOx)  NO2(GT)  PT08.S4(NO2)  PT08.S5(O3)   T  RH  AH  
9357      NaN    

In [None]:
# Parse the timestamp columns once each. Both are dropped just below, so this
# only serves to fail early if a value does not match the expected format.
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Time'] = pd.to_datetime(df['Time'], format='%H.%M.%S')

# Neither timestamp column is used as a feature, so drop both.
df = df.drop(columns=['Date', 'Time'])

# Splitting the data into features (X) and target variable (y)
X = df.drop(columns=['CO(GT)'])
y = df['CO(GT)']

# Splitting the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training (Linear Regression)
model = LinearRegression()
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is the square root of the MSE. The `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn releases,
# so compute it directly; this works on every version.
rmse = np.sqrt(mse)

print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

Mean Absolute Error: 0.16715993424836964
Mean Squared Error: 0.05142828652094674
Root Mean Squared Error: 0.2267780556423984


In [None]:
# Model Training (Support Vector Regression, linear kernel)
# NOTE(review): the features are passed to SVR unscaled; SVR is sensitive to
# feature scale, which may explain the weaker scores here. Consider
# standardizing the features first.
svm_model = SVR(kernel='linear')  # You can choose different kernels like 'rbf', 'poly', etc.
svm_model.fit(X_train, y_train)

# Make predictions
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
mae_svm = mean_absolute_error(y_test, y_pred_svm)
mse_svm = mean_squared_error(y_test, y_pred_svm)
# Take the square root of the MSE directly: the `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn releases.
rmse_svm = np.sqrt(mse_svm)

print("SVM Mean Absolute Error:", mae_svm)
print("SVM Mean Squared Error:", mse_svm)
print("SVM Root Mean Squared Error:", rmse_svm)

SVM Mean Absolute Error: 0.7937187503618001
SVM Mean Squared Error: 1.0587191753578826
SVM Root Mean Squared Error: 1.0289408026499303


In [None]:
# Model Training (Decision Tree Regression)
# A fixed random_state makes the fitted tree, and therefore the metrics below,
# reproducible across runs; 42 matches the seed used for the train/test split.
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)

# Model Evaluation
y_pred_dt = dt_model.predict(X_test)
mae_dt = mean_absolute_error(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
# Take the square root of the MSE directly: the `squared=False` keyword of
# mean_squared_error is deprecated and removed in newer scikit-learn releases.
rmse_dt = np.sqrt(mse_dt)

print("Decision Tree Mean Absolute Error:", mae_dt)
print("Decision Tree Mean Squared Error:", mse_dt)
print("Decision Tree Root Mean Squared Error:", rmse_dt)

Decision Tree Mean Absolute Error: 0.18734939759036146
Decision Tree Mean Squared Error: 0.07126506024096387
Decision Tree Root Mean Squared Error: 0.2669551652262302
