In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Enable inline plotting
%matplotlib inline


In [None]:
# Read the raw CSV. The file is parsed with ';' as the field separator,
# ',' as the decimal mark, and Latin-1 as the text encoding.
csv_options = {"sep": ";", "decimal": ",", "encoding": "latin1"}
df = pd.read_csv("AirQualityUCI.csv", **csv_options)

# Preview the first rows of the freshly loaded frame
df.head()


In [None]:
# Parse "Date Time" strings (day/month/year, dot-separated clock time) into
# timestamps; anything that does not match the format becomes NaT.
raw_stamp = df['Date'] + ' ' + df['Time']
timestamps = pd.to_datetime(raw_stamp, format='%d/%m/%Y %H.%M.%S', errors='coerce')

# In one pass: discard the two unnamed filler columns and the now-redundant
# Date/Time text columns, index the frame by the parsed timestamps (index
# name 'Datetime'), and order the rows chronologically.
df = (
    df.drop(columns=['Unnamed: 15', 'Unnamed: 16', 'Date', 'Time'])
      .set_index(timestamps.rename('Datetime'))
      .sort_index()
)
df.head()


In [None]:
# Clean missing readings in three steps, rebinding `df` to the cleaned frame.
df = (
    # 1. Replace the -200 sentinel (representing a missing value) with NaN.
    #    A float NaN is used instead of pd.NA: inserting pd.NA into float64
    #    columns casts them to object dtype, and the later describe(),
    #    corr(numeric_only=True) and .plot() calls need numeric dtypes.
    df.replace(-200, float('nan'))
      # 2. Keep only rows that still have at least 5 non-missing values.
      .dropna(thresh=5)
      # 3. Forward-fill the remaining gaps from the previous row.
      #    DataFrame.ffill() replaces the deprecated fillna(method='ffill').
      .ffill()
)

# Missing values left per column. Forward fill cannot fill a gap at the very
# start of a column, so non-zero counts here point at leading NaNs.
df.isna().sum()


In [None]:
# Descriptive statistics for the columns that describe() summarises by default
summary_stats = df.describe()
summary_stats


In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr_matrix = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()


In [None]:
# Draw one time-series panel per selected column, stacked in a single figure.
# NOTE(review): 'T' and 'RH' look like weather variables rather than
# pollutants (presumably temperature and relative humidity) — confirm.
pollutants = ['CO(GT)', 'NOx(GT)', 'NO2(GT)', 'C6H6(GT)', 'T', 'RH']

plot_options = {
    "subplots": True,
    "figsize": (15, 12),
    "title": 'Air Quality Time Series Trends',
}
df.loc[:, pollutants].plot(**plot_options)

plt.tight_layout()
plt.show()
