In [2]:
import os
import pandas as pd

# --- PATH SETUP ---
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
DATA_RAW = os.path.join(PROJECT_ROOT, "data", "raw")
DATA_PROCESSED = os.path.join(PROJECT_ROOT, "data", "processed")

print("Project root:", PROJECT_ROOT)
print("Raw data path:", DATA_RAW)
print("Processed data path:", DATA_PROCESSED)

Project root: e:\thesis
Raw data path: e:\thesis\data\raw
Processed data path: e:\thesis\data\processed


In [3]:
df = pd.read_csv(os.path.join(DATA_RAW, "GlobalWeatherRepository.csv"))

In [4]:
drop_columns = [
    # Targets
    "air_quality_gb-defra-index",

    # Duplicates
    "temperature_fahrenheit",
    "wind_mph",
    "pressure_in",
    "gust_mph",

    # Astronomical
    "sunrise", "sunset",
    "moonrise", "moonset",
    "moon_phase", "moon_illumination",

    # Metadata
    "last_updated", "last_updated_epoch"
]

df = df.drop(columns=drop_columns, errors="ignore")


In [5]:
X = df.drop(columns=["air_quality_us-epa-index"])
y = df["air_quality_us-epa-index"]

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("Target classes:\n", y.value_counts())

Features shape: (115568, 27)
Target shape: (115568,)
Target classes:
 air_quality_us-epa-index
1    60842
2    36635
3     9603
4     6960
5     1098
6      430
Name: count, dtype: int64


In [6]:
df = pd.read_csv(os.path.join(DATA_PROCESSED, "cleaned_air_quality_data.csv"))

In [7]:
df.to_csv(os.path.join(DATA_PROCESSED, "weather_cleaned_temp.csv"), index=False)


In [8]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(include=["number"]).columns

print("Categorical columns:")
print(categorical_cols)

print("\nNumerical columns:")
print(numerical_cols)

Categorical columns:
Index(['country', 'location_name', 'timezone', 'condition_text',
       'wind_direction'],
      dtype='object')

Numerical columns:
Index(['latitude', 'longitude', 'temperature_celsius', 'wind_kph',
       'wind_degree', 'pressure_mb', 'precip_mm', 'precip_in', 'humidity',
       'cloud', 'feels_like_celsius', 'feels_like_fahrenheit', 'visibility_km',
       'visibility_miles', 'uv_index', 'gust_kph',
       'air_quality_Carbon_Monoxide', 'air_quality_Ozone',
       'air_quality_Nitrogen_dioxide', 'air_quality_Sulphur_dioxide',
       'air_quality_PM2.5', 'air_quality_PM10'],
      dtype='object')


In [9]:
X = X.drop(columns=["location_name", "timezone"])

In [10]:
print(X.select_dtypes(include=["object"]).columns)

Index(['country', 'condition_text', 'wind_direction'], dtype='object')


In [11]:
# Check class distribution
print(y.value_counts().sort_index())

air_quality_us-epa-index
1    60842
2    36635
3     9603
4     6960
5     1098
6      430
Name: count, dtype: int64


In [12]:
# Percentage distribution
print((y.value_counts(normalize=True).sort_index()) * 100)

air_quality_us-epa-index
1    52.646061
2    31.699952
3     8.309394
4     6.022428
5     0.950090
6     0.372075
Name: proportion, dtype: float64


In [13]:
pip install numpy pandas matplotlib seaborn scikit-learn codecarbon

Note: you may need to restart the kernel to use updated packages.


In [14]:
import sklearn
print(sklearn.__version__)

1.8.0


In [15]:
print(X.select_dtypes(include=["object"]).columns)

Index(['country', 'condition_text', 'wind_direction'], dtype='object')


In [16]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
X["country"] = le.fit_transform(X["country"])


In [17]:
print(X.dtypes)

country                           int64
latitude                        float64
longitude                       float64
temperature_celsius             float64
condition_text                   object
wind_kph                        float64
wind_degree                       int64
wind_direction                   object
pressure_mb                     float64
precip_mm                       float64
precip_in                       float64
humidity                          int64
cloud                             int64
feels_like_celsius              float64
feels_like_fahrenheit           float64
visibility_km                   float64
visibility_miles                float64
uv_index                        float64
gust_kph                        float64
air_quality_Carbon_Monoxide     float64
air_quality_Ozone               float64
air_quality_Nitrogen_dioxide    float64
air_quality_Sulphur_dioxide     float64
air_quality_PM2.5               float64
air_quality_PM10                float64


In [18]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for col in X.select_dtypes(include=["object"]).columns:
    X[col] = le.fit_transform(X[col])


In [19]:
print(X.dtypes)

country                           int64
latitude                        float64
longitude                       float64
temperature_celsius             float64
condition_text                    int64
wind_kph                        float64
wind_degree                       int64
wind_direction                    int64
pressure_mb                     float64
precip_mm                       float64
precip_in                       float64
humidity                          int64
cloud                             int64
feels_like_celsius              float64
feels_like_fahrenheit           float64
visibility_km                   float64
visibility_miles                float64
uv_index                        float64
gust_kph                        float64
air_quality_Carbon_Monoxide     float64
air_quality_Ozone               float64
air_quality_Nitrogen_dioxide    float64
air_quality_Sulphur_dioxide     float64
air_quality_PM2.5               float64
air_quality_PM10                float64


In [20]:
pollutant_cols = [
    "air_quality_PM2.5",
    "air_quality_PM10",
    "air_quality_Ozone",
    "air_quality_Carbon_Monoxide",
    "air_quality_Nitrogen_dioxide",
    "air_quality_Sulphur_dioxide"
]

X_clean = X.drop(columns=pollutant_cols)

In [23]:
X = df.drop(columns=["air_quality_us-epa-index"])
y = df["air_quality_us-epa-index"]

In [24]:
print([c for c in X.columns if "air_quality" in c])

['air_quality_Carbon_Monoxide', 'air_quality_Ozone', 'air_quality_Nitrogen_dioxide', 'air_quality_Sulphur_dioxide', 'air_quality_PM2.5', 'air_quality_PM10']


Linear Regression

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numeric_cols = X.select_dtypes(include=['number']).columns

# Preprocessing
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
])

# Pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        solver="lbfgs"
    ))
])

# Train
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)

# Evaluation
from sklearn.metrics import classification_report, f1_score
print(classification_report(y_test, y_pred))
print("Macro F1-score:", f1_score(y_test, y_pred, average="macro"))


              precision    recall  f1-score   support

           1       1.00      0.98      0.99     12169
           2       0.97      0.97      0.97      7327
           3       0.91      0.99      0.95      1921
           4       0.99      0.98      0.99      1392
           5       0.92      0.99      0.96       219
           6       0.98      1.00      0.99        86

    accuracy                           0.98     23114
   macro avg       0.96      0.99      0.97     23114
weighted avg       0.98      0.98      0.98     23114

Macro F1-score: 0.9726070457048767


Random Forest Model

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numeric_cols = X.select_dtypes(include=['number']).columns

# Preprocessing: scale numeric, one-hot encode categorical
preprocessor = ColumnTransformer([
    ("num", "passthrough", numeric_cols),  # RF doesn't need scaling
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
])

# Pipeline
pipeline_rf = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        class_weight="balanced",
        n_jobs=-1
    ))
])

# Train
pipeline_rf.fit(X_train, y_train)

# Predict
y_pred_rf = pipeline_rf.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred_rf))
print("Macro F1-score:", f1_score(y_test, y_pred_rf, average="macro"))



              precision    recall  f1-score   support

           1       1.00      1.00      1.00     12169
           2       0.98      0.99      0.99      7327
           3       0.96      0.90      0.93      1921
           4       0.92      0.97      0.94      1392
           5       0.93      0.71      0.81       219
           6       0.99      0.90      0.94        86

    accuracy                           0.98     23114
   macro avg       0.96      0.91      0.93     23114
weighted avg       0.98      0.98      0.98     23114

Macro F1-score: 0.9343687813984696


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numeric_cols = X.select_dtypes(include=['number']).columns

# Preprocessing
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
])

# Pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(
        max_iter=1000,
        class_weight="balanced",
        solver="lbfgs"
    ))
])

# Train
pipeline.fit(X_train, y_train)

# Predict
y_pred = pipeline.predict(X_test)

# Evaluation
from sklearn.metrics import classification_report, f1_score
print(classification_report(y_test, y_pred))
print("Macro F1-score:", f1_score(y_test, y_pred, average="macro"))


              precision    recall  f1-score   support

           1       1.00      0.98      0.99     12169
           2       0.97      0.97      0.97      7327
           3       0.91      0.99      0.95      1921
           4       0.99      0.98      0.99      1392
           5       0.92      0.99      0.96       219
           6       0.98      1.00      0.99        86

    accuracy                           0.98     23114
   macro avg       0.96      0.99      0.97     23114
weighted avg       0.98      0.98      0.98     23114

Macro F1-score: 0.9726070457048767
