In [None]:
pip install catboost xgboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 99.2/99.2 MB 7.9 MB/s eta 0:00:00
Installing collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import re
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.datasets import fetch_openml
mpl.rcParams['figure.figsize'] = [16,9]
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import pickle
import os
import gc

In [None]:
# Expose Google Drive inside the Colab VM; the dataset is read from this mount.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the weather CSV from the mounted Drive into `df`.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/weather_2.csv')

In [None]:
# Convert the `date` column to pandas datetimes so the `.dt` accessor can be used.
df['date'] = df['date'].pipe(pd.to_datetime)

In [None]:
# Remove timezone information from `date`, leaving naive timestamps.
df['date'] = df['date'].dt.tz_localize(tz=None)

In [None]:
# Distinct city names, in order of first appearance in `df`.
unique_cities = pd.unique(df['city'])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from google.colab import drive

# Train one grid-searched RandomForest classifier per (target, city) pair using
# only the rows of `df` whose timestamp hour is 0. For every pair this cell
# writes to Google Drive: the target label encoder, one encoder per object-typed
# feature column, the fitted scaler, the best model, the feature-name list, a
# feature-importance plot and a confusion-matrix plot. One metrics CSV is
# written per target.

# Mount Google Drive (the recorded output shows a repeat call on an already
# mounted drive only prints a notice)
drive.mount('/content/drive')

# Set figure size
plt.rcParams['figure.figsize'] = [10, 8]

# Create directories in Google Drive
base_path = '/content/drive/MyDrive/weather_models'
os.makedirs(f"{base_path}/saved_models_clasf", exist_ok=True)
os.makedirs(f"{base_path}/plots", exist_ok=True)

# Define target and exclude columns
target_columns = ['wind_direction_cat', 'rain_cat', 'snowfall_cat', 'cloud_cover_cat']
exclude_cols = ['city', 'latitude', 'longitude', 'date', 'wind_direction', 'rain', 'snowfall', 'cloud_cover']
# NOTE(review): the `_cat` columns other than the current target are not in
# `exclude_cols`, so they stay in X as features -- confirm this is intended.

# Ensure date column is datetime
df['date'] = pd.to_datetime(df['date'])

# Get unique cities
unique_cities = df['city'].unique()

# Hyperparameter grid; identical for every city/target, so built once
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

for target in target_columns:
    print(f"\n=== TRAINING MODELS TO CLASSIFY: {target.upper()} ===")
    metrics_list = []
    models_saved = 0  # counts only the cities whose model trained successfully
    for city in unique_cities:
        print(f"\nProcessing city: {city}")
        city_slug = city.replace(' ', '_')
        model_prefix = f"{base_path}/saved_models_clasf/{city_slug}"

        # Filter for 00:00 hours
        city_df = df[(df['city'] == city) & (df['date'].dt.hour == 0)].copy()

        # Debug: Check data size and unique classes
        print(f"Rows for {city} ({target}): {len(city_df)}")
        print(f"Unique {target} values: {city_df[target].unique()}")

        city_df = city_df.drop(columns=[col for col in exclude_cols if col in city_df.columns])
        city_df = city_df.dropna(subset=[target])

        # Without any labelled rows the scaler and the split below would raise
        # outside the try/except and abort the whole run, so skip explicitly.
        if city_df.empty:
            print(f"Skipping {city} ({target}): no rows with a non-null target at 00:00")
            continue

        # A fresh encoder per (city, target). It is used for the target only,
        # so `le.classes_` still describes the target when the confusion matrix
        # is drawn further down.
        le = LabelEncoder()
        y = le.fit_transform(city_df[target])
        print(f"Encoded classes for {city} ({target}): {le.classes_}")
        with open(f"{model_prefix}_{target}_label_encoder.pkl", "wb") as f:
            pickle.dump(le, f)

        X = city_df.drop(columns=[target])
        feature_names = list(X.columns)

        # Each object-typed feature gets its own encoder instance.
        # NOTE(review): the file name does not include `target`, so the encoder
        # saved for a column is overwritten on each pass of the outer loop.
        for col in X.select_dtypes(include=['object']).columns:
            feature_le = LabelEncoder()
            X[col] = feature_le.fit_transform(X[col])
            with open(f"{model_prefix}_{col}_feature_encoder.pkl", "wb") as f:
                pickle.dump(feature_le, f)

        # Split first, then fit the scaler on the training rows only, so that
        # test-set statistics do not leak into preprocessing.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        with open(f"{model_prefix}_{target}_scaler.pkl", "wb") as f:
            pickle.dump(scaler, f)

        model = RandomForestClassifier(random_state=42)
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1_weighted', n_jobs=1)  # n_jobs=1 for memory
        try:
            grid_search.fit(X_train, y_train)
        except Exception as e:
            print(f"Error training model for {city} ({target}): {e}")
            continue

        best_model = grid_search.best_estimator_

        model_filename = f"{model_prefix}_{target}_model.pkl"
        with open(model_filename, "wb") as f:
            pickle.dump(best_model, f)

        with open(model_filename.replace("_model.pkl", "_features.pkl"), "wb") as f:
            pickle.dump(feature_names, f)

        y_pred = best_model.predict(X_test)

        metrics = {
            'city': city,
            'accuracy': accuracy_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
            'f1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
            'best_params': grid_search.best_params_
        }
        metrics_list.append(metrics)

        # Feature importance plot
        feature_importance = pd.DataFrame({
            'feature': feature_names,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)

        fig, ax = plt.subplots(figsize=(12, 6))
        sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
        ax.set_title(f'Feature Importance - {city} ({target})')
        fig.tight_layout()
        fig.savefig(f"{base_path}/plots/feature_importance_{city_slug}_{target}.png")
        plt.close(fig)

        # Confusion matrix: colours show row-normalised rates, annotations show raw counts
        class_ids = np.arange(len(le.classes_))
        cm = confusion_matrix(y_test, y_pred, labels=class_ids)
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class missing from y_test has a zero row total; leave that row at 0
        # instead of dividing by zero and producing NaN cells.
        cm_normalized = np.divide(
            cm, row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm_normalized, annot=cm, fmt='d', cmap='Blues',
                    xticklabels=le.classes_, yticklabels=le.classes_, ax=ax)
        ax.set_title(f'Confusion Matrix - {city} ({target})')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        fig.tight_layout()
        fig.savefig(f"{base_path}/plots/confusion_matrix_{city_slug}_{target}.png")
        plt.close(fig)

        models_saved += 1
        print(f"{models_saved}th model saved for {city} ({target})")
        print(f"Best parameters: {grid_search.best_params_}")

        # Clean up
        del best_model, y_pred, city_df, X, y, X_train, X_test, y_train, y_test
        gc.collect()

    metrics_df = pd.DataFrame(metrics_list)
    metrics_df.to_csv(f"{base_path}/plots/metrics_{target}.csv", index=False)
    print(f"Saved metrics for {target} to {base_path}/plots/metrics_{target}.csv")

Streaming output truncated to the last 5000 lines.
Processing city: Buenos Aires
Rows for Buenos Aires (wind_direction_cat): 1099
Unique wind_direction_cat values: ['SE' 'N' 'NW' 'NE' 'SW' 'E' 'S' 'W']
Encoded classes for Buenos Aires (wind_direction_cat): ['E' 'N' 'NE' 'NW' 'S' 'SE' 'SW' 'W']
40th model saved for Buenos Aires (wind_direction_cat)
Best parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Processing city: Cairo
Rows for Cairo (wind_direction_cat): 1099
Unique wind_direction_cat values: ['NE' 'NW' 'N' 'SE' 'SW' 'S' 'E' 'W']
Encoded classes for Cairo (wind_direction_cat): ['E' 'N' 'NE' 'NW' 'S' 'SE' 'SW' 'W']
41th model saved for Cairo (wind_direction_cat)
Best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Processing city: Canberra
Rows for Canberra (wind_direction_cat): 1099
Unique wind_direction_cat values: ['SE' 'W' 'N' 'S' 'NW' 'SW' 'NE' 'E']
Encoded classe