# 1) Laden der verarbeiteten Daten

In [None]:
import os

import kagglehub
import pandas as pd

In [None]:
# Download the latest version of the dataset
# link: https://www.kaggle.com/datasets/nikhil7280/weather-type-classification
path = kagglehub.dataset_download("nikhil7280/weather-type-classification")

# os.listdir() returns entries in arbitrary order, so select the CSV file
# explicitly instead of relying on whichever entry happens to come first.
csv_files = sorted(name for name in os.listdir(path) if name.lower().endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset directory: {path}")
complete_path = os.path.join(path, csv_files[0])

print("Path to dataset:", complete_path)

In [None]:
# Read the raw data file (csv file)
df = pd.read_csv(complete_path)

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
# Check whether the classes of the target variable are balanced
weather_types = df["Weather Type"]
for target_val in weather_types.unique():
    n_samples = int((weather_types == target_val).sum())
    print(f"{target_val} has {n_samples} samples")

### Die Daten sind schon gut vorverarbeitet. Allerdings sind einige Daten noch nicht numerisch

In [None]:
from sklearn.preprocessing import LabelEncoder

def encode_df(df, categorical_cols=("Cloud Cover", "Season", "Location", "Weather Type")):
    """Label-encode the categorical columns of a DataFrame.

    Args:
        df: Input DataFrame. It is not modified; the encoding is applied to
            a copy.
        categorical_cols: Names of the columns to encode. Columns that are
            not present in ``df`` are skipped.

    Returns:
        A tuple ``(encoded_df, label_encoders)``. ``encoded_df`` is a copy of
        ``df`` in which the listed columns are replaced by integer codes;
        ``label_encoders`` maps each encoded column name to its fitted
        ``LabelEncoder``.
    """
    # Work on a copy so the caller's raw data stays intact. Otherwise calling
    # the function again on the same object would fit the encoders on the
    # integer codes and lose the original class names.
    encoded = df.copy()

    # All fitted LabelEncoders are stored in this dict, one per column
    label_encoders = {}
    for col in categorical_cols:
        if col not in encoded.columns:
            continue
        le = LabelEncoder()
        encoded[col] = le.fit_transform(encoded[col])
        label_encoders[col] = le

    # Print which value has which numeric representation
    for col, encoder in label_encoders.items():
        print(f"\n{col} mapping:")
        for i, class_name in enumerate(encoder.classes_):
            print(f"  {class_name} -> {i}")

    return encoded, label_encoders

In [None]:
data, label_encoders = encode_df(df)

In [None]:
import pickle

# Persist the fitted label encoders to disk.
# open(..., 'wb') does not create missing directories, so create the target
# directory first.
encoders_path = '../../data/day_3/label_encoders.pkl'
os.makedirs(os.path.dirname(encoders_path), exist_ok=True)
with open(encoders_path, 'wb') as f:
    pickle.dump(label_encoders, f)

In [None]:
data.info()

### Jetzt sind die Daten bereit für das Machine Learning

# 2) Daten in Training und Test aufteilen

In [None]:
# Store the features, i.e. all columns except the target variable
X = data.drop(columns=["Weather Type"])
X.head()

In [None]:
# Store the target variable
y = data["Weather Type"]
y.head()

## 2.1) Statischer Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# train_test_split splits both the features and the target variable into training and test data
# (test_size=0.2 requests 20% of the samples for the test set; random_state fixes the shuffle)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [None]:
# Class distribution of the target variable in the training split
for target_val in y_train.unique():
    n_samples = int((y_train == target_val).sum())
    print(f"{target_val} has {n_samples} samples")

## 2.2) Stratified Split

In [None]:
# The stratify parameter ensures that the distribution of the target variable is the same in the training and test data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Class distribution of the target variable in the stratified training split
for target_val in y_train.unique():
    n_samples = int((y_train == target_val).sum())
    print(f"{target_val} has {n_samples} samples")

In [None]:
print("Größe von X_train:", X_train.shape)
print("Größe von X_test:", X_test.shape)

In [None]:
# Wrap the target values in DataFrames; they are written with to_parquet in the next cell
y_test = pd.DataFrame(y_test)
y_train = pd.DataFrame(y_train)

In [None]:
# Save the training and test data for later reproducibility and evaluation.
# The target directory is created first so the export also works on a fresh
# checkout where it does not exist yet.
os.makedirs("../../data/day_3", exist_ok=True)
X_train.to_parquet("../../data/day_3/X_train.parquet")
X_test.to_parquet("../../data/day_3/X_test.parquet")
y_train.to_parquet("../../data/day_3/y_train.parquet")
y_test.to_parquet("../../data/day_3/y_test.parquet")

# 3) Hyperparameter festlegen

In [None]:
# Modell auswählen
from sklearn.ensemble import RandomForestClassifier

# Dokumentation durchgehen und relevante Hyperparameter setzen
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html


In [None]:
from sklearn.neighbors import KNeighborsClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [None]:
from sklearn.svm import SVC
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

## 3.1) Manuelles Setzen der Hyperparameter

In [None]:
# Random forest with manually chosen hyperparameters;
# random_state is fixed so repeated runs build the same forest
rfc = RandomForestClassifier(
    n_estimators=75,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features="sqrt",
    random_state=42,
)

In [None]:
# k-nearest-neighbours classifier with manually chosen hyperparameters
# (weights="distance": closer neighbours get a larger vote; p=2: Euclidean distance)
knn = KNeighborsClassifier(
    n_neighbors=7,
    weights="distance",
    algorithm="auto",
    leaf_size=35,
    p=2,
)

In [None]:
# Support vector classifier with manually chosen hyperparameters.
# probability=True makes SVC fit its probability estimates with an internal,
# randomly shuffled cross-validation; random_state is fixed so the result is
# reproducible, matching the other estimators in this notebook.
svc = SVC(
    C=1.0,
    kernel="rbf",
    degree=3,
    gamma="scale",
    coef0=0.0,
    shrinking=True,
    probability=True,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape="ovr",
    break_ties=False,
    random_state=42,
)

## 3.2) Automatisiertes Tuning der Hyperparameter mit GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Base estimator for the grid search; only random_state is fixed here
rfc_grid=RandomForestClassifier(random_state=42)

# Candidate values per hyperparameter; GridSearchCV evaluates every combination
param_grid_rfc = { 
    'n_estimators': [75, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [5, 10, 15],
    'min_samples_leaf': [2, 3],
    'criterion' :['gini', 'entropy']
}

# 5-fold cross-validation over the grid; n_jobs=-1 uses all available CPU cores
grid_rfc = GridSearchCV(estimator=rfc_grid, param_grid=param_grid_rfc, cv=5, n_jobs=-1, verbose=2)

In [None]:
# Base estimator for the grid search, created with default settings
knn_grid=KNeighborsClassifier()

# Candidate values per hyperparameter; GridSearchCV evaluates every combination
# NOTE(review): per the scikit-learn docs, 'algorithm' and 'leaf_size' choose how
# neighbours are searched (speed/memory) - confirm whether they need to be part
# of the grid, since they multiply the number of fits by 12.
param_grid_knn = { 
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40],
    'p': [1, 2]
}

# 5-fold cross-validation over the grid; n_jobs=-1 uses all available CPU cores
grid_knn = GridSearchCV(estimator=knn_grid, param_grid=param_grid_knn, cv=5, n_jobs=-1, verbose=2)

In [None]:
# Base estimator for the grid search; random_state makes the probability
# estimates reproducible
svc_grid = SVC(probability=True, random_state=42)

# 'degree' is only used by the 'poly' kernel and 'coef0' only by 'poly' and
# 'sigmoid' (see the SVC documentation). One sub-grid per kernel therefore
# covers the same distinct models as the full cross product, without fitting
# candidates that differ only in a parameter the kernel ignores.
param_grid_svc = [
    {'C': [0.1, 2, 10], 'kernel': ['rbf']},
    {'C': [0.1, 2, 10], 'kernel': ['sigmoid'], 'coef0': [0.0, 0.5]},
]

# 5-fold cross-validation over the grid; n_jobs=-1 uses all available CPU cores
grid_svc = GridSearchCV(estimator=svc_grid, param_grid=param_grid_svc, cv=5, n_jobs=-1, verbose=2)

# 4) Das Modell trainieren

In [None]:
from time import time
import pickle

## 4.1) Ein einzelnes Modell trainieren

In [None]:
# Fit the random forest to the training data and measure the wall-clock training time.
# y_train was wrapped in a DataFrame above, so it is flattened to a 1-D array for fit().
start_time = time()
rfc.fit(X_train, y_train.values.ravel())
end_time = time()
print("Trainingszeit Random Forest:", end_time - start_time, "Sekunden")

In [None]:
print("Feature importances in percent:")
# Pair each feature name with its importance directly instead of indexing by
# position and rebuilding the list of importances on every iteration
{f"{name}": float(importance) * 100 for name, importance in zip(X.columns, rfc.feature_importances_)}

In [None]:
def save_model(model, filename, model_dir='../../models'):
    """Pickle a model into the model directory.

    Args:
        model: Object to serialize with pickle.
        filename: Name of the pickle file inside ``model_dir``.
        model_dir: Directory that collects all saved models. It is created
            if it does not exist yet.
    """
    # Create the folder for all models; open() would otherwise fail with
    # FileNotFoundError when the directory is missing
    os.makedirs(model_dir, exist_ok=True)

    # Save the model as a pickle file
    model_path = os.path.join(model_dir, filename)
    with open(model_path, 'wb') as file:
        pickle.dump(model, file)

    print(f"Modell gespeichert im Pfad:\n- {model_path}")

In [None]:
save_model(rfc, 'random_forest_model.pkl')

In [None]:
# Fit the KNN classifier to the training data and measure the wall-clock training time
start_time = time()
knn.fit(X_train, y_train.values.ravel())
end_time = time()
print("Trainingszeit KNN:", end_time - start_time, "Sekunden")

# Persist the fitted model via the save_model helper
save_model(knn, 'KNN_model.pkl')

In [None]:
# Fit the support vector classifier to the training data and measure the wall-clock training time
start_time = time()
svc.fit(X_train, y_train.values.ravel())
end_time = time()
print("Trainingszeit SVC:", end_time - start_time, "Sekunden")

# Persist the fitted model via the save_model helper
save_model(svc, 'SVC_model.pkl')

## 4.2) Ein Hyperparametertuning trainieren

In [None]:
print(X_train)

In [None]:
# Flatten the targets to a 1-D array once, before the timer starts.
# Going through pd.DataFrame keeps this cell re-runnable: after a previous
# execution y_train is already a NumPy array, which has no .values attribute.
y_train = pd.DataFrame(y_train).to_numpy().ravel()

# Run the grid search for the random forest and measure the wall-clock time
start_time = time()
grid_rfc.fit(X_train, y_train)
end_time = time()
print("Trainingszeit des Hyperparametertunings RFC:", end_time - start_time, "Sekunden")

In [None]:
print("Best parameters found: ", grid_rfc.best_params_)

In [None]:
# Estimator refitted by the grid search with the best parameter combination
best_rfc = grid_rfc.best_estimator_
print("Feature importances in percent:")
# Pair each feature name with its importance directly instead of indexing by
# position and rebuilding the list of importances on every iteration
{f"{name}": float(importance) * 100 for name, importance in zip(X.columns, best_rfc.feature_importances_)}

In [None]:
save_model(best_rfc, "tuned_rfc_model.pkl")

In [None]:
# Run the grid search for KNN and measure the wall-clock time.
# y_train is passed as-is; the RFC grid-search cell above rebinds it to a flattened array.
start_time = time()
grid_knn.fit(X_train, y_train)
end_time = time()
print("Trainingszeit des Hyperparametertunings KNN:", end_time - start_time, "Sekunden")

print("Best parameters found: ", grid_knn.best_params_)

# Estimator refitted by the grid search with the best parameter combination
best_knn = grid_knn.best_estimator_

save_model(best_knn, "tuned_knn_model.pkl")

In [None]:
# Run the grid search for the SVC and measure the wall-clock time.
# y_train is passed as-is; the RFC grid-search cell above rebinds it to a flattened array.
start_time = time()
grid_svc.fit(X_train, y_train)
end_time = time()
print("Trainingszeit des Hyperparametertunings SVC:", end_time - start_time, "Sekunden")

print("Best parameters found: ", grid_svc.best_params_)

# Estimator refitted by the grid search with the best parameter combination
best_svc = grid_svc.best_estimator_

save_model(best_svc, "tuned_svc_model.pkl")