# 1) Laden der verarbeiteten Daten

In [1]:
import os

import kagglehub
import pandas as pd

In [2]:
# Download latest version of dataset
# link: https://www.kaggle.com/datasets/nikhil7280/weather-type-classification
path = kagglehub.dataset_download("nikhil7280/weather-type-classification")

complete_path = path + "/" + os.listdir(path)[0]

print("Path to dataset:", complete_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/nikhil7280/weather-type-classification?dataset_version_number=1...


100%|██████████| 186k/186k [00:00<00:00, 562kB/s]

Extracting files...
Path to dataset: /root/.cache/kagglehub/datasets/nikhil7280/weather-type-classification/versions/1/weather_classification_data.csv





In [3]:
# Read the raw data file (csv file)
df = pd.read_csv(complete_path)

In [4]:
df.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [5]:
df.describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km)
count,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0
mean,19.127576,68.710833,9.832197,53.644394,1005.827896,4.005758,5.462917
std,17.386327,20.194248,6.908704,31.946541,37.199589,3.8566,3.371499
min,-25.0,20.0,0.0,0.0,800.12,0.0,0.0
25%,4.0,57.0,5.0,19.0,994.8,1.0,3.0
50%,21.0,70.0,9.0,58.0,1007.65,3.0,5.0
75%,31.0,84.0,13.5,82.0,1016.7725,7.0,7.5
max,109.0,109.0,48.5,109.0,1199.21,14.0,20.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


In [7]:
# Prüfen, ob die Daten der Zielvariable ausgewogen sind
for target_val in df["Weather Type"].unique():
    print(f"{target_val} has {len(df[df['Weather Type'] == target_val])} samples")

Rainy has 3300 samples
Cloudy has 3300 samples
Sunny has 3300 samples
Snowy has 3300 samples


### Die Daten sind schon gut vorverarbeitet. Allerdings sind einige Daten noch nicht numerisch

In [8]:
from sklearn.preprocessing import LabelEncoder

def encode_df(df, categorical_cols = ["Cloud Cover", "Season", "Location", "Weather Type"]):# Im dict werden alle LabelEncoder gespeichert
    label_encoders = {}

    for col in categorical_cols:
        # Für jede Spalte wird ein neuer LabelEncoder erstellt
        le = LabelEncoder()
        
        if col in df.columns:
            df[col] = le.fit_transform(df[col])
            label_encoders[col] = le

    # Ausgabe welcher Wert welche nummerische Repräsentation hat
    for col, encoder in label_encoders.items():
        print(f"\n{col} mapping:")
        for i, class_name in enumerate(encoder.classes_):
            print(f"  {class_name} -> {i}")

    return df, label_encoders

In [9]:
data, label_encoders = encode_df(df)


Cloud Cover mapping:
  clear -> 0
  cloudy -> 1
  overcast -> 2
  partly cloudy -> 3

Season mapping:
  Autumn -> 0
  Spring -> 1
  Summer -> 2
  Winter -> 3

Location mapping:
  coastal -> 0
  inland -> 1
  mountain -> 2

Weather Type mapping:
  Cloudy -> 0
  Rainy -> 1
  Snowy -> 2
  Sunny -> 3


In [10]:
import pickle
with open('../../data/day_3/label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  int64  
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  int64  
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  int64  
 10  Weather Type          13200 non-null  int64  
dtypes: float64(5), int64(6)
memory usage: 1.1 MB


### Jetzt sind die Daten bereit für das Machine Learning

# 2) Daten in Training und Test aufteilen

In [12]:
# Speichern der Features, ohne die Zielvariable
X = data.drop(columns=["Weather Type"])
X.head()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location
0,14.0,73,9.5,82.0,3,1010.82,2,3,3.5,1
1,39.0,96,8.5,71.0,3,1011.43,7,1,10.0,1
2,30.0,64,7.0,16.0,0,1018.72,5,1,5.5,2
3,38.0,83,1.5,82.0,0,1026.25,7,1,1.0,0
4,27.0,74,17.0,66.0,2,990.67,1,3,2.5,2


In [13]:
# Speichern der Zielvariable
y = data["Weather Type"]
y.head()

0    1
1    0
2    3
3    3
4    1
Name: Weather Type, dtype: int64

## 2.1) Statischer Split

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# train_test_split teilt sowohl die Features als auch die Zielvariable automatisch in Trainings- und Testdaten auf
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [16]:
for target_val in y_train.unique():
    print(f"{target_val} has {len(y_train[y_train == target_val])} samples")

1 has 2653 samples
3 has 2659 samples
0 has 2649 samples
2 has 2599 samples


## 2.2) Stratified Split

In [17]:
# Durch das Parameter stratify wird sichergestellt, dass die Verteilung der Zielvariable in den Trainings- und Testdaten gleich ist
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [18]:
for target_val in y_train.unique():
    print(f"{target_val} has {len(y_train[y_train == target_val])} samples")

2 has 2640 samples
3 has 2640 samples
1 has 2640 samples
0 has 2640 samples


In [19]:
print("Größe von X_train:", X_train.shape)
print("Größe von X_test:", X_test.shape)

Größe von X_train: (10560, 10)
Größe von X_test: (2640, 10)


In [20]:
y_test = pd.DataFrame(y_test)
y_train = pd.DataFrame(y_train)

In [21]:
# Speichern der Trainings- und Testdaten für spätere Reproduzierbarkeit und Evaluierung
X_train.to_parquet("../../data/day_3/X_train.parquet")
X_test.to_parquet("../../data/day_3/X_test.parquet")
y_train.to_parquet("../../data/day_3/y_train.parquet")
y_test.to_parquet("../../data/day_3/y_test.parquet")

# 3) Hyperparameter festlegen

In [22]:
# Modell auswählen
from sklearn.ensemble import RandomForestClassifier

# Dokumentation durchgehen und relevante Hyperparameter setzen
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html


In [23]:
from sklearn.neighbors import KNeighborsClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

In [24]:
from sklearn.svm import SVC
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

## 3.1) Manuelles Setzen der Hyperparameter

In [25]:
rfc = RandomForestClassifier(
    n_estimators=75,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features="sqrt",
    random_state=42,
)

In [26]:
knn = KNeighborsClassifier(
    n_neighbors=7,
    weights="distance",
    algorithm="auto",
    leaf_size=35,
    p=2,
)

In [27]:
svc = SVC(
    C=1.0,
    kernel="rbf",
    degree=3,
    gamma="scale",
    coef0=0.0,
    shrinking=True,
    probability=True,
    tol=0.001,
    cache_size=200,
    class_weight=None,
    verbose=False,
    max_iter=-1,
    decision_function_shape="ovr",
    break_ties=False,
)

## 3.2) Automatisiertes Tuning der Hyperparameter mit GridSearch

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
rfc_grid=RandomForestClassifier(random_state=42)

param_grid_rfc = { 
    'n_estimators': [75, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [5, 10, 15],
    'min_samples_leaf': [2, 3],
    'criterion' :['gini', 'entropy']
}

grid_rfc = GridSearchCV(estimator=rfc_grid, param_grid=param_grid_rfc, cv=5, n_jobs=-1, verbose=2)

In [30]:
knn_grid=KNeighborsClassifier()

param_grid_knn = { 
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [20, 30, 40],
    'p': [1, 2]
}

grid_knn = GridSearchCV(estimator=knn_grid, param_grid=param_grid_knn, cv=5, n_jobs=-1, verbose=2)

In [31]:
svc_grid=SVC(probability=True, random_state=42)

param_grid_svc = {
    'C': [0.1, 2, 10],
    'kernel': ['rbf', 'sigmoid'],
    'degree': [2, 3],
    'coef0': [0.0, 0.5]
}

grid_svc = GridSearchCV(estimator=svc_grid, param_grid=param_grid_svc, cv=5, n_jobs=-1, verbose=2)

# 4) Das Modell trainieren

In [32]:
from time import time
import pickle

## 4.1) Ein einzelnes Modell trainieren

In [33]:
# Den random forest auf die Trainingsdaten anpassen
start_time = time()
rfc.fit(X_train, y_train.values.ravel())
end_time = time()
print("Trainingszeit Random Forest:", end_time - start_time, "Sekunden")

Trainingszeit Random Forest: 2.7924845218658447 Sekunden


In [34]:
print("Feature importances in percent:")
{f"{X.columns[i]}":float(list(rfc.feature_importances_)[i]) * 100 for i in range(len(X.columns))}

Feature importances in percent:


{'Temperature': 20.675947680138076,
 'Humidity': 4.353038947973376,
 'Wind Speed': 2.7131267318697945,
 'Precipitation (%)': 15.120352581298976,
 'Cloud Cover': 9.97621379561131,
 'Atmospheric Pressure': 10.265036261496297,
 'UV Index': 14.133846956823536,
 'Season': 5.023913988276159,
 'Visibility (km)': 16.640508663657204,
 'Location': 1.0980143928552575}

In [35]:
def save_model(model, filename):
    # Ordner für alle Modelle erstellen
    model_dir = '../../models'

    # Speichern des Modells als pickle-Datei
    model_path = os.path.join(model_dir, filename)
    with open(model_path, 'wb') as file:
        pickle.dump(model, file)

    print(f"Modell gespeichert im Pfad:\n- {model_path}")

In [36]:
save_model(rfc, 'random_forest_model.pkl')

Modell gespeichert im Pfad:
- ../../models/random_forest_model.pkl


In [37]:
# Den random forest auf die Trainingsdaten anpassen
start_time = time()
knn.fit(X_train, y_train.values.ravel())
end_time = time()
print("Trainingszeit KNN:", end_time - start_time, "Sekunden")

save_model(knn, 'KNN_model.pkl')

Trainingszeit KNN: 0.04648900032043457 Sekunden
Modell gespeichert im Pfad:
- ../../models/KNN_model.pkl


In [38]:
# Den random forest auf die Trainingsdaten anpassen
start_time = time()
svc.fit(X_train, y_train.values.ravel())
end_time = time()
print("Trainingszeit SVC:", end_time - start_time, "Sekunden")

save_model(svc, 'SVC_model.pkl')

Trainingszeit SVC: 50.00624632835388 Sekunden
Modell gespeichert im Pfad:
- ../../models/SVC_model.pkl


# 4.2) Ein Hyperparametertuning trainieren

In [39]:
print(X_train)

       Temperature  Humidity  Wind Speed  Precipitation (%)  Cloud Cover  \
13143         -7.0        70         1.5               72.0            2   
5670          -2.0        60        13.5               75.0            2   
1113           1.0        95         4.0               97.0            2   
2602          20.0        59        10.0               16.0            3   
10526         29.0        32         3.0               15.0            0   
...            ...       ...         ...                ...          ...   
7593          25.0        65         0.0               16.0            0   
1996          12.0        89         6.5               86.0            2   
9763           0.0        86         9.0               79.0            2   
6507          41.0        41         4.5               10.0            0   
11046         77.0        73        15.5              104.0            0   

       Atmospheric Pressure  UV Index  Season  Visibility (km)  Location  
13143       

In [40]:
start_time = time()
y_train = y_train.values.ravel()
grid_rfc.fit(X_train, y_train)
end_time = time()
print("Trainingszeit des Hyperparametertunings RFC:", end_time - start_time, "Sekunden")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Trainingszeit des Hyperparametertunings RFC: 132.5537815093994 Sekunden


In [41]:
print("Best parameters found: ", grid_rfc.best_params_)

Best parameters found:  {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'n_estimators': 75}


In [42]:
best_rfc = grid_rfc.best_estimator_
print("Feature importances in percent:")
{f"{X.columns[i]}":float(list(best_rfc.feature_importances_)[i]) * 100 for i in range(len(X.columns))}

Feature importances in percent:


{'Temperature': 20.611755674922776,
 'Humidity': 4.401025222658636,
 'Wind Speed': 2.587499210027659,
 'Precipitation (%)': 14.71803455707028,
 'Cloud Cover': 10.185694471831384,
 'Atmospheric Pressure': 10.635225815658467,
 'UV Index': 13.888855767471142,
 'Season': 5.541720640256782,
 'Visibility (km)': 16.09591601272705,
 'Location': 1.3342726273758054}

In [43]:
save_model(best_rfc, "tuned_rfc_model.pkl")

Modell gespeichert im Pfad:
- ../../models/tuned_rfc_model.pkl


In [44]:
start_time = time()
grid_knn.fit(X_train, y_train)
end_time = time()
print("Trainingszeit des Hyperparametertunings KNN:", end_time - start_time, "Sekunden")

print("Best parameters found: ", grid_knn.best_params_)

best_knn = grid_knn.best_estimator_

save_model(best_knn, "tuned_knn_model.pkl")

Fitting 5 folds for each of 144 candidates, totalling 720 fits
Trainingszeit des Hyperparametertunings KNN: 63.55686974525452 Sekunden
Best parameters found:  {'algorithm': 'ball_tree', 'leaf_size': 20, 'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
Modell gespeichert im Pfad:
- ../../models/tuned_knn_model.pkl


In [None]:
start_time = time()
grid_svc.fit(X_train, y_train)
end_time = time()
print("Trainingszeit des Hyperparametertunings SVC:", end_time - start_time, "Sekunden")

print("Best parameters found: ", grid_svc.best_params_)

best_svc = grid_svc.best_estimator_

save_model(best_svc, "tuned_svc_model.pkl")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
