In [76]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

**Loading data**

In [77]:
# Read the combined weather observations; the path is relative to the
# notebook's working directory.
df = pd.read_csv("../data/weather_data_combined.csv")

# Column dtypes and non-null counts, followed by a preview of the first five rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16900 entries, 0 to 16899
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Timestamp            16900 non-null  int64  
 1   Temperature          16900 non-null  float64
 2   Feels Like           16900 non-null  float64
 3   Temp Min             16900 non-null  float64
 4   Temp Max             16900 non-null  float64
 5   Pressure             16900 non-null  int64  
 6   Humidity             16900 non-null  int64  
 7   Weather Description  16900 non-null  object 
 8   Wind Speed           16900 non-null  float64
 9   Wind Degree          16900 non-null  int64  
 10  City                 16900 non-null  object 
dtypes: float64(5), int64(4), object(2)
memory usage: 1.4+ MB


Unnamed: 0,Timestamp,Temperature,Feels Like,Temp Min,Temp Max,Pressure,Humidity,Weather Description,Wind Speed,Wind Degree,City
0,1699542000,285.12,284.25,280.22,288.73,1011,72,overcast clouds,5.36,360,United States-New York City
1,1699545600,285.33,284.43,281.12,289.25,1011,70,overcast clouds,2.57,0,United States-New York City
2,1699549200,285.77,284.89,282.1,289.28,1010,69,overcast clouds,4.02,41,United States-New York City
3,1699552800,286.35,285.55,284.12,288.8,1009,70,broken clouds,4.63,270,United States-New York City
4,1699556400,286.98,286.19,285.34,289.13,1009,68,broken clouds,2.57,260,United States-New York City


**Preprocessing data**

In [78]:
# 'Timestamp' is loaded as int64 Unix epoch seconds (see the df.info() / head
# output above), so it is left untouched here. The former
# `astype('int64') / 1e9` step only makes sense for nanosecond datetime64
# values (i.e. after pd.to_datetime); applied to raw epoch seconds it shrank
# the values to roughly 1.7 and shrank them again on every re-run of this cell.

# Encode the weather description strings as integer class ids. `lb` is kept so
# the ids can be mapped back to text via lb.classes_ / lb.inverse_transform.
# NOTE: this overwrites the column in place; re-running the cell on the already
# encoded column refits `lb` on the integer ids, so reload `df` first if the
# original text labels are needed again.
lb = LabelEncoder()
df['Weather Description'] = lb.fit_transform(df['Weather Description'])

In [79]:
# Separate the encoded target from the feature columns. The city name and the
# timestamp are excluded from the feature matrix along with the target itself.
y = df['Weather Description']
X = df.drop(columns=['Weather Description', 'City', 'Timestamp'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [80]:
# Random forest baseline: 100 trees, seeded so repeated runs give the same model.
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted class ids for the held-out rows.
y_pred = model.predict(X_test)

In [81]:
# Calculate metrics for the predictions in `y_pred`.
# Precision, recall and F1 are support-weighted averages over the classes.
# zero_division=0 is passed to all three so that a class with no predicted (or
# no true) samples contributes 0 without emitting an undefined-metric warning;
# previously only precision_score set it.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Confusion Matrix (rows: true class id, columns: predicted class id)
cm = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix: \n{cm}')

Accuracy: 0.586094674556213
Precision: 0.5522337305248808
Recall: 0.586094674556213
F1 Score: 0.5245935683789931
Confusion Matrix: 
[[  87  224    0    3    2    0    0    0    4    0    0    0  102   12
     0    0    0]
 [  23 1313    0   10    1    1    0    0    0    0    2    0   68    7
     0    0    0]
 [   0    0    0    0    0    0    0    0    1    0    0    0    0    0
     0    0    0]
 [  14  215    0   23    0    0    0    0    0    1    1    0   25   11
     0    0    0]
 [   0    5    0    0    3    0    0    0    0    0    3    0    3    0
     0    0    0]
 [   0   37    0    0    1    3    0    0    0    0    0    0    0    1
     0    0    0]
 [   0    1    0    0    0    0    2    0    1    0    0    1    5    0
     0    0    0]
 [   0    1    0    0    0    0    0    0    0    0    0    0    5    0
     0    0    0]
 [   0    5    0    0    0    0    0    0    4    1    4    1   45    0
     0    0    0]
 [   0    0    0    0    0    0    0    0    1    6    0  

In [82]:
# Logistic regression baseline using the liblinear solver.
# `multi_class` is intentionally not passed: 'auto' is already the default, and
# the parameter is deprecated in recent scikit-learn releases.
# NOTE(review): this rebinds `model`, replacing the random forest fitted above,
# and the features are not standardised before fitting — confirm both are intended.
model = LogisticRegression(solver='liblinear')

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print a classification report (per-class precision / recall / F1 / support)
print(classification_report(y_test, y_pred, zero_division=0))

# Calculate metrics.
# zero_division=0 is passed to precision, recall and F1 alike so that classes
# with no predicted (or no true) samples score 0 without a warning; previously
# only precision_score set it.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Print metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

# Confusion Matrix (rows: true class id, columns: predicted class id)
cm = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix: \n{cm}')

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       434
           1       0.52      0.91      0.67      1425
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00       290
           4       0.00      0.00      0.00        14
           5       0.00      0.00      0.00        42
           6       0.00      0.00      0.00        10
           8       0.00      0.00      0.00         6
           9       0.00      0.00      0.00        60
          10       0.00      0.00      0.00         8
          11       0.50      0.04      0.08        98
          12       0.00      0.00      0.00        27
          13       0.45      0.55      0.50       722
          14       0.00      0.00      0.00       229
          16       0.00      0.00      0.00         7
          17       0.00      0.00      0.00         5
          19       0.00      0.00      0.00         2

    accuracy              

In [89]:
# K-Nearest Neighbors (KNN) with k=5
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# Every class id that occurs in either the true or the predicted labels, sorted.
# The same array is used to build the confusion matrix and to label its rows
# and columns, so the two always have matching sizes. Labelling with
# np.unique(y_test) alone fails with a shape mismatch as soon as the model
# predicts a class that does not occur in the test split.
labels_knn = np.union1d(y_test, y_pred_knn)

# Calculate metrics for KNN (support-weighted averages over the classes).
# zero_division=0 is passed to precision, recall and F1 alike so undefined
# per-class scores count as 0 without a warning.
accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn, average='weighted', zero_division=0)
recall_knn = recall_score(y_test, y_pred_knn, average='weighted', zero_division=0)
f1_knn = f1_score(y_test, y_pred_knn, average='weighted', zero_division=0)
cm_knn = confusion_matrix(y_test, y_pred_knn, labels=labels_knn)

# Create DataFrame for KNN metrics (single row, one column per metric)
metrics_df_knn = pd.DataFrame({
    'Accuracy': [accuracy_knn],
    'Precision': [precision_knn],
    'Recall': [recall_knn],
    'F1 Score': [f1_knn]
})

# Print KNN metrics
print("Metrics (KNN):")
print(metrics_df_knn)

# Create DataFrame for KNN confusion matrix (rows: true class id, columns: predicted class id)
cm_df_knn = pd.DataFrame(cm_knn, index=labels_knn, columns=labels_knn)

# Print KNN confusion matrix
print("\nConfusion Matrix (KNN):")
print(cm_df_knn)

Metrics (KNN):
   Accuracy  Precision    Recall  F1 Score
0  0.501479   0.441872  0.501479  0.455627

Confusion Matrix (KNN):
     0     1   2   3   4   5   6   8   9   10  11  12   13  14  16  17  19
0   109   225   0   8   3   0   0   0   1   1   2   1   70  14   0   0   0
1   101  1176   0  31   4   0   0   0   1   0   4   1   85  22   0   0   0
2     0     0   0   0   0   0   0   0   1   0   0   0    0   0   0   0   0
3    43   181   0  25   0   0   0   0   0   0   0   0   31  10   0   0   0
4     2     6   0   0   2   0   0   0   0   0   2   1    1   0   0   0   0
5     1    36   0   1   1   0   0   0   0   0   0   0    3   0   0   0   0
6     0     1   0   0   0   0   2   0   0   0   1   1    5   0   0   0   0
8     1     1   0   0   0   0   0   0   1   0   0   0    3   0   0   0   0
9     4     8   0   1   0   0   1   0   5   0   7   1   32   1   0   0   0
10    1     0   0   0   0   0   0   0   0   4   0   0    3   0   0   0   0
11    6    14   0   2   1   0   1   0   1   2  29