In [27]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report


**Loading data**

In [28]:
# Read the combined weather CSV into `df`, print the column summary, and
# display the first five rows as the cell output.
csv_path = "../data/weather_data_combined.csv"
df = pd.read_csv(csv_path)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7300 entries, 0 to 7299
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Timestamp            7300 non-null   int64  
 1   Temperature          7300 non-null   float64
 2   Feels Like           7300 non-null   float64
 3   Temp Min             7300 non-null   float64
 4   Temp Max             7300 non-null   float64
 5   Pressure             7300 non-null   int64  
 6   Humidity             7300 non-null   int64  
 7   Weather Description  7300 non-null   object 
 8   Wind Speed           7300 non-null   float64
 9   Wind Degree          7300 non-null   int64  
 10  City                 7300 non-null   object 
dtypes: float64(5), int64(4), object(2)
memory usage: 627.5+ KB


Unnamed: 0,Timestamp,Temperature,Feels Like,Temp Min,Temp Max,Pressure,Humidity,Weather Description,Wind Speed,Wind Degree,City
0,1701882000,277.92,274.38,276.45,279.29,1015,61,broken clouds,4.63,330,United States-New York City
1,1701885600,277.97,274.09,276.62,279.12,1014,57,overcast clouds,5.36,344,United States-New York City
2,1701889200,277.78,273.3,276.35,278.83,1014,55,overcast clouds,6.69,330,United States-New York City
3,1701892800,277.3,272.5,275.83,278.4,1014,56,overcast clouds,7.2,290,United States-New York City
4,1701896400,276.86,271.4,275.51,277.71,1015,53,broken clouds,8.75,320,United States-New York City


**Preprocessing data**

In [29]:
# Preprocessing is guarded so that re-running this cell leaves `df` unchanged.
# Without the guards a second run would truncate the already-rescaled
# Timestamp to an integer and divide it again, and would refit the encoder on
# the integer codes, losing the city-name mapping held in `lb.classes_`.

# Timestamp stays numeric (it is not converted to datetime) and is divided by
# 1e9. After the first run the column is float, which is what the guard checks.
if not pd.api.types.is_float_dtype(df['Timestamp']):
    df['Timestamp'] = df['Timestamp'].astype('int64') / 1e9

# Replace the City strings with integer codes; `lb` keeps the fitted mapping so
# the codes can be translated back with `lb.inverse_transform`.
if not pd.api.types.is_numeric_dtype(df['City']):
    lb = LabelEncoder()
    df['City'] = lb.fit_transform(df['City'])

In [30]:
# Separate the label column from the remaining feature columns.
target_column = 'Weather Description'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [31]:
# Fit a 100-tree random forest (fixed seed) on the training split, then
# predict labels for the held-out rows.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestClassifier(**forest_params).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [32]:
# Score the current `y_pred` against the held-out labels.
# Precision, recall and F1 all use the 'weighted' multi-class average.
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)

# Confusion matrix of y_test against y_pred.
cm = confusion_matrix(y_test, y_pred)

# Report everything in one place, scalar scores first and the matrix last.
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')
print(f'Confusion Matrix: \n{cm}')

Accuracy: 0.6678082191780822
Precision: 0.6588381406529871
Recall: 0.6678082191780822
F1 Score: 0.6471458316925507
Confusion Matrix: 
[[133  51   4   0   0   0   0   1   0   0  44  11   0]
 [ 15 528  19   0   0   0   0   0   1   0  11   7   0]
 [ 18  67  63   0   0   0   0   0   0   0   5  10   0]
 [  0   1   0   6   0   0   0   0   3   0   0   0   0]
 [  0  12   0   0  10   0   0   0   1   0   2   0   0]
 [  0   0   0   1   0   0   0   0   1   0   0   0   0]
 [  1   3   0   0   0   0   2   0   0   0   7   0   0]
 [  0   1   0   0   0   0   0   5   1   0   0   0   0]
 [  4  12   0   2   0   1   0   0  17   1   4   1   0]
 [  1   0   0   0   0   0   0   0   1   1   3   0   0]
 [ 48  15   0   0   0   0   0   1   2   1 178   4   0]
 [ 29  36  10   0   0   0   0   0   0   0   9  27   0]
 [  0   1   0   0   0   0   0   0   1   0   0   0   5]]


In [33]:
# Logistic-regression baseline on the same train/test split.
# This rebinds `model` and `y_pred`, replacing the random-forest objects
# created in the earlier cell.
#
# `multi_class` is no longer passed: 'auto' is already the default, and the
# parameter is deprecated in recent scikit-learn releases.
# NOTE(review): the features are passed unscaled, and newer scikit-learn
# versions warn about using liblinear on multi-class targets. Consider adding
# a scaler and/or a different solver; confirm against the installed version.
model = LogisticRegression(solver='liblinear')

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Print a classification report.
# zero_division=0 reports 0.0 for classes that are never predicted (the same
# value as before) without emitting an undefined-metric warning per class.
print(classification_report(y_test, y_pred, zero_division=0))

                         precision    recall  f1-score   support

          broken clouds       1.00      0.00      0.01       244
              clear sky       0.46      0.89      0.61       581
             few clouds       0.00      0.00      0.00       163
                    fog       0.00      0.00      0.00        10
                   haze       0.00      0.00      0.00        25
light intensity drizzle       0.00      0.00      0.00         2
             light rain       0.00      0.00      0.00        13
             light snow       1.00      0.14      0.25         7
                   mist       0.37      0.24      0.29        42
          moderate rain       0.00      0.00      0.00         6
        overcast clouds       0.43      0.53      0.47       249
       scattered clouds       0.00      0.00      0.00       111
                   snow       0.50      0.14      0.22         7

               accuracy                           0.46      1460
              macro avg

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
