In [1]:
import pandas as pd

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any pandas / scikit-learn warnings
# raised by the cells below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw weather observations; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer='weather_classification_data.csv')
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Cloud Cover,Atmospheric Pressure,UV Index,Season,Visibility (km),Location,Weather Type
0,14.0,73,9.5,82.0,partly cloudy,1010.82,2,Winter,3.5,inland,Rainy
1,39.0,96,8.5,71.0,partly cloudy,1011.43,7,Spring,10.0,inland,Cloudy
2,30.0,64,7.0,16.0,clear,1018.72,5,Spring,5.5,mountain,Sunny
3,38.0,83,1.5,82.0,clear,1026.25,7,Spring,1.0,coastal,Sunny
4,27.0,74,17.0,66.0,overcast,990.67,1,Winter,2.5,mountain,Rainy


In [3]:
# Normalise the column labels: drop the unit suffixes first, then replace
# every space with an underscore.
# Both steps are single non-inplace renames (a callable mapper handles all
# columns at once), replacing the earlier loop that renamed the frame in place
# once per column.
data = (
    data
    .rename(columns={'Precipitation (%)': 'Precipitation', 'Visibility (km)': 'Visibility'})
    .rename(columns=lambda label: label.replace(' ', '_'))
)
data.columns

Index(['Temperature', 'Humidity', 'Wind_Speed', 'Precipitation', 'Cloud_Cover',
       'Atmospheric_Pressure', 'UV_Index', 'Season', 'Visibility', 'Location',
       'Weather_Type'],
      dtype='object')

In [45]:
from sklearn.model_selection import train_test_split

# The last column is the target; every column before it is a feature.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [53]:
# Display the training feature frame (the recorded output is truncated in the middle).
# NOTE(review): the recorded output already contains the one-hot Season_* / Location_*
# columns that are only created by later cells, so this cell appears to have been
# executed out of order — re-run the notebook top to bottom to refresh it.
X_train

Unnamed: 0,Temperature,Humidity,Wind_Speed,Precipitation,Cloud_Cover,Atmospheric_Pressure,UV_Index,Visibility,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Location_coastal,Location_inland,Location_mountain
13143,-7.0,70,1.5,72.0,overcast,989.30,1,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
5670,-2.0,60,13.5,75.0,overcast,997.81,1,4.5,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1113,1.0,95,4.0,97.0,overcast,987.61,0,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2602,20.0,59,10.0,16.0,partly cloudy,1015.12,10,9.5,0.0,0.0,1.0,0.0,0.0,0.0,1.0
10526,29.0,32,3.0,15.0,clear,1012.12,11,6.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7593,25.0,65,0.0,16.0,clear,1019.39,5,9.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
1996,12.0,89,6.5,86.0,overcast,992.27,2,1.5,0.0,0.0,1.0,0.0,1.0,0.0,0.0
9763,0.0,86,9.0,79.0,overcast,995.35,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
6507,41.0,41,4.5,10.0,clear,1013.37,6,9.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [67]:
from sklearn.ensemble import RandomForestClassifier

# Column labels of the training frame, in the order the forest sees them.
train_attributes = X_train.columns

# Fit a forest purely to rank the features.
# random_state is pinned (same seed as the train/test split) because the forest
# is randomised; without it the importance scores differ on every run.
# NOTE(review): this cell sits before the encoding cells in the file, yet it
# presumably needs the fully numeric X_train they produce — confirm cell order.
rf_for_features = RandomForestClassifier(n_estimators=100, random_state=42)
rf_for_features.fit(X_train, y_train)

# Map each feature name to its importance score.
importances = rf_for_features.feature_importances_
importances_dict = dict(zip(train_attributes, importances))


In [68]:
# Rank the (feature, score) pairs from the highest score to the lowest.
sorted_importances = sorted(
    importances_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Emit one "name: score" line per feature, best first.
for feature, importance in sorted_importances:
    print(f"{feature}: {importance}")

Temperature: 0.20339869058336912
Visibility: 0.1491530735006459
Precipitation: 0.13681133200177278
UV_Index: 0.12800234425140017
Atmospheric_Pressure: 0.12662632899897291
Cloud_Cover: 0.08533139520752475
Humidity: 0.06057728343120094
Wind_Speed: 0.04023164058508057
Season_Winter: 0.037416886541205306
Location_coastal: 0.011495844636029167
Season_Summer: 0.00499777066182871
Season_Spring: 0.0048725923763579815
Season_Autumn: 0.004837538450466666
Location_inland: 0.003353369737988751
Location_mountain: 0.002893909036156521


In [72]:
# Names of the features whose importance score falls below the 0.03 cut-off,
# kept in the ranking order of sorted_importances.
low_importance_features = [
    name
    for name, score in sorted_importances
    if score < 0.03
]
low_importance_features

['Location_coastal',
 'Season_Summer',
 'Season_Spring',
 'Season_Autumn',
 'Location_inland',
 'Location_mountain']

In [5]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Split the feature columns by dtype.
numeric_cols = X_train.select_dtypes(include=['number']).columns
# Preserve the frame's own column order. A set difference returns the names in
# an arbitrary order that can change between interpreter runs, which would in
# turn change the order of the one-hot columns built from this list.
categorical_cols = [col for col in X_train.columns if col not in numeric_cols]

# Fit the scaler on the training split only and standardise its numeric columns.
sc = StandardScaler()
X_train[numeric_cols] = sc.fit_transform(X_train[numeric_cols])

In [6]:
# Standardise the test split with the already-fitted scaler (transform only, no refit).
X_test[numeric_cols] = sc.transform(X_test[numeric_cols])

In [55]:
# Ordinal codes for the Cloud_Cover categories.
# NOTE(review): 'cloudy' is coded above 'overcast' — confirm this is the
# intended ordinal order.
cloud_cover_mapping = {
    'clear': 0,
    'partly cloudy': 1,
    'overcast': 2,
    'cloudy': 3
}

# Series.map turns every value missing from the mapping into NaN without any
# error (this also happens when the cell is re-run on already-encoded data),
# so validate before the columns are overwritten.
unmapped_cloud_cover = {
    value
    for frame in (X_train, X_test)
    for value in frame['Cloud_Cover'].dropna().unique()
    if value not in cloud_cover_mapping
}
if unmapped_cloud_cover:
    raise ValueError(
        f"Cloud_Cover values without an ordinal code: {sorted(unmapped_cloud_cover, key=str)}"
    )

# Replace the category labels with their ordinal codes in both splits.
X_train['Cloud_Cover'] = X_train['Cloud_Cover'].map(cloud_cover_mapping)
X_test['Cloud_Cover'] = X_test['Cloud_Cover'].map(cloud_cover_mapping)

In [8]:
# Cloud_Cover is encoded ordinally, so keep it out of the list of columns to
# one-hot encode. Filtering (rather than list.remove) keeps the cell re-runnable:
# remove() raises ValueError once the name is no longer in the list.
categorical_cols = [col for col in categorical_cols if col != 'Cloud_Cover']

In [48]:
# Dense-output one-hot encoder; handle_unknown='ignore' means categories that
# were not seen during fit do not raise at transform time.
oh_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the categories from the training split only, then encode both splits.
oh_encoder.fit(X_train[categorical_cols])
train_encoded = oh_encoder.transform(X_train[categorical_cols])
test_encoded = oh_encoder.transform(X_test[categorical_cols])

In [49]:
# Wrap the encoded arrays in DataFrames labelled with the encoder's generated
# output column names.
train_cat_encoded_df = pd.DataFrame(
    data=train_encoded,
    columns=oh_encoder.get_feature_names_out(categorical_cols),
)
test_cat_encoded_df = pd.DataFrame(
    data=test_encoded,
    columns=oh_encoder.get_feature_names_out(categorical_cols),
)

In [50]:
# Remove the raw categorical columns from the training frame in place.
# Not idempotent: running this cell a second time fails because the columns
# are already gone.
X_train.drop(columns=categorical_cols, inplace=True)

In [35]:
# Remove the raw categorical columns from the test frame in place (same
# re-run caveat as any in-place drop: the columns must still exist).
# NOTE(review): this cell's execution count is lower than its neighbours',
# so it was evidently run at a different point in the session — confirm with
# a clean top-to-bottom run.
X_test.drop(columns=categorical_cols, inplace=True)

In [51]:
# The encoded frames were built from bare arrays, so they carry a fresh
# 0..n-1 index. Copy over the row labels of the frames they were derived from
# so that a label-based join lines the rows up.
train_cat_encoded_df.index, test_cat_encoded_df.index = X_train.index, X_test.index

In [52]:
# Append the one-hot columns to the right of the remaining feature columns.
X_train = pd.concat(objs=[X_train, train_cat_encoded_df], axis='columns')
X_test = pd.concat(objs=[X_test, test_cat_encoded_df], axis='columns')

In [14]:
# Standardise the ordinally encoded Cloud_Cover column.
# A dedicated scaler is used so the statistics `sc` learned for numeric_cols
# are not overwritten by a refit on this single column. The scaler is fitted
# on the training split only and reused for the test split.
cloud_cover_scaler = StandardScaler()
# ravel() flattens the (n, 1) result so a plain 1-D column is assigned.
X_train['Cloud_Cover'] = cloud_cover_scaler.fit_transform(X_train[['Cloud_Cover']]).ravel()
X_test['Cloud_Cover'] = cloud_cover_scaler.transform(X_test[['Cloud_Cover']]).ravel()


In [34]:
# Sanity check: columns present in the test frame but missing from the
# training frame (an empty set means the two frames agree).
# NOTE(review): the recorded output lists 'Location' and 'Season', and this
# cell's execution count precedes the cell that drops the categorical columns
# from X_test — the output looks stale; re-run to confirm it is now empty.
set(X_test.columns) - set(X_train.columns)

{'Location', 'Season'}

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [28]:
# Powers of ten from 10**-4 up to 10**1, shared by C and gamma.
param_range = [10 ** exponent for exponent in range(-4, 2)]

# Two sub-grids: a polynomial kernel searched over C and degree, and an RBF
# kernel searched over C and gamma.
param_grid = [
    dict(C=param_range, degree=[3, 4, 5, 6], kernel=['poly']),
    dict(C=param_range, gamma=param_range, kernel=['rbf']),
]

In [29]:
# Exhaustive search over param_grid for an SVC, scored by micro-averaged F1
# with 5-fold cross-validation on all available cores. refit=True retrains the
# best configuration on the full training data once the search finishes.
gs = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    scoring='f1_micro',
    n_jobs=-1,
    refit=True,
)

In [30]:
# Run the grid search on the training split; fit() returns the fitted search
# object, which is bound back to `gs`.
gs = gs.fit(X_train, y_train)

In [31]:
# Report the best cross-validated score, the refitted estimator and its
# hyper-parameters, one per line.
print(gs.best_score_, gs.best_estimator_, gs.best_params_, sep='\n')

0.9119318181818181
SVC(C=10, gamma=0.1)
{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


In [40]:
# Predict labels for the held-out test split using the fitted grid search.
preds = gs.predict(X_test)

In [73]:
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score


# Held-out performance: overall accuracy plus macro-averaged precision,
# recall and F1, printed one metric per line.
print(
    f"accuracy value: {accuracy_score(y_test, preds)}",
    f"precision value: {precision_score(y_test, preds, average='macro')}",
    f"recall value: {recall_score(y_test, preds, average='macro')}",
    f"f1 value: {f1_score(y_test, preds, average='macro')}",
    sep='\n',
)


accuracy value: 0.9094696969696969
precision value: 0.910499075916531
recall value: 0.9094696969696969
f1 value: 0.909722244930058
