Импортируем необходимые библиотеки

In [230]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import RFE

Загружаем данные. Weather Type
Текстовое описание набора данных: https://www.kaggle.com/datasets/nikhil7280/weather-type-classification

In [231]:
# Load the weather-type classification dataset from a CSV file
# (relative path, resolved against the current working directory).
weather = pd.read_csv('weather_classification_data.csv')

Разведочный анализ данных. Узнаем, есть ли пустые значения, какие типы данных присутствуют и сбалансированы ли классы целевой переменной.

In [232]:
# Count the missing values in each column.
weather.isnull().sum()

Temperature             0
Humidity                0
Wind Speed              0
Precipitation (%)       0
Cloud Cover             0
Atmospheric Pressure    0
UV Index                0
Season                  0
Visibility (km)         0
Location                0
Weather Type            0
dtype: int64

In [233]:
# Print the column dtypes, non-null counts and memory usage of the frame.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13200 entries, 0 to 13199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Temperature           13200 non-null  float64
 1   Humidity              13200 non-null  int64  
 2   Wind Speed            13200 non-null  float64
 3   Precipitation (%)     13200 non-null  float64
 4   Cloud Cover           13200 non-null  object 
 5   Atmospheric Pressure  13200 non-null  float64
 6   UV Index              13200 non-null  int64  
 7   Season                13200 non-null  object 
 8   Visibility (km)       13200 non-null  float64
 9   Location              13200 non-null  object 
 10  Weather Type          13200 non-null  object 
dtypes: float64(5), int64(2), object(4)
memory usage: 1.1+ MB


In [234]:
# Number of rows per target class, used to check whether the classes are balanced.
class_counts = weather["Weather Type"].value_counts()
print(class_counts)

Weather Type
Rainy     3300
Cloudy    3300
Sunny     3300
Snowy     3300
Name: count, dtype: int64


Классы сбалансированы, поэтому применять методы балансировки классов нет необходимости.

С помощью OneHotEncoder представляем строковые данные в виде числовых. LabelEncoder здесь не подойдёт: он закодировал бы категории целыми числами и тем самым внёс бы в признаки искусственный порядок, которого в данных нет.
Преобразуем столбцы Cloud Cover, Season и Location.

In [235]:
# Single encoder instance that is re-fitted for each categorical column;
# sparse_output=False makes fit_transform return a dense array.
onehot_encoder = OneHotEncoder(sparse_output=False)

In [236]:
# One-hot encode 'Cloud Cover' and append the indicator columns to `weather`.
# The encoded frame reuses weather's index: join() aligns rows by index, so a
# freshly numbered frame would misalign if `weather` were ever filtered,
# shuffled or otherwise re-indexed before this cell.
encoded_Cloud_Cover = pd.DataFrame(
    onehot_encoder.fit_transform(weather[['Cloud Cover']]),
    columns=onehot_encoder.get_feature_names_out(),
    index=weather.index,
)

weather = weather.join(encoded_Cloud_Cover)

In [237]:
# One-hot encode 'Season' and append the indicator columns to `weather`.
# The encoded frame reuses weather's index: join() aligns rows by index, so a
# freshly numbered frame would misalign if `weather` were ever filtered,
# shuffled or otherwise re-indexed before this cell.
encoded_Season = pd.DataFrame(
    onehot_encoder.fit_transform(weather[['Season']]),
    columns=onehot_encoder.get_feature_names_out(),
    index=weather.index,
)

weather = weather.join(encoded_Season)

In [238]:
# One-hot encode 'Location' and append the indicator columns to `weather`.
# The encoded frame reuses weather's index: join() aligns rows by index, so a
# freshly numbered frame would misalign if `weather` were ever filtered,
# shuffled or otherwise re-indexed before this cell.
encoded_Location = pd.DataFrame(
    onehot_encoder.fit_transform(weather[['Location']]),
    columns=onehot_encoder.get_feature_names_out(),
    index=weather.index,
)

weather = weather.join(encoded_Location)

Столбцы со строковыми данными удаляем.
Выводим получившийся датафрейм.

In [239]:
# Remove the original string-valued columns in a single call; their one-hot
# indicator columns stay in the frame.
weather = weather.drop(columns=['Cloud Cover', 'Season', 'Location'])

# Display the resulting frame.
weather

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km),Weather Type,Cloud Cover_clear,Cloud Cover_cloudy,Cloud Cover_overcast,Cloud Cover_partly cloudy,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Location_coastal,Location_inland,Location_mountain
0,14.0,73,9.5,82.0,1010.82,2,3.5,Rainy,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,39.0,96,8.5,71.0,1011.43,7,10.0,Cloudy,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,30.0,64,7.0,16.0,1018.72,5,5.5,Sunny,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,38.0,83,1.5,82.0,1026.25,7,1.0,Sunny,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,27.0,74,17.0,66.0,990.67,1,2.5,Rainy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13195,10.0,74,14.5,71.0,1003.15,1,1.0,Rainy,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
13196,-1.0,76,3.5,23.0,1067.23,1,6.0,Snowy,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
13197,30.0,77,5.5,28.0,1012.69,3,9.0,Cloudy,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
13198,3.0,76,10.0,94.0,984.27,0,2.0,Snowy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


Создаём фрейм данных с признаками: убираем целевую переменную.
Выводим первые пять строк данных.

In [240]:
# Feature matrix: every column of `weather` except the target label.
X = weather.drop(columns=['Weather Type'])

# Preview the first five rows.
X.head(5)

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km),Cloud Cover_clear,Cloud Cover_cloudy,Cloud Cover_overcast,Cloud Cover_partly cloudy,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Location_coastal,Location_inland,Location_mountain
0,14.0,73,9.5,82.0,1010.82,2,3.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,39.0,96,8.5,71.0,1011.43,7,10.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,30.0,64,7.0,16.0,1018.72,5,5.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,38.0,83,1.5,82.0,1026.25,7,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,27.0,74,17.0,66.0,990.67,1,2.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


Выводим статистические характеристики исследуемого набора данных.

In [241]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
X.describe()

Unnamed: 0,Temperature,Humidity,Wind Speed,Precipitation (%),Atmospheric Pressure,UV Index,Visibility (km),Cloud Cover_clear,Cloud Cover_cloudy,Cloud Cover_overcast,Cloud Cover_partly cloudy,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Location_coastal,Location_inland,Location_mountain
count,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0,13200.0
mean,19.127576,68.710833,9.832197,53.644394,1005.827896,4.005758,5.462917,0.162045,0.031136,0.461364,0.345455,0.189394,0.196818,0.188788,0.425,0.27053,0.364848,0.364621
std,17.386327,20.194248,6.908704,31.946541,37.199589,3.8566,3.371499,0.368506,0.173693,0.498524,0.475534,0.391836,0.397609,0.391355,0.494362,0.444251,0.481406,0.481342
min,-25.0,20.0,0.0,0.0,800.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,57.0,5.0,19.0,994.8,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,21.0,70.0,9.0,58.0,1007.65,3.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,31.0,84.0,13.5,82.0,1016.7725,7.0,7.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
max,109.0,109.0,48.5,109.0,1199.21,14.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Создаём фрейм данных для целевой переменной Weather Type: кодируем классы целыми числами.
Выводим первые пять строк закодированной целевой переменной.

In [242]:
# Convert the target column to a pandas categorical and store its integer
# category codes in a separate column.
# NOTE(review): the codes follow the category order, presumably the sorted
# labels (Cloudy=0, Rainy=1, Snowy=2, Sunny=3) — confirm via
# weather['Weather Type'].cat.categories before interpreting the report.
weather['Weather Type'] = weather['Weather Type'].astype('category')
weather['Weather Type_encoded'] = weather['Weather Type'].cat.codes

In [243]:
# Target frame holding only the integer-encoded label column.
y = weather[['Weather Type_encoded']].copy()

# Preview the first five rows.
y.head(5)

Unnamed: 0,Weather Type_encoded
0,1
1,0
2,3
3,3
4,1


Отбор признаков. Метод рекурсивного исключения признаков (recursive feature elimination, RFE) реализует следующий алгоритм: модель обучается на исходном наборе признаков и оценивает их значимость, затем исключается один или несколько наименее значимых признаков, модель обучается на оставшихся признаках, и так далее, пока не останется заданное количество лучших признаков.

Создаем экземпляр класса LogisticRegression с параметром class_weight='balanced' (веса классов задаются обратно пропорционально их частотам).

In [244]:
# Recursive feature elimination with a logistic-regression estimator:
# keep 12 of the columns in X.
# NOTE(review): RFE ranks features by the magnitude of the fitted coefficients,
# and the numeric columns are not standardized here, which may bias the ranking
# toward the 0/1 indicator columns — consider scaling before selection.
# NOTE(review): the selector is fitted on the full dataset, before the
# train/test split, so the test rows influence which features are kept.
Logreg = LogisticRegression(class_weight='balanced', solver="newton-cg")
rfe = RFE(Logreg, n_features_to_select=12)
fit = rfe.fit(X, y.values.ravel())

print(f"Num Features: {fit.n_features_}")
# The first line prints support_ (the boolean keep/drop mask), the second
# prints ranking_ (rank 1 marks a selected feature).
print(f"Feature Ranking: {fit.support_}")
print(f"Feature Ranking: {fit.ranking_}")

# Index the columns with the boolean mask directly instead of looping over
# positions and comparing each flag to True.
FEATURE_NAMES = X.columns[fit.support_].tolist()
print(f"Selected Features: {FEATURE_NAMES}")

Num Features: 12
Feature Ranking: [False False False False False False  True  True  True  True  True  True
  True  True  True  True  True  True]
Feature Ranking: [4 6 3 5 7 2 1 1 1 1 1 1 1 1 1 1 1 1]
Selected Features: ['Visibility (km)', 'Cloud Cover_clear', 'Cloud Cover_cloudy', 'Cloud Cover_overcast', 'Cloud Cover_partly cloudy', 'Season_Autumn', 'Season_Spring', 'Season_Summer', 'Season_Winter', 'Location_coastal', 'Location_inland', 'Location_mountain']


Вывод датафрейма с отобранными признаками

In [245]:
# Restrict the feature matrix to the columns chosen by RFE.
X = X.loc[:, FEATURE_NAMES]

# Preview the first five rows.
X.head(5)

Unnamed: 0,Visibility (km),Cloud Cover_clear,Cloud Cover_cloudy,Cloud Cover_overcast,Cloud Cover_partly cloudy,Season_Autumn,Season_Spring,Season_Summer,Season_Winter,Location_coastal,Location_inland,Location_mountain
0,3.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,5.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,2.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


Разделяем набор данных на обучающую и тестовую выборки в отношении 90/10 с помощью scikit-learn.

In [246]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=20,
)

Обучаем модель на тренировочных данных и делаем предсказания на тестовых данных.
Получаем отчет о классификации. Также можно отдельно вывести метрику F1 с усреднением по классам, например средневзвешенную (weighted).

In [247]:
# Train on the training split (labels flattened to a 1-D array), then
# predict the classes of the held-out rows.
Logreg.fit(X_train, y_train.to_numpy().ravel())
y_pred = Logreg.predict(X_test)

In [248]:
# Weighted-average F1: per-class F1 scores averaged using each class's support as its weight.
f1 = f1_score(y_test, y_pred, average="weighted")

In [249]:
# Print the per-class precision/recall/F1 table, followed by the weighted F1 computed earlier.
print(f"classification_report: \n {classification_report(y_test, y_pred)}")
print(f"F1 score: {f1}")

classification_report: 
               precision    recall  f1-score   support

           0       0.64      0.72      0.67       315
           1       0.80      0.71      0.75       342
           2       0.74      0.93      0.83       336
           3       0.94      0.70      0.80       327

    accuracy                           0.76      1320
   macro avg       0.78      0.76      0.76      1320
weighted avg       0.78      0.76      0.76      1320

F1 score: 0.7648436160833486


Вывод по отчету: при 12 отобранных признаках из 18 модель показала приведённые выше результаты; доля верных ответов (accuracy) составляет 0.76.
Метрика F1 (гармоническое среднее точности (precision) и полноты (recall)), равная 0.76, указывает на относительно хорошее качество модели классификации. Стоит заметить, что если использовать все 18 признаков (без отбора), метрика F1 становится равной 0.87.