In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [36]:
# Load the processed weather observations; the path is relative to the notebook's working directory.
weather_csv_path = '../data/internal/processed/weather_processed.csv'
weather_df = pd.read_csv(weather_csv_path)

In [37]:
# Remove the "Country" and "Name" columns in one call; neither is used as a model feature below.
weather_df = weather_df.drop(columns=["Country", "Name"])

In [38]:
# List the distinct raw weather labels before they are regrouped for modelling.
pd.unique(weather_df["Weather"])

array(['Clear', 'Haze', 'Mist', 'Clouds', 'Snow', 'Thunderstorm', 'Rain',
       'Drizzle', 'Fog', 'Smoke'], dtype=object)

In [39]:
# Show the frame after "Country" and "Name" were dropped (the recorded output is a truncated preview).
weather_df

Unnamed: 0,Weather,Temp,Humidity,Visibility,Wind speed,Clouds
0,Clear,280.77,57,10000,3.60,0
1,Haze,298.99,83,6000,2.06,20
2,Mist,293.25,83,1500,0.00,0
3,Clouds,288.07,47,10000,5.16,27
4,Mist,300.18,83,2200,3.09,20
...,...,...,...,...,...,...
3497,Clouds,280.24,50,10000,2.68,19
3498,Clear,285.26,56,10000,4.12,0
3499,Clouds,297.75,66,10000,1.02,56
3500,Clear,297.39,70,10000,3.74,6


## 1. Normal prediction:

In [40]:
# Collapse every weather label other than 'Clear' or 'Clouds' into a single 'Poor weather' class.
# Selecting "everything that is not kept" instead of enumerating the labels to replace means a
# label that is not in the current CSV cannot slip through as an extra class.
# Missing values are left untouched, exactly as a value-based replace would leave them.
kept_labels = ['Clear', 'Clouds']
is_poor_weather = weather_df['Weather'].notna() & ~weather_df['Weather'].isin(kept_labels)
weather_df['Weather'] = weather_df['Weather'].mask(is_poor_weather, 'Poor weather')

# Select the model inputs and the regrouped target
features = weather_df[['Temp', 'Humidity', 'Visibility', 'Wind speed', 'Clouds']]
labels = weather_df['Weather']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=80)

# Train and score one Random Forest per forest size (300 and 600 trees).
# After the loop, rf_model refers to the last model fitted (600 trees).
for trees in (300, 600):
    rf_model = RandomForestClassifier(n_estimators=trees, random_state=44)
    rf_model.fit(X_train, y_train)

    # Predict on the held-out test split
    predictions = rf_model.predict(X_test)

    # Weighted-average precision/recall and plain accuracy
    precision = precision_score(y_test, predictions, average='weighted')
    accuracy = accuracy_score(y_test, predictions)
    recall = recall_score(y_test, predictions, average='weighted')

    # NOTE(review): the number printed here is the tree count (n_estimators), despite the 'samples' label.
    print(trees, 'samples')
    print(f'Precision: {precision * 100:.2f}%')
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Recall: {recall * 100:.2f}%')

300 samples
Precision: 92.20%
Accuracy: 92.58%
Recall: 92.58%
600 samples
Precision: 92.05%
Accuracy: 92.44%
Recall: 92.44%


In [41]:
# Quick sanity check on five hand-written observations.
# The model was fitted on a DataFrame, so the rows are passed as a DataFrame with the same
# column names; a bare list of lists makes scikit-learn warn about missing feature names
# and relies purely on positional column order.
# In top-to-bottom order, rf_model here is the last model fitted by the loop above (600 trees).
sample_observations = pd.DataFrame(
    [
        [293.16, 53, 10000, 2.29, 0],
        [271.08, 62, 10000, 1.91, 0],
        [279.33, 58, 10000, 0.52, 100],
        [280.77, 57, 10000, 3.60, 0],
        [269.27, 97, 141, 2.38, 100],
    ],
    columns=['Temp', 'Humidity', 'Visibility', 'Wind speed', 'Clouds'],
)
test = rf_model.predict(sample_observations)
test



array(['Clear', 'Clear', 'Clouds', 'Clear', 'Poor weather'], dtype=object)

## 2. Oversampling

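- Oversampling adds examples of the minority classes to the training dataset so that the classes become balanced. The simplest form randomly duplicates existing minority examples; the SMOTE technique used below instead synthesizes new minority examples by interpolating between a minority sample and its nearest minority-class neighbours.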

In [52]:
# Class imbalance: oversample the training split with SMOTE.

# Only the training data is resampled; the test split is left as it is.
smt = SMOTE(random_state=80)
X_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

# Fit one Random Forest per forest size on the resampled data and score it on the test split
for trees in (100, 200):
    rf_model = RandomForestClassifier(random_state=44, n_estimators=trees)
    rf_model.fit(X_train_sm, y_train_sm)

    # Predict on the held-out test split
    predictions = rf_model.predict(X_test)

    # Plain accuracy plus weighted-average precision/recall
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')

    # NOTE(review): the number printed here is the tree count (n_estimators), despite the 'samples' label.
    print(trees, 'samples')
    print(f'Precision: {precision * 100:.2f}%')
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Recall: {recall * 100:.2f}%')

100 samples
Precision: 91.55%
Accuracy: 91.16%
Recall: 91.16%
200 samples
Precision: 91.10%
Accuracy: 90.73%
Recall: 90.73%


In [43]:
# Quick sanity check on five hand-written observations.
# The rows are passed as a DataFrame carrying the training column names, so scikit-learn can
# match features by name instead of warning about missing feature names.
# In top-to-bottom order, rf_model here is the last model fitted on the SMOTE-resampled data (200 trees).
sample_observations = pd.DataFrame(
    [
        [293.16, 53, 10000, 2.29, 0],
        [271.08, 62, 10000, 1.91, 0],
        [279.33, 58, 10000, 0.52, 100],
        [280.77, 57, 10000, 3.60, 0],
        [269.27, 97, 141, 2.38, 100],
    ],
    columns=['Temp', 'Humidity', 'Visibility', 'Wind speed', 'Clouds'],
)
test = rf_model.predict(sample_observations)
test



array(['Clear', 'Clear', 'Clouds', 'Clear', 'Poor weather'], dtype=object)

## 3. Undersampling

- Undersampling involves randomly selecting examples from the majority class to delete from the training dataset.

In [53]:
# Class imbalance: randomly undersample the training split.

# Only the training data is resampled; the test split is left as it is.
# NOTE(review): the result reuses the X_train_sm / y_train_sm names from the SMOTE cell.
undersampling = RandomUnderSampler(random_state=80)
X_train_sm, y_train_sm = undersampling.fit_resample(X_train, y_train)

# Fit one Random Forest per forest size on the resampled data and score it on the test split
for trees in (100, 200):
    rf_model = RandomForestClassifier(random_state=44, n_estimators=trees)
    rf_model.fit(X_train_sm, y_train_sm)

    # Predict on the held-out test split
    predictions = rf_model.predict(X_test)

    # Plain accuracy plus weighted-average precision/recall
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')

    # NOTE(review): the number printed here is the tree count (n_estimators), despite the 'samples' label.
    print(trees, 'samples')
    print(f'Precision: {precision * 100:.2f}%')
    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Recall: {recall * 100:.2f}%')

100 samples
Precision: 91.40%
Accuracy: 87.87%
Recall: 87.87%
200 samples
Precision: 91.50%
Accuracy: 87.73%
Recall: 87.73%


In [45]:
# Quick sanity check on five hand-written observations.
# The rows are passed as a DataFrame carrying the training column names, so scikit-learn can
# match features by name instead of warning about missing feature names.
# In top-to-bottom order, rf_model here is the last model fitted on the undersampled data (200 trees).
sample_observations = pd.DataFrame(
    [
        [293.16, 53, 10000, 2.29, 0],
        [271.08, 62, 10000, 1.91, 0],
        [279.33, 58, 10000, 0.52, 100],
        [280.77, 57, 10000, 3.60, 0],
        [269.27, 97, 141, 2.38, 100],
    ],
    columns=['Temp', 'Humidity', 'Visibility', 'Wind speed', 'Clouds'],
)
test = rf_model.predict(sample_observations)
test



array(['Clear', 'Clear', 'Clouds', 'Clear', 'Poor weather'], dtype=object)

## 4. Grid Search Cross-Validation: 
- Grid search cross-validation tunes hyperparameters by exhaustively evaluating every combination in a specified grid with k-fold cross-validation and keeping the combination with the best cross-validated score.

In [50]:
# Hyperparameter tuning with an exhaustive grid search (cv=5, scored by accuracy).

# Search grid: only n_estimators varies; every other entry is pinned to a single value.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'class_weight': [None]
}

# Base estimator handed to the search, which is then fitted on the training split
rf_model = RandomForestClassifier(random_state=44)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search
best_rf_model = grid_search.best_estimator_

# Predict on the held-out test split
best_predictions = best_rf_model.predict(X_test)

# Plain accuracy plus weighted-average precision/recall
best_accuracy = accuracy_score(y_test, best_predictions)
best_precision = precision_score(y_test, best_predictions, average='weighted')
best_recall = recall_score(y_test, best_predictions, average='weighted')

print(f'Precision: {best_precision * 100:.2f}%')
print(f'Accuracy: {best_accuracy * 100:.2f}%')
print(f'Recall: {best_recall * 100:.2f}%')

Precision: 92.20%
Accuracy: 92.58%
Recall: 92.58%


In [47]:
# Quick sanity check on five hand-written observations.
# The rows are passed as a DataFrame carrying the training column names, so scikit-learn can
# match features by name instead of warning about missing feature names.
# In top-to-bottom order, best_rf_model here is the best estimator selected by the grid search above.
sample_observations = pd.DataFrame(
    [
        [293.16, 53, 10000, 2.29, 0],
        [271.08, 62, 10000, 1.91, 0],
        [279.33, 58, 10000, 0.52, 100],
        [280.77, 57, 10000, 3.60, 0],
        [269.27, 97, 141, 2.38, 100],
    ],
    columns=['Temp', 'Humidity', 'Visibility', 'Wind speed', 'Clouds'],
)
test = best_rf_model.predict(sample_observations)
test



array(['Clear', 'Clear', 'Clouds', 'Clear', 'Poor weather'], dtype=object)

## 5. Randomized Search Cross-Validation: 
-  This method works by randomly sampling hyperparameter combinations from a specified distribution and evaluating the model's performance with each set of hyperparameters.

In [51]:
# Hyperparameter tuning with a randomized search (cv=5, scored by accuracy).

# Candidate values per hyperparameter; every entry is a finite list
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'class_weight': [None]
}

# Because every entry is a list, the number of distinct combinations is the product of the
# list lengths. Asking for more iterations than that makes scikit-learn emit a warning and
# run only as many iterations as there are combinations, so n_iter is capped explicitly:
# the same candidates are evaluated, without the warning.
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))

# Base estimator handed to the search, which is then fitted on the training split
rf_model = RandomForestClassifier(random_state=44)
random_search = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist,
    n_iter=min(10, n_candidates),
    cv=5,
    scoring='accuracy',
    random_state=44,
)
random_search.fit(X_train, y_train)

# Keep the best estimator found by the search
best_rf_model = random_search.best_estimator_

# Predict on the held-out test split
best_predictions = best_rf_model.predict(X_test)

# Weighted-average precision/recall and plain accuracy
best_precision = precision_score(y_test, best_predictions, average='weighted')
best_accuracy = accuracy_score(y_test, best_predictions)
best_recall = recall_score(y_test, best_predictions, average='weighted')

print(f'Precision: {best_precision * 100:.2f}%')
print(f'Accuracy: {best_accuracy * 100:.2f}%')
print(f'Recall: {best_recall * 100:.2f}%')




Precision: 92.20%
Accuracy: 92.58%
Recall: 92.58%


In [49]:
# Quick sanity check on five hand-written observations.
# The rows are passed as a DataFrame carrying the training column names, so scikit-learn can
# match features by name instead of warning about missing feature names.
# In top-to-bottom order, best_rf_model here is the best estimator selected by the randomized search above.
sample_observations = pd.DataFrame(
    [
        [293.16, 53, 10000, 2.29, 0],
        [271.08, 62, 10000, 1.91, 0],
        [279.33, 58, 10000, 0.52, 100],
        [280.77, 57, 10000, 3.60, 0],
        [269.27, 97, 141, 2.38, 100],
    ],
    columns=['Temp', 'Humidity', 'Visibility', 'Wind speed', 'Clouds'],
)
test = best_rf_model.predict(sample_observations)
test



array(['Clear', 'Clear', 'Clouds', 'Clear', 'Poor weather'], dtype=object)