In [10]:
from google.colab import files
uploaded = files.upload()  # Upload file yagi_storm.csv

Saving yagi_storm.csv to yagi_storm.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

In [12]:
# 1. Load dữ liệu
df = pd.read_csv('yagi_storm.csv')
print(f"Dataset shape: {df.shape}")
print(df.columns.tolist())  # Xem tất cả tên cột
print(df.head())

Dataset shape: (144, 24)
['name', 'datetime', 'temp', 'feelslike', 'dew', 'humidity', 'precip', 'precipprob', 'preciptype', 'snow', 'snowdepth', 'windgust', 'windspeed', 'winddir', 'sealevelpressure', 'cloudcover', 'visibility', 'solarradiation', 'solarenergy', 'uvindex', 'severerisk', 'conditions', 'icon', 'stations']
         name             datetime  temp  feelslike   dew  humidity  precip  \
0  quang ninh  2024-09-05T00:00:00  30.1       39.0  27.1     84.01     0.0   
1  quang ninh  2024-09-05T01:00:00  29.1       37.2  27.2     89.41     0.0   
2  quang ninh  2024-09-05T02:00:00  29.1       37.0  27.1     89.00     0.0   
3  quang ninh  2024-09-05T03:00:00  29.1       37.0  27.1     89.00     0.0   
4  quang ninh  2024-09-05T04:00:00  28.8       36.7  27.3     91.69     0.0   

   precipprob preciptype  snow  ...  sealevelpressure  cloudcover  visibility  \
0           0        NaN     0  ...            1002.0        99.8        10.0   
1           0        NaN     0  ...       

# 2. Feature Engineering
# Tạo label: 1 = Nguy hiểm (gió > 60km/h), 0 = An toàn
# Lưu ý: Cột gió là 'windspeed' không phải 'wind_kph'

In [13]:
df['is_dangerous'] = (df['windspeed'] > 60).astype(int)

# Features để predict (dùng đúng tên cột từ CSV)
features = ['temp', 'sealevelpressure', 'humidity', 'cloudcover', 'precip', 'windgust']
X = df[features].fillna(0)
y = df['is_dangerous']
print(y)
print(f"\nDangerous records: {y.sum()} / {len(y)}")

0      0
1      0
2      0
3      0
4      0
      ..
139    0
140    0
141    0
142    0
143    0
Name: is_dangerous, Length: 144, dtype: int64

Dangerous records: 5 / 144


# 3. Train/Test split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Train Classification Model (Nguy hiểm hay không?)

In [15]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 5. Evaluate

In [16]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.2%}")


Model Accuracy: 93.10%


# 6. Feature Importance

In [17]:
importance = pd.DataFrame({
    'feature': features,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(importance)


Feature Importance:
            feature  importance
1  sealevelpressure    0.578506
5          windgust    0.205613
2          humidity    0.091492
0              temp    0.076064
3        cloudcover    0.042242
4            precip    0.006084


# 7. Save Model

In [None]:
joblib.dump(clf, 'storm_classifier.pkl')
print("\n✅ Model saved to storm_classifier.pkl")


✅ Model saved to storm_classifier.pkl


# 8. Download model

In [None]:
files.download('storm_classifier.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>