In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import joblib

# Load the cleaned dataset
df = pd.read_csv("data/clean_disaster_text_dataset.csv")

# Check class distribution
print("Label distribution:", df["label"].value_counts())

# Remove stratify parameter since some classes have too few samples
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
    # Removed stratify=df["label"] to fix the error
)

# Vectorize text
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train model
model = LogisticRegression(max_iter=1000, multi_class='auto')
model.fit(X_train_vec, y_train)

# Evaluate
y_pred = model.predict(X_test_vec)
print("‚úÖ Model trained successfully!")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save model + vectorizer
joblib.dump(model, "models/disaster_classifier5.pkl")
joblib.dump(vectorizer, "models/disaster_vectorizer3.pkl")

print("\nüíæ Model and vectorizer saved!")

Label distribution: label
Flood                               4033
Storm                               2563
Road                                1917
Water                                977
Epidemic                             877
Earthquake                           641
Mass movement (wet)                  458
Extreme temperature                  438
Explosion (Industrial)               432
Air                                  426
Fire (Miscellaneous)                 415
Drought                              399
Wildfire                             304
Rail                                 241
Miscellaneous accident (General)     183
Explosion (Miscellaneous)            159
Collapse (Miscellaneous)             149
Collapse (Industrial)                146
Volcanic activity                    121
Fire (Industrial)                    108
Industrial accident (General)        100
Gas leak                              34
Infestation                           29
Chemical spill                 



‚úÖ Model trained successfully!
Accuracy: 0.9967148488830486

Classification Report:
                                   precision    recall  f1-score   support

                             Air       1.00      1.00      1.00        85
                  Chemical spill       1.00      1.00      1.00         4
           Collapse (Industrial)       1.00      1.00      1.00        22
        Collapse (Miscellaneous)       1.00      1.00      1.00        32
                         Drought       1.00      0.98      0.99        88
                      Earthquake       1.00      1.00      1.00       127
                        Epidemic       1.00      0.99      1.00       196
          Explosion (Industrial)       1.00      1.00      1.00        63
       Explosion (Miscellaneous)       1.00      1.00      1.00        32
             Extreme temperature       1.00      0.99      1.00       102
               Fire (Industrial)       1.00      1.00      1.00        21
            Fire (Miscell

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("public_emdat_project.csv", encoding='latin1')

# Select relevant columns
text_columns = ["Country", "Location", "Disaster Group", "Disaster Subgroup", 
                "Disaster Type", "Disaster Subtype", "Event Name", 
                "Origin", "Region", "Subregion"]
label_column = "Disaster Type"

# Combine text fields into a single column
df["text"] = df[text_columns].astype(str).agg(" ".join, axis=1)

# Drop missing labels and duplicates
df = df.dropna(subset=[label_column])
df = df.drop_duplicates(subset=["text"])

# Keep only text + label columns
df_final = df[["text", label_column]].rename(columns={label_column: "label"})

print("‚úÖ Clean dataset ready!")
print("Shape:", df_final.shape)
print("\nSample:\n", df_final.head())

# Optional: Save for later use
df_final.to_csv("data/clean_disaster_text_dataset.csv", index=False)


‚úÖ Clean dataset ready!
Shape: (15219, 2)

Sample:
                                                 text    label
0  Djibouti Ali Sabieh, Dikhil, Djibouti, Obock, ...  Drought
1  Sudan Northern Darfur, Northern Kordofan, Red ...  Drought
2  Somalia Ceel Barde, Rab Dhuure, Tayeeglow, Xud...  Drought
3  Angola Calulo Technological Transport Road Roa...     Road
4  Angola Dombre Grande village (Baia Farta distr...    Flood


In [9]:
import pandas as pd
import os
import re
import joblib
import subprocess

# === 0Ô∏è‚É£ Run cleaning.py to generate cleaned dataset ===
cleaning_script_path = os.path.join("model_build", "cleaning.py")
print("üöÄ Running cleaning.py to generate cleaned dataset...\n")
try:
    subprocess.run(["python", cleaning_script_path], check=True)
    print("‚úÖ cleaning.py executed successfully!\n")
except subprocess.CalledProcessError as e:
    raise RuntimeError(f"‚ùå Error while running cleaning.py: {e}")

# === 1Ô∏è‚É£ Load cleaned dataset ===
cleaned_file_path = os.path.join("data", "cleaned_dataset.csv")
if not os.path.exists(cleaned_file_path):
    raise FileNotFoundError(f"‚ùå Cleaned dataset not found at {cleaned_file_path}")

df = pd.read_csv(cleaned_file_path)
print("‚úÖ Cleaned dataset loaded successfully!")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist(), "\n")

# === 2Ô∏è‚É£ Load pre-trained multi-class model ===
model_file = os.path.join("models", "disaster_classifier5.pkl")
vectorizer_file = os.path.join("models", "disaster_vectorizer3.pkl")

try:
    model = joblib.load(model_file)
    vectorizer = joblib.load(vectorizer_file)
    print("‚úÖ Multi-class model and vectorizer loaded successfully!\n")
except:
    raise FileNotFoundError("‚ùå New multi-class model/vectorizer not found!")

# === 3Ô∏è‚É£ Optional keyword list (for hybrid boost) ===
DISASTER_KEYWORDS = [
    "earthquake", "flood", "cyclone", "hurricane", "storm", "tornado", "tsunami",
    "landslide", "disaster", "wildfire", "volcano", "eruption", "drought",
    "avalanche", "rescue", "collapsed", "aftershock", "damaged", "mudslide", "explosion"
]

# === 4Ô∏è‚É£ Prediction function ===
def classify_disaster(text):
    cleaned = re.sub(r'\s+', ' ', str(text).strip().lower())
    tfidf_input = vectorizer.transform([cleaned])
    pred_label = model.predict(tfidf_input)[0]
    probs = model.predict_proba(tfidf_input)[0]
    confidence = round(float(max(probs)), 2)

    keyword_flag = any(kw in cleaned for kw in DISASTER_KEYWORDS)
    reason = "keyword assist" if keyword_flag else "model"

    return pred_label, confidence, reason

# === 5Ô∏è‚É£ Apply classification safely ===
final_results = []
for _, row in df.iterrows():
    # pick the right column (clean_text or text)
    text_data = str(row.get("clean_text") or row.get("text") or "").strip()
    if not text_data or text_data.lower() == "nan":
        continue  # skip empty rows

    pred, conf, reason = classify_disaster(text_data)
    final_results.append({
    "timestamp": row.get("timestamp", ""),
    "text": row["clean_text"],
    "location": row.get("location", "Unknown"),
    "predicted_disaster_type": pred,
    "confidence": conf,
    "reason": reason
})


final_df = pd.DataFrame(final_results)
final_df = final_df.sort_values(by="timestamp", ascending=False)

# === 6Ô∏è‚É£ Save final CSV ===
final_csv_path = os.path.join("data", "final_disaster_data.csv")
final_df.to_csv(final_csv_path, index=False)
print(f"üíæ Final multi-class disaster data saved to {final_csv_path}")
print("üéØ Model processing completed successfully!")

# Optional sanity check
print("\nSample of final predictions:\n")
print(final_df.head())


üöÄ Running cleaning.py to generate cleaned dataset...

‚úÖ cleaning.py executed successfully!

‚úÖ Cleaned dataset loaded successfully!
Shape: (272, 4)
Columns: ['source', 'timestamp', 'clean_text', 'location'] 

‚úÖ Multi-class model and vectorizer loaded successfully!

üíæ Final multi-class disaster data saved to data\final_disaster_data.csv
üéØ Model processing completed successfully!

Sample of final predictions:

                   timestamp  \
0  2025-11-09 08:04:06+00:00   
1  2025-11-09 07:50:19+00:00   
2  2025-11-09 07:44:21+00:00   
3  2025-11-09 07:35:00+00:00   
4  2025-11-09 07:30:17+00:00   

                                                text  \
0  philippines braces for super typhoon with 9000...   
1  2025 sks weekly climate change global warming ...   
2  nearly 1 million filipinos evacuate as super t...   
3  are we entering a golden age of nuclear power ...   
4  dani rodrik the world needs a new economic pla...   

                                      locati

In [11]:
import pandas as pd

df = pd.read_csv("data/final_disaster_data.csv")

# 1Ô∏è‚É£ Check top 5 rows
print("Sample of final predictions:\n")
print(df.head())




Sample of final predictions:

                   timestamp  \
0  2025-11-09 08:04:06+00:00   
1  2025-11-09 07:50:19+00:00   
2  2025-11-09 07:44:21+00:00   
3  2025-11-09 07:35:00+00:00   
4  2025-11-09 07:30:17+00:00   

                                                text  \
0  philippines braces for super typhoon with 9000...   
1  2025 sks weekly climate change global warming ...   
2  nearly 1 million filipinos evacuate as super t...   
3  are we entering a golden age of nuclear power ...   
4  dani rodrik the world needs a new economic pla...   

                                      location predicted_disaster_type  \
0  Philippines, Luzon, Super Typhoon Fung-wong                   Storm   
1                                      Unknown                   Flood   
2              Philippines, Super Typhoon Fung                   Storm   
3                                           UK                   Flood   
4                                      Unknown                   Storm

In [12]:
df.head()

Unnamed: 0,timestamp,text,location,predicted_disaster_type,confidence,reason
0,2025-11-09 08:04:06+00:00,philippines braces for super typhoon with 9000...,"Philippines, Luzon, Super Typhoon Fung-wong",Storm,0.5,keyword assist
1,2025-11-09 07:50:19+00:00,2025 sks weekly climate change global warming ...,Unknown,Flood,0.26,model
2,2025-11-09 07:44:21+00:00,nearly 1 million filipinos evacuate as super t...,"Philippines, Super Typhoon Fung",Storm,0.8,keyword assist
3,2025-11-09 07:35:00+00:00,are we entering a golden age of nuclear power ...,UK,Flood,0.21,model
4,2025-11-09 07:30:17+00:00,dani rodrik the world needs a new economic pla...,Unknown,Storm,0.23,model
