In [None]:
!pip install lightgbm



In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier
import re
import string
from sklearn.base import BaseEstimator, TransformerMixin

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Load the labelled training set and the unlabelled test set.
TRAIN_CSV_PATH = '/content/train.csv'
TEST_CSV_PATH = '/content/Hazards_UNLABELLED_TEST.csv'

df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

In [11]:
from text_cleaner import TextCleaner

In [16]:
# Keep only the rows that have a label, a body text and a title.
df = df.dropna(subset=['hazard-type', 'text', 'title'])

# --- Merge rare classes
def merge_rare_classes(label):
    """Collapse the rare hazard-type labels into the single class 'migration'.

    Parameters
    ----------
    label : object
        A single hazard-type value.

    Returns
    -------
    object
        'migration' when ``label`` equals 0, 'NB' or 'migration';
        otherwise ``label`` itself, unchanged.
    """
    rare_labels = (0, 'NB', 'migration')
    if label in rare_labels:
        return 'migration'
    return label

# Apply the rare-class merge to every label.
df['hazard-type'] = df['hazard-type'].map(merge_rare_classes)

# Single text feature: title and body joined by one space.
title_as_str = df['title'].astype(str)
body_as_str = df['text'].astype(str)
df['full_text'] = title_as_str + " " + body_as_str

# Model inputs (date parts, country, combined text) and the target column.
feature_columns = ['year', 'month', 'day', 'country', 'full_text']
X = df[feature_columns]
y = df['hazard-type']

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Text branch: clean -> TF-IDF over uni- and bigrams -> keep the 2500
# features with the highest chi-squared score.
text_cleaner = TextCleaner()
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english', max_features=5000, ngram_range=(1, 2), min_df=2
)
chi2_selector = SelectKBest(chi2, k=2500)

text_pipeline = Pipeline(steps=[
    ('cleaner', text_cleaner),
    ('tfidf', tfidf_vectorizer),
    ('select', chi2_selector),
])

# Column-wise preprocessing: scale the date parts, one-hot encode the country
# (categories unseen at fit time are ignored), and run the text branch on the
# 'full_text' column. 'full_text' is passed as a plain string, not a list, so
# the text branch receives a 1-D column.
numeric_columns = ['year', 'month', 'day']
categorical_columns = ['country']

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_columns),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('text', text_pipeline, 'full_text'),
])

# --- Model
# LightGBM classifier settings: balanced class weights, fixed seed, all CPU
# cores, and LightGBM's own logging turned down.
lgbm_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
classifier = LGBMClassifier(**lgbm_params)

# Full model: preprocessing followed by the classifier.
model = make_pipeline(preprocessor, classifier)

# --- Train
model.fit(X_train, y_train)

# --- Score the validation split with macro-averaged F1
preds = model.predict(X_val)
f1 = f1_score(y_true=y_val, y_pred=preds, average='macro')
f1_rounded = round(f1, 4)
print("F1 Macro Score (after class merge):", f1_rounded)

F1 Macro Score (after class merge): 0.6893


In [17]:
#Test Data Prediction Code
# Build the test features the same way the training features were built: the
# raw title and text are concatenated, and the fitted pipeline's own 'cleaner'
# step does the text cleaning. Cleaning title/text here as well would run
# TextCleaner twice on the test text (a train/test mismatch unless the cleaner
# is idempotent) and would overwrite the raw columns, so re-running this cell
# would clean them yet again.
test_title = test_df['title'].fillna('').astype(str)
test_text = test_df['text'].fillna('').astype(str)
test_df['full_text'] = test_title + ' ' + test_text

# Same feature columns, in the same order, that the model was fitted on.
X_test = test_df[['year', 'month', 'day', 'country', 'full_text']]

hazard_test_preds = model.predict(X_test)

# One row per test example: its ID and the predicted hazard label.
output_df = pd.DataFrame({
    'ID': test_df['ID'],
    'hazard': hazard_test_preds
})

# Save to CSV
output_df.to_csv('/content/predictions.csv', index=False)
print("Cleaned test predictions saved to: /content/predictions.csv")

Cleaned test predictions saved to: /content/predictions.csv
