# 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

# 2. Load Datasets

In [None]:
# Paths (under /content) of the labelled training CSV and the unlabelled test CSV.
train_file = "/content/Hazards_LABELLED_TRAIN (1).csv"
test_file = "/content/Hazards_UNLABELLED_TEST (1).csv"

# Read both files with pandas' default parsing options.
df_train = pd.read_csv(filepath_or_buffer=train_file)
df_test = pd.read_csv(filepath_or_buffer=test_file)

# 3. Combine Title and Text

In [None]:
# Build one "full-text" column per frame: the title and the body joined by a
# single space, with missing values replaced by empty strings beforehand.
for frame in (df_train, df_test):
    heading = frame["title"].fillna('')
    body = frame["text"].fillna('')
    frame["full-text"] = heading + " " + body

# 4. Encode Categorical Variables

In [None]:
# One LabelEncoder per categorical column, stored by column name so the
# fitted encoders remain available after this cell.
label_encoders = {}
for name in ("country", "hazard-type", "product-category"):
    encoder = LabelEncoder()
    label_encoders[name] = encoder

    # Fit on the training column and overwrite it with its integer codes.
    if name in df_train.columns:
        df_train[name] = encoder.fit_transform(df_train[name])

    # Only the 'country' feature is encoded in the test frame; the two target
    # columns are skipped there.
    if name != "country" or name not in df_test.columns:
        continue

    # Test-set countries that were not seen during fitting are replaced by the
    # first fitted class so that transform() does not raise on unseen labels.
    # NOTE(review): that fallback class is arbitrary (first in sorted order),
    # so unseen countries are indistinguishable from it downstream.
    seen_countries = set(encoder.classes_)
    test_country = df_test["country"].apply(
        lambda value: value if value in seen_countries else encoder.classes_[0]
    )
    df_test["country"] = encoder.transform(test_country)

# 5. Feature Engineering: Text Length (used as a structured feature in step 7)

In [None]:
# Add "text_len": the number of characters in the combined title + text.
for frame in (df_train, df_test):
    frame["text_len"] = frame["full-text"].map(len)

# 6. TF-IDF Vectorization (fit on train, transform on both)

In [None]:
# TF-IDF over n-grams of length 1 to 3, capped at 20,000 features, with the
# built-in English stop-word list and sublinear term-frequency scaling.
tfidf_options = {
    "max_features": 20000,
    "ngram_range": (1, 3),
    "stop_words": 'english',
    "sublinear_tf": True,
}
tfidf = TfidfVectorizer(**tfidf_options)

# The vectorizer is fitted on the training text only; the test text is
# transformed with that fitted vectorizer.
train_text_sparse = tfidf.fit_transform(df_train["full-text"])
test_text_sparse = tfidf.transform(df_test["full-text"])

# Convert to dense arrays (rows x up to 20,000 columns).
# NOTE(review): this can be memory-hungry on large datasets.
X_text_train = train_text_sparse.toarray()
X_text_test = test_text_sparse.toarray()

# 7. Prepare Structured Features

In [None]:
# Non-text feature columns, in the order they appear in the feature matrix.
struct_cols = ["year", "month", "day", "country", "text_len"]

# Extract the same columns from both frames as plain NumPy arrays.
X_struct_train = df_train.loc[:, struct_cols].to_numpy()
X_struct_test = df_test.loc[:, struct_cols].to_numpy()

# 8. Merge Structured and Text Features

In [None]:
# Final feature matrices: structured columns first, TF-IDF columns after,
# joined column-wise.
X_train = np.concatenate([X_struct_train, X_text_train], axis=1)
X_test = np.concatenate([X_struct_test, X_text_test], axis=1)

# The two target columns of the training frame as NumPy arrays.
y_hazard = df_train["hazard-type"].to_numpy()
y_product = df_train["product-category"].to_numpy()

# 9. Compute Class Weights (for hazard and product)

In [None]:
def _balanced_sample_weights(labels):
    """Compute 'balanced' class weights and expand them to one weight per sample.

    Parameters
    ----------
    labels : array-like of shape (n_samples,)
        Class label of every training sample.

    Returns
    -------
    class_weights : np.ndarray
        One weight per distinct class, ordered like ``np.unique(labels)``.
    sample_weights : np.ndarray
        The weight of each sample's class, aligned with ``labels``.
    """
    classes = np.unique(labels)
    class_weights = compute_class_weight('balanced', classes=classes, y=labels)
    # Look up each label's position in the sorted class array instead of using
    # the label value itself as an index: the latter is only correct when the
    # labels happen to be exactly 0..K-1 with no gaps.
    positions = np.searchsorted(classes, labels)
    return class_weights, class_weights[positions]


# Per-class and per-sample weights for each of the two targets.
class_weights_hazard, weights_hazard = _balanced_sample_weights(y_hazard)
class_weights_product, weights_product = _balanced_sample_weights(y_product)

# 10. Split Train/Validation for Local Validation

In [None]:
# Hold out 20% of the rows for validation. Features, both targets and both
# sample-weight vectors are split together so they stay row-aligned; the
# split is stratified on the hazard labels only and seeded for repeatability.
split_arrays = train_test_split(
    X_train,
    y_hazard,
    y_product,
    weights_hazard,
    weights_product,
    test_size=0.2,
    random_state=42,
    stratify=y_hazard,
)

# train_test_split returns a (train, validation) pair per input, in input order.
(
    X_tr, X_val,
    y_tr_hazard, y_val_hazard,
    y_tr_product, y_val_product,
    w_tr_hazard, w_val_hazard,
    w_tr_product, w_val_product,
) = split_arrays

# 11. LightGBM Datasets

In [None]:
# Hazard-type task: weighted training and validation datasets.
train_data_hazard = lgb.Dataset(data=X_tr, label=y_tr_hazard, weight=w_tr_hazard)
val_data_hazard = lgb.Dataset(data=X_val, label=y_val_hazard, weight=w_val_hazard)

# Product-category task: same feature matrices, different labels and weights.
train_data_product = lgb.Dataset(data=X_tr, label=y_tr_product, weight=w_tr_product)
val_data_product = lgb.Dataset(data=X_val, label=y_val_product, weight=w_val_product)

# 12. LightGBM Parameters

In [None]:
# Hyper-parameters shared by both multiclass models.
params = dict(
    objective='multiclass',
    metric='multi_logloss',
    boosting_type='gbdt',
    num_leaves=128,
    learning_rate=0.03,
    max_depth=24,
    feature_fraction=0.9,
    bagging_fraction=0.9,
    bagging_freq=3,
    lambda_l1=2.0,
    lambda_l2=2.0,
    verbose=-1,
)

# Each task gets its own copy, extended with the number of distinct classes
# found in that task's training labels.
params_hazard = {**params, 'num_class': np.unique(y_hazard).size}
params_product = {**params, 'num_class': np.unique(y_product).size}

# 13. Train Hazard Model

In [None]:
# Stop when the validation metric has not improved for 100 rounds, and log
# the validation metric every 100 rounds.
hazard_callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(100),
]

# Train the hazard-type model for at most 1500 boosting rounds.
model_hazard = lgb.train(
    params=params_hazard,
    train_set=train_data_hazard,
    num_boost_round=1500,
    valid_sets=[val_data_hazard],
    callbacks=hazard_callbacks,
)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.881874
Early stopping, best iteration is:
[75]	valid_0's multi_logloss: 0.852054


# 14. Train Product Model

In [None]:
# Stop when the validation metric has not improved for 100 rounds, and log
# the validation metric every 100 rounds.
product_callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(100),
]

# Train the product-category model for at most 1500 boosting rounds.
model_product = lgb.train(
    params=params_product,
    train_set=train_data_product,
    num_boost_round=1500,
    valid_sets=[val_data_product],
    callbacks=product_callbacks,
)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 1.6501
[200]	valid_0's multi_logloss: 1.68671
Early stopping, best iteration is:
[114]	valid_0's multi_logloss: 1.64363


# 15. Validation F1 Score (optional, for your reference)

In [None]:
# Class probabilities on the validation rows, using each model's best
# early-stopping iteration.
hazard_val_proba = model_hazard.predict(X_val, num_iteration=model_hazard.best_iteration)
product_val_proba = model_product.predict(X_val, num_iteration=model_product.best_iteration)

# Predicted class = column with the highest probability in each row.
y_pred_hazard_val = hazard_val_proba.argmax(axis=1)
y_pred_product_val = product_val_proba.argmax(axis=1)

# Macro-averaged F1 for each task.
macro_f1_hazard = f1_score(y_true=y_val_hazard, y_pred=y_pred_hazard_val, average='macro')
macro_f1_product = f1_score(y_true=y_val_product, y_pred=y_pred_product_val, average='macro')

print(f"Validation Macro F1-score (Hazard Type): {macro_f1_hazard:.4f}")
print(f"Validation Macro F1-score (Product Category): {macro_f1_product:.4f}")

Validation Macro F1-score (Hazard Type): 0.7189
Validation Macro F1-score (Product Category): 0.4470


# 16. Predict on Test Set

In [None]:
# Class-probability matrices for the test rows, at each model's best iteration.
preds_hazard_test = model_hazard.predict(
    X_test, num_iteration=model_hazard.best_iteration
)
preds_product_test = model_product.predict(
    X_test, num_iteration=model_product.best_iteration
)

# Pick the most probable class index per row.
pred_labels_hazard = preds_hazard_test.argmax(axis=1)
pred_labels_product = preds_product_test.argmax(axis=1)

# 17. Decode labels to original strings

In [None]:
# Map the predicted integer codes back to the original label values using the
# encoders stored for the two target columns.
hazard_encoder = label_encoders['hazard-type']
product_encoder = label_encoders['product-category']

pred_hazard = hazard_encoder.inverse_transform(pred_labels_hazard)
pred_product = product_encoder.inverse_transform(pred_labels_product)

# 18. Create Submission DataFrame

In [None]:
# One row per test record: its ID plus the decoded hazard and product predictions.
submission_columns = {
    'ID': df_test['ID'],
    'hazard': pred_hazard,
    'product': pred_product,
}
submission = pd.DataFrame(submission_columns)

# Write the file without the index column, UTF-8 encoded.
submission.to_csv(path_or_buf='submission.csv', index=False, encoding='utf-8')
print("Submission file 'submission.csv' created successfully.")

Submission file 'submission.csv' created successfully.


In [None]:
# Colab-only step: this import is available only inside a Google Colab
# runtime. It triggers a browser download of the submission file written by
# the previous cell.
from google.colab import files

files.download(filename='submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>