In [1]:
import pandas as pd
import pickle
from sklearn.metrics import classification_report, accuracy_score

In [20]:
# Labeled evaluation set (CSV); the relative path is resolved against the
# notebook's working directory.
file_path_test = "../news_datasets/test_20_labeled.csv"

# Pickled artifacts consumed by the evaluation cells below: the label encoder,
# the TF-IDF vectorizer and the classifier itself.
encoder_filename = "label_encoder.pkl"
vectorizer_filename = "tfidf_vectorizer.pkl"
model_filename = "classification_model.pkl"

In [21]:
def _load_pickle(path):
    """Return the object deserialized from the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the file.
    Only point this at artifacts produced by a trusted training run.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Restore the fitted artifacts (classifier, TF-IDF vectorizer, label encoder).
classifier = _load_pickle(model_filename)
tfidf_vectorizer = _load_pickle(vectorizer_filename)
label_encoder = _load_pickle(encoder_filename)

# Labeled test set used for the evaluation.
df_test = pd.read_csv(file_path_test)

# Column holding the raw article text and column holding the reference label.
TEXT_COLUMN = 'Full Text'
TARGET_COLUMN = 'llm_category'

# Fail here, with an explicit message, rather than in a later cell if the CSV
# does not have the expected schema.
missing_columns = [
    column for column in (TEXT_COLUMN, TARGET_COLUMN)
    if column not in df_test.columns
]
if missing_columns:
    raise KeyError(
        f"{file_path_test} is missing required column(s) {missing_columns}; "
        f"available columns: {list(df_test.columns)}"
    )

print(f"Test dataset uploaded: {len(df_test)} records")
print(f"Training categories: {list(label_encoder.classes_)}")

Test dataset uploaded: 20 records
Training categories: ['BIODIVERSITY AND ECOSYSTEMS', 'CLIMATE AND EMISSIONS', 'ENERGY AND TRANSITION', 'NATURAL RESOURCES', 'POLICIES AND REGULATION', 'POLLUTION AND ENVIRONMENTAL QUALITY', 'RISKS AND DISASTERS', 'SOCIO-ECONOMIC IMPACT']


In [22]:
# Model input: the raw text column of the test set.
X_test = df_test[TEXT_COLUMN]

# Convert the reference category names to the encoder's numeric codes and keep
# them on the frame so they sit next to the original labels.
encoded_labels = label_encoder.transform(df_test[TARGET_COLUMN])
df_test['target_encoded'] = encoded_labels
y_test = df_test['target_encoded']

# Reuse the already-fitted vectorizer (transform only, no re-fit) so the test
# features are produced by the same vectorizer object that was loaded.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"dimensions of vectorized test data: {X_test_tfidf.shape}")

dimensions of vectorized test data: (20, 1183)


In [23]:
# Run the loaded classifier on the vectorized test texts.
y_pred = classifier.predict(X_test_tfidf)

# Translate the numeric predictions back into category names and store them
# alongside the reference labels.
y_pred_categories = label_encoder.inverse_transform(y_pred)
df_test['predicted_category'] = y_pred_categories

# Side-by-side look at the first ten reference / predicted pairs.
preview_columns = [TARGET_COLUMN, 'predicted_category']
preview = df_test[preview_columns].head(10)

print("\n--- preview for predictions ---")
print(preview)


--- preview for predictions ---
                          llm_category           predicted_category
0          BIODIVERSITY AND ECOSYSTEMS  BIODIVERSITY AND ECOSYSTEMS
1                ENERGY AND TRANSITION        CLIMATE AND EMISSIONS
2                CLIMATE AND EMISSIONS        CLIMATE AND EMISSIONS
3                SOCIO-ECONOMIC IMPACT        CLIMATE AND EMISSIONS
4              POLICIES AND REGULATION  BIODIVERSITY AND ECOSYSTEMS
5                ENERGY AND TRANSITION  BIODIVERSITY AND ECOSYSTEMS
6  POLLUTION AND ENVIRONMENTAL QUALITY  BIODIVERSITY AND ECOSYSTEMS
7                CLIMATE AND EMISSIONS  BIODIVERSITY AND ECOSYSTEMS
8              POLICIES AND REGULATION  BIODIVERSITY AND ECOSYSTEMS
9                    NATURAL RESOURCES  BIODIVERSITY AND ECOSYSTEMS


In [24]:
# Fraction of test rows whose predicted code equals the reference code.
accuracy = accuracy_score(y_test, y_pred)

# Pass `labels` explicitly. Without it, classification_report infers the label
# set from y_test/y_pred; if any training class is absent from both (likely
# with a small test set), the inferred set is shorter than `target_names` and
# scikit-learn raises a ValueError. LabelEncoder codes are 0..n_classes-1, in
# the same order as `classes_`, so the index range lines up with the names.
class_indices = list(range(len(label_encoder.classes_)))
report = classification_report(
    y_test,
    y_pred,
    labels=class_indices,
    target_names=label_encoder.classes_,
    zero_division=0,  # report 0.0 instead of warning for never-predicted classes
)

print("\n=============================================")
print(f"| ACCURACY : {accuracy:.4f}             |")
print("=============================================")
print("\nCLASSIFICATION REPORT BY CATEGORY :\n")
print(report)


| ACCURACY : 0.2500             |

CLASSIFICATION REPORT BY CATEGORY :

                                     precision    recall  f1-score   support

        BIODIVERSITY AND ECOSYSTEMS       0.33      0.80      0.47         5
              CLIMATE AND EMISSIONS       0.14      0.50      0.22         2
              ENERGY AND TRANSITION       0.00      0.00      0.00         2
                  NATURAL RESOURCES       0.00      0.00      0.00         1
            POLICIES AND REGULATION       0.00      0.00      0.00         6
POLLUTION AND ENVIRONMENTAL QUALITY       0.00      0.00      0.00         2
                RISKS AND DISASTERS       0.00      0.00      0.00         1
              SOCIO-ECONOMIC IMPACT       0.00      0.00      0.00         1

                           accuracy                           0.25        20
                          macro avg       0.06      0.16      0.09        20
                       weighted avg       0.10      0.25      0.14        20

