In [14]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [16]:
# Location of the training CSV, relative to the notebook's working directory.
file = "../news_datasets/full_training_dataset.csv"

# Output paths for the three pickled artifacts written at the end of the notebook:
# the classifier, the TF-IDF vectorizer and the label encoder, in that order.
model_filename_pkl, vectorizer_filename_pkl, encoder_filename_pkl = (
    'classification_report/tfidf_classification_model.pkl',
    'classification_report/tfidf_vectorizer.pkl',
    'classification_report/tfidf_label_encoder.pkl',
)

In [3]:
# Read the training CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0,Title,Authors,Date Published,Full Text,llm_category
0,Britain’s first climate assembly agrees plan ...,['Molly Blackall'],2021-02-02,"Camden citizens’ proposals focus on housing, t...",CLIMATE AND EMISSIONS
1,We are not all doomed. Not yet,['Ammar Kalia'],2020-11-07,This week’s Upside digest looks at the ways to...,CLIMATE AND EMISSIONS
2,This article is more than 9 months oldCoral-e...,['Aneesa Ahmed'],2023-04-25,Corallivorous fish were regarded as harmful to...,BIODIVERSITY AND ECOSYSTEMS
3,Alarming link between fungicides and bee decli...,"['Damian Carrington', ' Environment editor', '...",2020-11-26,Fungicides are found to be the strongest facto...,POLLUTION AND ENVIRONMENTAL QUALITY
4,Climate impact labels could help people eat l...,"['Helena Horton', ' Environment reporter']",2023-01-05,Information on environmental impact can persua...,CLIMATE AND EMISSIONS


In [4]:
# Dimensions of the loaded DataFrame as (number of rows, number of columns).
df.shape

(398, 5)

In [5]:
# Inputs are the article bodies ('Full Text'); targets are the 'llm_category' labels.
y = df['llm_category']
X = df['Full Text']

# Hold out 20% of the rows for testing. Stratifying on the labels keeps the
# class proportions the same in both partitions, and the fixed seed makes the
# split repeatable across runs.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report how many documents ended up in each partition.
print(f"Training set size (X_train): {len(X_train)}")
print(f"Testing set size (X_test): {len(X_test)}")

Training set size (X_train): 318
Testing set size (X_test): 80


In [7]:
# Convert the string categories into integer class codes.
label_encoder = LabelEncoder()

# The mapping is learned from the training labels only; the test labels are
# then encoded with that same fitted mapping.
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Print the category -> integer code table; a category's code is its position
# in `label_encoder.classes_`.
print("\nCorrespondence Category -> Encoded:")
for code, category in enumerate(label_encoder.classes_):
    print(f"{category} -> {code}")


Correspondence Category -> Encoded:
BIODIVERSITY AND ECOSYSTEMS -> 0
CLIMATE AND EMISSIONS -> 1
ENERGY AND TRANSITION -> 2
NATURAL RESOURCES -> 3
POLICIES AND REGULATION -> 4
POLLUTION AND ENVIRONMENTAL QUALITY -> 5
RISKS AND DISASTERS -> 6
SOCIO-ECONOMIC IMPACT -> 7


In [9]:
# TF-IDF over unigrams and bigrams, keeping at most 5000 features.
vectorizer_options = {'max_features': 5000, 'ngram_range': (1, 2)}
tftidf = TfidfVectorizer(**vectorizer_options)

# Fit on the training texts only, so the test texts are transformed with the
# already-fitted vectorizer and never influence it.
X_train_tfidf = tftidf.fit_transform(X_train)
X_test_tfidf = tftidf.transform(X_test)

In [None]:
# Train a Logistic Regression classifier on the TF-IDF features.
# NOTE(review): the recorded report shows several minority classes with zero
# recall; class_weight="balanced" may help, but it changes the pickled model,
# so confirm with downstream consumers before enabling it.
tftidf_model = LogisticRegression(max_iter=1000, random_state=42)
tftidf_model.fit(X_train_tfidf, y_train_encoded)

# Predict encoded labels for the held-out documents
y_pred_tfidf = tftidf_model.predict(X_test_tfidf)

# Overall accuracy. The sample counts in the message are computed from the
# feature matrices instead of being hard-coded, so the message cannot go stale
# when the dataset changes.
accuracy_tfidf = accuracy_score(y_test_encoded, y_pred_tfidf)
n_train_samples = X_train_tfidf.shape[0]
n_test_samples = X_test_tfidf.shape[0]
print(
    f"\n🚀 Accuracy for TF-IDF on {n_test_samples} test samples "
    f"({n_train_samples} training samples): {accuracy_tfidf:.4f}"
)

# Per-class precision / recall / F1.
# - labels: pinned to every encoded class so `target_names` always lines up,
#   even if a rare class is missing from both the test labels and predictions.
# - zero_division=0: classes that are never predicted get a score of 0 without
#   emitting UndefinedMetricWarning (same numbers as the default, minus the noise).
report_tfidf = classification_report(
    y_test_encoded,
    y_pred_tfidf,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    zero_division=0,
)
print("\nClassification Report for TF-IDF:\n", report_tfidf)


🚀 Accuracy for TF-IDF sur 800 (398) datas: 0.5125

Classification Report for TF-IDF:
                                      precision    recall  f1-score   support

        BIODIVERSITY AND ECOSYSTEMS       0.50      1.00      0.67        24
              CLIMATE AND EMISSIONS       0.50      0.74      0.60        19
              ENERGY AND TRANSITION       0.00      0.00      0.00         8
                  NATURAL RESOURCES       0.00      0.00      0.00         3
            POLICIES AND REGULATION       0.00      0.00      0.00         7
POLLUTION AND ENVIRONMENTAL QUALITY       1.00      0.33      0.50         9
                RISKS AND DISASTERS       0.00      0.00      0.00         5
              SOCIO-ECONOMIC IMPACT       0.00      0.00      0.00         5

                           accuracy                           0.51        80
                          macro avg       0.25      0.26      0.22        80
                       weighted avg       0.38      0.51      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
# Persist the fitted classifier, vectorizer and label encoder so they can be
# reloaded together for inference.
# NOTE: pickle files execute code on load; only unpickle artifacts from a trusted source.
artifacts_to_save = [
    (model_filename_pkl, tftidf_model),
    (vectorizer_filename_pkl, tftidf),
    (encoder_filename_pkl, label_encoder),
]

for artifact_path, artifact in artifacts_to_save:
    # open(..., 'wb') does not create missing folders, so make sure the output
    # directory exists first (a no-op when it is already there).
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)