# Text Classification Model Training

This notebook contains the exact logic from `main.py` for training and evaluating text classification models.

**Models:**
- Logistic Regression
- Naive Bayes

**Categories:**
- World (1)
- Sports (2)
- Business (3)
- Sci/Tech (4)

## 1. Import Libraries

In [4]:
import os
import pickle
import sys

# The project-local modules imported at the bottom of this cell (`utils`,
# `config`) are expected to live in the parent directory. That directory must
# be on sys.path *before* those imports run, otherwise a fresh
# Restart Kernel -> Run All fails with ModuleNotFoundError.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.insert(0, _parent_dir)

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB

from utils import preprocess
import config

In [3]:
# Make the parent directory importable so the project-local modules
# (`utils`, `config`) can be found.
# NOTE: the local imports rely on this path entry, so this has to run before them.
import sys
import os

parent_dir = os.path.abspath('..')
# Drop any earlier copy first so re-running this cell keeps exactly one entry,
# still at the front of the module search path.
sys.path[:] = [entry for entry in sys.path if entry != parent_dir]
sys.path.insert(0, parent_dir)

## 2. Load Dataset

In [5]:
# Load the train/test splits (dataset sourced from Kaggle) from the CSV paths
# set in `config`, then preview the first training rows.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (config.TRAIN_DATA_PATH, config.TEST_DATA_PATH)
)
train_df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


## 3. Data Overview

In [6]:
# Sanity check: report how many rows each split contains.
print(
    f"Training samples: {len(train_df)}",
    f"Test samples: {len(test_df)}",
    sep="\n",
)

Training samples: 120000
Test samples: 7600


## 4. Data Preprocessing

In [7]:
# Clean the raw `Description` text of both splits with the shared `preprocess`
# helper, storing the result in a new `clean_text` column.
# NOTE(review): the preview output shows words fused together where the raw
# text contains a backslash (e.g. "worries\about" -> "worriesabout"); confirm
# whether `preprocess` should turn backslashes into spaces instead.
for split_df in (train_df, test_df):
    split_df['clean_text'] = split_df['Description'].apply(preprocess)

# Show raw vs. cleaned text side by side for the first training rows.
train_df[['Description', 'clean_text']].head()

Unnamed: 0,Description,clean_text
0,"Reuters - Short-sellers, Wall Street's dwindli...",reuters shortsellers wall streets dwindlingban...
1,Reuters - Private investment firm Carlyle Grou...,reuters private investment firm carlyle groupw...
2,Reuters - Soaring crude prices plus worries\ab...,reuters soaring crude prices plus worriesabout...
3,Reuters - Authorities have halted oil export\f...,reuters authorities halted oil exportflows mai...
4,"AFP - Tearaway world oil prices, toppling reco...",afp tearaway world oil prices toppling records...


## 5. Vectorization and Model Training Setup

In [8]:
# Targets: the `Class Index` column of each split.
y_train, y_test = train_df['Class Index'], test_df['Class Index']

# Features: TF-IDF over the cleaned text. The vocabulary/IDF weights are
# learned from the training split only; the test split is just transformed.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['clean_text'])
X_test = vectorizer.transform(test_df['clean_text'])

## 6. Model 1: Logistic Regression

In [9]:
# Model 1 - Logistic Regression, with the iteration cap taken from `config`.
# `fit` returns the estimator itself, so construction and training chain.
clf = LogisticRegression(
    max_iter=config.LOGISTIC_REGRESSION_MAX_ITER,
).fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = clf.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9076315789473685
              precision    recall  f1-score   support

           1       0.92      0.90      0.91      1900
           2       0.95      0.97      0.96      1900
           3       0.88      0.87      0.88      1900
           4       0.87      0.89      0.88      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600



### Confusion Matrix - Logistic Regression

In [None]:
# Plot the Logistic Regression confusion matrix on the test split and title
# the axes that the display itself drew on.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred).ax_.set_title(
    'Confusion Matrix Logistic Regression'
)
plt.show()

## 7. Model 2: Naive Bayes

In [None]:
# Model 2 - Multinomial Naive Bayes on the same TF-IDF features.
# `fit` returns the estimator itself, so construction and training chain.
nb_model = MultinomialNB().fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred_nb = nb_model.predict(X_test)
print('Naive Bayes Accuracy:', accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

### Confusion Matrix - Naive Bayes

In [None]:
# Plot the Naive Bayes confusion matrix on the test split and title the axes
# that the display itself drew on.
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_nb).ax_.set_title(
    'Confusion Matrix Naive Bayes'
)
plt.show()

## 8. Prediction Function - Ensemble Method

In [None]:
# Prediction function - Both Models to compare results

def predict_category(text: str, verbose: bool = True):
    """Classify ``text`` by averaging both models' class probabilities.

    The text is cleaned with ``preprocess`` and vectorized with the fitted
    ``vectorizer``. The per-class probabilities from the Logistic Regression
    model (``clf``) and the Naive Bayes model (``nb_model``) are averaged, and
    the class with the highest mean probability is the ensemble prediction.

    Args:
        text: Raw input text to classify.
        verbose: When True (the default), print the input text, each model's
            individual prediction, and the ensemble prediction.

    Returns:
        The ``config.CATEGORIES`` entry for the ensemble's predicted class.
    """
    features = vectorizer.transform([preprocess(text)])

    # Individual hard predictions, reported for comparison only.
    pred_lr = clf.predict(features)[0]
    pred_nb = nb_model.predict(features)[0]

    # Soft voting: average the two models' per-class probability vectors.
    prob_lr = clf.predict_proba(features)[0]
    prob_nb = nb_model.predict_proba(features)[0]
    avg_prob = (prob_lr + prob_nb) / 2

    # predict_proba columns follow the estimator's `classes_` order, so map the
    # winning column back through `classes_` rather than assuming the labels
    # are exactly 1..N. Both models are fit on the same `y_train`, so their
    # `classes_` arrays line up.
    final_pred = clf.classes_[avg_prob.argmax()]
    final_category = config.CATEGORIES[final_pred]

    if verbose:
        print(f"\nInput text: '{text}'")
        print(f"Logistic Regression prediction: {config.CATEGORIES[pred_lr]}")
        print(f"Naive Bayes prediction: {config.CATEGORIES[pred_nb]}")
        print(f"Final ensemble prediction: {final_category}")

    return final_category

## 9. Test Prediction

In [None]:
# Test the prediction function
result = predict_category("ronaldo has scored a last minute goal")
print(f"\nFinal result: {result}")