# **Text Classification**

## Loading Libraries

In [6]:
import json
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
# 'stopwords' backs the stopwords.words('english') lookup used during preprocessing.
nltk.download('stopwords')
# 'wordnet' is presumably the data WordNetLemmatizer reads — confirm against the NLTK docs.
nltk.download('wordnet')
# NOTE(review): 'omw-1.4' looks like a companion resource for WordNet; verify it is still required.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to C:\Users\Ali/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Ali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Ali/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Load dataset (JSON Lines format)

In [7]:
def load_json_lines(path):
    """Load a JSON Lines file into a DataFrame, one row per JSON record.

    Blank or whitespace-only lines (for example a stray empty line at the end
    of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        path: Path to a UTF-8 encoded text file holding one JSON document per line.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # line.strip() is empty for blank lines, which json.loads cannot parse.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

# Read the train / validation / test splits; each file is parsed by load_json_lines.
train_df, val_df, test_df = map(
    load_json_lines,
    ('./Datasets/train.json', './Datasets/validation.json', './Datasets/test.json'),
)

## Preprocess Data

In [8]:

# Advanced text preprocessing
def preprocess_text(text_series):
    """Clean every text in ``text_series`` and return the results as a list.

    Each text goes through the same steps, in this order:
      1. every character that is not an ASCII letter becomes a space,
      2. the text is lower-cased and split on whitespace,
      3. tokens found in NLTK's English stop-word list are dropped,
      4. the remaining tokens are lemmatized with ``WordNetLemmatizer``
         (called with its default arguments),
      5. tokens are re-joined with single spaces.

    Args:
        text_series: Iterable of strings (e.g. a DataFrame column).

    Returns:
        list[str]: One cleaned string per input text, in input order.
    """
    wordnet = WordNetLemmatizer()
    english_stops = set(stopwords.words('english'))

    def _clean(raw_text):
        # Strip non-letters first so punctuation and digits act as token separators.
        tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
        return ' '.join(
            wordnet.lemmatize(token) for token in tokens if token not in english_stops
        )

    return [_clean(raw_text) for raw_text in text_series]


# Overwrite each split's 'text' column with its cleaned form.
# Note: re-running this cell feeds the already-cleaned text through preprocess_text again.
train_df['text'], val_df['text'], test_df['text'] = (
    preprocess_text(frame['text']) for frame in (train_df, val_df, test_df)
)

## TF-IDF Vectorization

In [9]:
# Fit the TF-IDF vocabulary on the training texts only, then reuse the fitted
# vectorizer for the validation and test texts.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['text'])
X_val, X_test = (vectorizer.transform(frame['text']) for frame in (val_df, test_df))

# Targets are taken from each split's 'label' column.
y_train, y_val, y_test = (frame['label'] for frame in (train_df, val_df, test_df))

## Train and Evaluate Models

In [10]:
def evaluate_model(model, name, X, y, dataset_type="Validation", X_fit=None, y_fit=None):
    """Fit ``model`` and report its accuracy on the evaluation data ``X`` / ``y``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        name: Display name printed in the results header.
        X: Features to predict on.
        y: True labels for ``X``.
        dataset_type: Name of the evaluated split; used only in the printed text.
        X_fit: Training features. When ``None``, the notebook-level ``X_train`` is used.
        y_fit: Training labels. When ``None``, the notebook-level ``y_train`` is used.

    Returns:
        tuple: ``(model, acc)`` — the same estimator object after fitting, and
        its accuracy on ``X`` / ``y``.
    """
    # Fall back to the notebook globals so existing five-argument calls behave
    # exactly as before, while new callers can pass the training data explicitly.
    if X_fit is None:
        X_fit = X_train
    if y_fit is None:
        y_fit = y_train

    model.fit(X_fit, y_fit)
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    print(f"\n{name} Results on {dataset_type} Set:")
    print(f"{dataset_type} Accuracy: {acc}")
    return model, acc

# Candidate classifiers, each paired with the display name used in printed results.
# LinearSVC's dual solver shuffles the data with a pseudo-random generator, so a
# fixed random_state keeps its score reproducible across Restart-and-Run-All.
models = [
    (LogisticRegression(max_iter=1000), "Logistic Regression"),
    (MultinomialNB(), "Multinomial Naive Bayes"),
    (LinearSVC(random_state=42), "Support Vector Machine (SVM)")
]

# Evaluate models on validation set and select the best one
best_model = None
best_model_name = None
# Start below every attainable accuracy: with a 0 sentinel and the strict '>'
# comparison below, no model would be selected if all candidates scored 0.0,
# leaving best_model as None for the cells that follow.
best_val_acc = float("-inf")
trained_models = {}

for model, name in models:
    trained_model, val_acc = evaluate_model(model, name, X_val, y_val, "Validation")
    trained_models[name] = trained_model
    # Strict '>' means the earliest candidate wins a tie.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_model = trained_model
        best_model_name = name

print(f"\nBest Model: {best_model_name} with Validation Accuracy: {best_val_acc}")


Logistic Regression Results on Validation Set:
Validation Accuracy: 0.848

Multinomial Naive Bayes Results on Validation Set:
Validation Accuracy: 0.8323333333333334

Support Vector Machine (SVM) Results on Validation Set:
Validation Accuracy: 0.8473333333333334

Best Model: Logistic Regression with Validation Accuracy: 0.848


## Evaluate on Test Set and Display Metrics

In [11]:
# Score the validation-selected model on the held-out test split.
test_preds = best_model.predict(X_test)
print("\nFinal Evaluation on Test Set:")
print(f"Test Accuracy for {best_model_name}: {accuracy_score(y_test, test_preds)}")

# Turn classification_report's dict output into a table and write it to CSV,
# keeping the row index in the file.
test_report = classification_report(y_test, test_preds, output_dict=True)
report_df = pd.DataFrame(test_report).T
report_df.to_csv('test_classification_report.csv')
print("Classification report saved to test_classification_report.csv")


Final Evaluation on Test Set:
Test Accuracy for Logistic Regression: 0.8484
Classification report saved to test_classification_report.csv
