# TASK 2- Text classification

In [1]:
# Import needed libraries
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score

import pandas as pd

## 1. Use the "Train and Test" and the "Negative examples" directories

In [2]:
# Function to load documents from directory along with filenames
def load_data(directory, label):
    documents = []
    labels = []
    filenames = []  # Store filenames (titles)
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename), 'r', encoding='latin-1') as file:
            documents.append(file.read())
            labels.append(label)
            filenames.append(filename)  # Store docs names- for results discussing (5)
    return documents, labels, filenames


## 2. Label the documents

In [3]:
# Load data and label the documents
train_pos_documents, train_pos_labels, train_pos_filenames = load_data(r"C:\Users\anate\IRProject\Train and Test", label=1)
train_neg_documents, train_neg_labels, train_neg_filenames = load_data(r"C:\Users\anate\IRProject\Negative examples", label=0)

## 3. Linguistic Operations

In [4]:
# Combine positive and negative examples
all_documents = train_pos_documents + train_neg_documents
all_labels = train_pos_labels + train_neg_labels
all_filenames = train_pos_filenames + train_neg_filenames

# Function to tokenize and remove stopwords
def preprocess_document(document):
    # Tokenize
    tokens = word_tokenize(document)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens_without_stop_words = [word for word in tokens if word.lower() not in stop_words]
    return tokens_without_stop_words

# Apply preprocessing to all documents
preprocessed_documents = [preprocess_document(doc) for doc in all_documents ]


## 4. Machine learning

### a. 4 classifiers - as we learned in lecturs

In [5]:
classifiers = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'SVM': SVC(),
    'Rocchio': LinearSVC(dual=False),
    'K Nearest Neighbors': KNeighborsClassifier()
}

### b. Perform 10-fold cross validation – train with 90% of the documents and test with the rest

In [6]:
# Split data into training and testing sets (90% training, 10% testing)
X_train, X_test, y_train, y_test = train_test_split(preprocessed_documents, all_labels, test_size=0.1, random_state=42)

# Funtion fo Perform 10-fold cross-validation
def perform_cross_validation(classifier_name, classifier, X_train, y_train, all_filenames):
    vectorizer = CountVectorizer()
    X_train_counts = vectorizer.fit_transform([' '.join(doc) for doc in X_train])  # Join documents into a single string
    X_train_dense = X_train_counts.toarray()  # Convert sparse matrix to dense numpy array
   
    # Perform 10 cross-validation and store the predictions
    cv_predictions[classifier_name] = cross_val_predict(classifier, X_train_dense, y_train, cv=10)
    
    scores = cross_val_score(classifier, X_train_dense, y_train, cv=10)
    misclassified = [y_train, cv_predictions[classifier_name], X_train, all_filenames]
    
    return scores, misclassified


## 5. Report results

In [7]:
# Function to display misclassified documents with titles
def display_misclassified_documents(misclassified):
    true_labels = misclassified[0]  # Extract true labels from misclassified
    predicted_labels = misclassified[1]  # Extract predicted labels from misclassified
    documents = misclassified[2]  # Extract documents from misclassified
    filenames = misclassified[3]  # Extract filenames from misclassified
    
    misclassified_indices = [i for i in range(len(true_labels)) if true_labels[i] != predicted_labels[i]]
    data = []  
    for index in misclassified_indices:
        # Split the title into words - to show prefix of document title in result discussing (5)
        title_words = filenames[index].split()
        # Take the first five words (or fewer if the title has fewer than five words)
        truncated_title = ' '.join(title_words[:3])
        
        data.append({ 
            "Title": f"{truncated_title}..." ,  # Use truncated title
            "True Label": true_labels[index],
            "Predicted Label": predicted_labels[index],
        })
        
    df = pd.DataFrame(data)
    return df



In [8]:
cv_predictions = {}

for classifier_name, classifier in classifiers.items():
    scores, misclassified = perform_cross_validation(classifier_name, classifier, X_train, y_train, all_filenames)
    
    # Evaluation of each classifier
    print(f"\033[1m{classifier_name}\033[0m")
    print(f"Accuracy: {scores.mean():.4f}")
    print(f"Precision: {precision_score(y_train, cv_predictions[classifier_name], average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_train, cv_predictions[classifier_name], average='weighted'):.4f}")
    print(f"F1-Score: {f1_score(y_train, cv_predictions[classifier_name], average='weighted'):.4f} \n")
    
    misclassified_df = display_misclassified_documents(misclassified)
    print("Misclassified Documents:")
    display(misclassified_df)
    print ("\n")
    

[1mMultinomial Naive Bayes[0m
Accuracy: 0.9078
Precision: 0.9161
Recall: 0.9062
F1-Score: 0.9063 

Misclassified Documents:


Unnamed: 0,Title,True Label,Predicted Label
0,A Neural Marker...,1,0
1,Addressing Bias in...,1,0
2,Analyzing social biases...,1,0
3,Casuistry and Social...,1,0
4,Methods to Reduce...,1,0
5,SOCIAL DESIRABILITY BIAS...,1,0
6,Bias mitigation for...,0,1
7,Human Trust Modeling...,1,0
8,Mitigating gender bias...,1,0




[1mSVM[0m
Accuracy: 0.8978
Precision: 0.9012
Recall: 0.8958
F1-Score: 0.8949 

Misclassified Documents:


Unnamed: 0,Title,True Label,Predicted Label
0,A practical tool...,0,1
1,American J Political...,0,1
2,Negative social bias...,0,1
3,Quantifying Social Biases...,0,1
4,Social Biases in...,0,1
5,AI Fairness 360_...,0,1
6,Balanced datasets are...,0,1
7,Bias mitigation with...,0,1
8,Human Trust Modeling...,1,0
9,Individualised responsible artificial...,1,0




[1mRocchio[0m
Accuracy: 0.8756
Precision: 0.8798
Recall: 0.8750
F1-Score: 0.8738 

Misclassified Documents:


Unnamed: 0,Title,True Label,Predicted Label
0,American J Political...,0,1
1,"Applied Linguistics, Social...",0,1
2,Casuistry and Social...,1,0
3,Social Biases in...,0,1
4,SOCIAL DESIRABILITY BIAS...,1,0
5,Towards Understanding and...,0,1
6,Unlearning implicit social...,0,1
7,Balanced datasets are...,0,1
8,Human Trust Modeling...,1,0
9,Medicine and the...,0,1




[1mK Nearest Neighbors[0m
Accuracy: 0.7811
Precision: 0.8169
Recall: 0.7812
F1-Score: 0.7704 

Misclassified Documents:


Unnamed: 0,Title,True Label,Predicted Label
0,American J Political...,0,1
1,British J of...,0,1
2,Exploring Social Desirability...,0,1
3,MODEL BIAS IN...,0,1
4,Negative social bias...,0,1
5,Quantifying Social Biases...,0,1
6,Social Bias and...,0,1
7,Social Biases in...,0,1
8,Towards Understanding and...,0,1
9,Unlearning implicit social...,0,1




