Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

Defining Paths

In [2]:
train_data_path = "D:/4th year 2nd semester/NLP/Assignment-1 Text Classification/training"
test_data_path="D:/4th year 2nd semester/NLP/Assignment-1 Text Classification/test"

Perprocessing

In [3]:
# # Download NLTK resources
# nltk.download('punkt')

# Initialize Porter Stemmer
stemmer = PorterStemmer()

# Tokenization and normalization
def tokenize_normalize(text):
    tokens = word_tokenize(text)
    normalized_tokens = [stemmer.stem(token) for token in tokens]
    return normalized_tokens

# Extract vocabulary set
def extract_vocabulary(documents):
    vectorizer = CountVectorizer(tokenizer=tokenize_normalize)
    vectorizer.fit_transform(documents)
    return vectorizer.vocabulary_

In [4]:
# Data Preprocessing
def preprocess_data(data_path):
    documents = []
    classes = []
    
    for label in os.listdir(data_path):
        label_path = os.path.join(data_path, label)
        if os.path.isdir(label_path):
            for file_name in os.listdir(label_path):
                file_path = os.path.join(label_path, file_name)
                with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
                    content = file.read()
                    documents.append(content)
                    classes.append(label) 
    
    vocabulary = extract_vocabulary(documents)
    
    return documents, classes, vocabulary

# Load and preprocess data
documents, classes, vocabulary = preprocess_data(train_data_path)
test_documents, test_classes, test_vocabulary = preprocess_data(test_data_path)


In [10]:
print(len (documents))
print(len (test_documents))

11413
4024


TFIDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# TF-IDF Feature Encoding
def tfidf_feature_encoding(documents):
    vectorizer = TfidfVectorizer(tokenizer=tokenize_normalize, vocabulary=vocabulary)
    X = vectorizer.fit_transform(documents)
    return X

# Splitting data into train and test sets
X_tfidf = tfidf_feature_encoding(documents)
y = classes
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


test_x_tfidf = tfidf_feature_encoding(test_documents)
test_y = test_classes 


Word Embedding 

word2vec

In [12]:
from gensim.models import Word2Vec
import numpy as np

# Tokenize documents
tokenized_documents = [tokenize_normalize(doc) for doc in documents]
test_tokenized_documents = [tokenize_normalize(doc) for doc in test_documents]


# Training Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_documents, vector_size=100, window=5, min_count=1, workers=4)

def generate_doc_embeddings(documents, word2vec_model):
    embeddings = []
    for doc in documents:
        doc_embedding = np.mean([word2vec_model.wv[word] for word in doc if word in word2vec_model.wv], axis=0)
        embeddings.append(doc_embedding)
    return np.array(embeddings)

# Generate document embeddings
x_word2vec = generate_doc_embeddings(tokenized_documents, word2vec_model)

# Splitting data into train and test sets
x_train_word2vec, x_test_word2vec, _, _ = train_test_split(x_word2vec, y, test_size=0.2, random_state=42)

test_x_word2vec = generate_doc_embeddings(test_tokenized_documents, word2vec_model)


Fasttext

In [13]:
from gensim.models import FastText

# Train FastText model
fasttext_model = FastText(sentences=tokenized_documents, vector_size=100, window=5, min_count=1, workers=4)

# Function to generate document embeddings using FastText
def generate_doc_embeddings_fasttext(documents, fasttext_model):
    embeddings = []
    for doc in documents:
        doc_embedding = np.mean([fasttext_model.wv[word] for word in doc if word in fasttext_model.wv], axis=0)
        embeddings.append(doc_embedding)
    return np.array(embeddings)

# Generate document embeddings using FastText
X_fasttext = generate_doc_embeddings_fasttext(tokenized_documents, fasttext_model)

# Splitting data into train and test sets
X_train_fasttext, X_test_fasttext, _, _ = train_test_split(X_fasttext, y, test_size=0.2, random_state=42)

# Generate document embeddings for test data using FastText
test_X_fasttext = generate_doc_embeddings_fasttext(test_tokenized_documents, fasttext_model)


SVM

In [14]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

def svm(X_train, y_train, X_test, y_test):
    # Training SVM classifier
    svm_classifier = SVC(kernel='linear')
    svm_classifier.fit(X_train, y_train)

    # Predictions
    y_pred_svm = svm_classifier.predict(X_test)

    # Evaluating SVM classifier
    svm_report = classification_report(y_test, y_pred_svm, output_dict=True)
    macro_avg_f1_score = svm_report['macro avg']['f1-score']
    return macro_avg_f1_score

In [15]:
# Evaluating SVM classifier with tfidf
score = svm(X_train_tfidf, y_train, X_test_tfidf, y_test)
print("SVM Classifier with TF-IDF features (train test split):", score)

test_score = svm(X_tfidf, y, test_x_tfidf, test_y)
print("SVM Classifier with TF-IDF features (test file):", test_score)


SVM Classifier with TF-IDF features (train test split): 0.2864851217060963
SVM Classifier with TF-IDF features (test file): 0.3088820036862362


In [16]:
# Evaluating SVM classifier with Word2Vec embeddings
word2vec_train_score = svm(x_train_word2vec, y_train, x_test_word2vec, y_test)
print("SVM Classifier with Word2Vec embeddings (train-test split):", word2vec_train_score)
word2vec_test_score = svm(x_word2vec, y, test_x_word2vec, test_y)
print("SVM Classifier with Word2Vec embeddings (test file):", word2vec_test_score)


# Evaluating SVM classifier with FastText embeddings
fasttext_train_score = svm(X_train_fasttext, y_train, X_test_fasttext, y_test)
print("SVM Classifier with FastText embeddings (train-test split):", fasttext_train_score)
fasttext_test_score = svm(X_fasttext, y, test_X_fasttext, test_y)
print("SVM Classifier with FastText embeddings (test file):", fasttext_test_score)


SVM Classifier with Word2Vec embeddings (train-test split): 0.22449812843364533
SVM Classifier with Word2Vec embeddings (test file): 0.190862263844645
SVM Classifier with FastText embeddings (train-test split): 0.20494788202662198
SVM Classifier with FastText embeddings (test file): 0.17210019967246384


Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def randomForest(X_train,y_train, X_test, y_test):
    # Initialize Random Forest Classifier
    rf_classifier = RandomForestClassifier()

    # Train the model
    rf_classifier.fit(X_train, y_train)

    # Predictions
    rf_predictions = rf_classifier.predict(X_test)

    # Evaluation
    rf_report = classification_report(y_test, rf_predictions, output_dict=True)
    rf_macro_f1 = rf_report['macro avg']['f1-score']
    # print("Random Forest Macro F1 Score:", rf_macro_f1)
    return rf_macro_f1


In [25]:
# tfidf
rf_score=randomForest(X_train_tfidf, y_train, X_test_tfidf, y_test)
print('RF score with tfidf (test train split)', rf_score)
test_rf_score=randomForest(X_tfidf, y, test_x_tfidf, test_y)
print('RF score with tfidf (test)', test_rf_score)


RF score with tfidf (test train split) 0.15471053802528603
RF score with tfidf (test) 0.16923094708235253


In [18]:
# word2vec
rf_score_word2vec=randomForest(x_train_word2vec, y_train, x_test_word2vec, y_test)
print('RF score with word2vec (test train split)', rf_score_word2vec)
test_rf_score_word2vec=randomForest(x_word2vec, y, test_x_word2vec, test_y)
print('RF score with word2vec (test)', test_rf_score_word2vec)

# fasttext
rf_score_fasttext=randomForest(X_train_fasttext, y_train, X_test_fasttext, y_test)
print('RF score with fasttext (test train split)', rf_score_fasttext)
test_rf_score_fasttext=randomForest(X_fasttext, y, test_X_fasttext, test_y)
print('RF score with fasttext (test)', test_rf_score_fasttext)


RF score with word2vec (test train split) 0.13994509605607774
RF score with word2vec (test) 0.15076227715864104
RF score with fasttext (test train split) 0.1096545262357273
RF score with fasttext (test) 0.1332533705474679


KNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier

def knn(X_train, y_train, X_test, y_test):
    # Initialize KNN Classifier
    knn_classifier = KNeighborsClassifier()

    # Train the model
    knn_classifier.fit(X_train, y_train)

    # Predictions
    knn_predictions = knn_classifier.predict(X_test)

    # Evaluation
    knn_report = classification_report(y_test, knn_predictions, output_dict=True)
    knn_macro_f1 = knn_report['macro avg']['f1-score']
    #print("KNN Macro F1 Score:", knn_macro_f1)
    return knn_macro_f1

In [26]:
# tfidf
knn_score=knn(X_train_tfidf, y_train, X_test_tfidf, y_test)
print('KNN score with tfidf (test train split)', knn_score)
test_knn_score=knn(X_tfidf, y, test_x_tfidf, test_y)
print('KNN score with tfidf (test)', test_knn_score)

KNN score with tfidf (test train split) 0.24910580950546563
KNN score with tfidf (test) 0.2974893559065393


In [21]:
# word2vec
knn_score_word2vec=knn(x_train_word2vec, y_train, x_test_word2vec, y_test)
print('KNN score with word2vec (test train split)', knn_score_word2vec)
test_knn_score_word2vec=randomForest(x_word2vec, y, test_x_word2vec, test_y)
print('KNN score with word2vec (test)', test_knn_score_word2vec)

# fasttext
knn_score_fasttext=knn(X_train_fasttext, y_train, X_test_fasttext, y_test)
print('KNN score with fasttext (test train split)', knn_score_fasttext)
test_knn_score_fasttext=randomForest(X_fasttext, y, test_X_fasttext, test_y)
print('KNN score with fasttext (test)', test_knn_score_fasttext)


KNN score with word2vec (test train split) 0.16470255508656254
KNN score with word2vec (test) 0.1416713169278412
KNN score with fasttext (test train split) 0.12837508831502473
KNN score with fasttext (test) 0.11888476470699161


RNN

In [48]:
import numpy as np
import tensorflow as tf
from sklearn.metrics import f1_score

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode string labels to integer labels
y_encoded = label_encoder.fit_transform(y)

# Define the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_tfidf.shape[1],)),
    tf.keras.layers.Reshape((1, X_tfidf.shape[1])),
    tf.keras.layers.SimpleRNN(64, activation='relu'),
    tf.keras.layers.Dense(91, activation='softmax')
])


# X_tfidf = X_tfidf.toarray()  # Convert to NumPy array if not already
# y = np.array(y)  # Convert to NumPy array if not already

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_tfidf, y_encoded, epochs=10, batch_size=32, validation_split=0.2)

# Convert sparse matrix to dense array
test_x_tfidf_dense = test_x_tfidf.toarray()

# Predict on test set
predictions = np.argmax(model.predict(test_x_tfidf_dense), axis=-1)

# Convert string labels to integer labels for test_y
test_y_encoded = label_encoder.transform(test_y)

# Compute macro avg f1 score
macro_f1 = f1_score(test_y_encoded, predictions, average='macro')
print("Macro Avg F1 Score:", macro_f1)


Epoch 1/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 33ms/step - accuracy: 0.4544 - loss: 3.4734 - val_accuracy: 0.0066 - val_loss: 8.8413
Epoch 2/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 26ms/step - accuracy: 0.6765 - loss: 1.4537 - val_accuracy: 0.0197 - val_loss: 10.7349
Epoch 3/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 27ms/step - accuracy: 0.7514 - loss: 1.0090 - val_accuracy: 0.0219 - val_loss: 11.9475
Epoch 4/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 29ms/step - accuracy: 0.7892 - loss: 0.7659 - val_accuracy: 0.0206 - val_loss: 12.7933
Epoch 5/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 32ms/step - accuracy: 0.8076 - loss: 0.6157 - val_accuracy: 0.0210 - val_loss: 13.3172
Epoch 6/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 28ms/step - accuracy: 0.8101 - loss: 0.5309 - val_accuracy: 0.0219 - val_loss: 13.8087
Epoch 7/10
[1m2

LSTM

In [49]:
# Define the RNN model with LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_tfidf.shape[1],)),
    tf.keras.layers.Reshape((1, X_tfidf.shape[1])),
    tf.keras.layers.LSTM(64, activation='relu'),
    tf.keras.layers.Dense(91, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_tfidf, y_encoded, epochs=10, batch_size=32, validation_split=0.2)

# Convert string labels to integer labels for test_y
test_y_encoded = label_encoder.transform(test_y)

# Predict on test set
predictions = np.argmax(model.predict(test_x_tfidf_dense), axis=-1)

# Compute macro avg f1 score
macro_f1 = f1_score(test_y_encoded, predictions, average='macro')
print("Macro Avg F1 Score:", macro_f1)

Epoch 1/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 96ms/step - accuracy: 0.3780 - loss: 3.7801 - val_accuracy: 0.0066 - val_loss: 8.4407
Epoch 2/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 83ms/step - accuracy: 0.6448 - loss: 1.6078 - val_accuracy: 0.0210 - val_loss: 11.0633
Epoch 3/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 82ms/step - accuracy: 0.7286 - loss: 1.1006 - val_accuracy: 0.0228 - val_loss: 12.9492
Epoch 4/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 83ms/step - accuracy: 0.7785 - loss: 0.8089 - val_accuracy: 0.0223 - val_loss: 14.2877
Epoch 5/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 83ms/step - accuracy: 0.8009 - loss: 0.6569 - val_accuracy: 0.0228 - val_loss: 15.3792
Epoch 6/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 87ms/step - accuracy: 0.8052 - loss: 0.5455 - val_accuracy: 0.0223 - val_loss: 16.1051
Epoch 7/10

In [50]:
# Define the RNN model with LSTM
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_tfidf.shape[1],)),
    tf.keras.layers.Reshape((1, X_tfidf.shape[1])),
    tf.keras.layers.LSTM(128, activation='relu'),
    tf.keras.layers.Dense(91, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X_tfidf, y_encoded, epochs=10, batch_size=32, validation_split=0.2)

# Convert string labels to integer labels for test_y
test_y_encoded = label_encoder.transform(test_y)

# Predict on test set
predictions = np.argmax(model.predict(test_x_tfidf_dense), axis=-1)

# Compute macro avg f1 score
macro_f1 = f1_score(test_y_encoded, predictions, average='macro')
print("Macro Avg F1 Score:", macro_f1)

Epoch 1/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 169ms/step - accuracy: 0.3866 - loss: 3.5482 - val_accuracy: 0.0127 - val_loss: 9.0764
Epoch 2/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 166ms/step - accuracy: 0.6765 - loss: 1.4124 - val_accuracy: 0.0223 - val_loss: 12.0484
Epoch 3/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 170ms/step - accuracy: 0.7670 - loss: 0.8851 - val_accuracy: 0.0232 - val_loss: 14.4091
Epoch 4/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 167ms/step - accuracy: 0.7972 - loss: 0.6593 - val_accuracy: 0.0223 - val_loss: 15.5666
Epoch 5/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 169ms/step - accuracy: 0.8083 - loss: 0.5378 - val_accuracy: 0.0223 - val_loss: 16.6488
Epoch 6/10
[1m286/286[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 167ms/step - accuracy: 0.8168 - loss: 0.4494 - val_accuracy: 0.0206 - val_loss: 17.0948
Epoch