In [14]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score

from collections import Counter

import spacy
nlp = spacy.load("en_core_web_sm")

In [4]:
path = "C:/Users/Baleid/Desktop/BirkBeck Study/NLP/Coursework/nlp-coursework-2024-25-N-PolarStar/p2-texts/hansard40000.csv"
df = pd.read_csv(path)


In [5]:
# Part A

#Data Cleaning

df['party'] = df['party'].replace('Labour (Co-op)', 'Labour')

#top 4 parties
df_no_speaker= df[df['party'] != 'Speaker']
top_four_parties = df_no_speaker['party'].value_counts().nlargest(4).index
df_top_four = df_no_speaker[df_no_speaker['party'].isin(top_four_parties)]

# not speech
df_not_speech = df_top_four[df_top_four['speech_class'] == 'Speech']

# less than 1000 characters
df_clean = df_not_speech[df_not_speech['speech'].str.len() >= 1000]
print("Dimensions of the final dataframe:")
print(df_clean.shape)


Dimensions of the final dataframe:
(8084, 8)


In [6]:
#Part B
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(df_clean['speech'])
y = df_clean['party']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=26)

In [None]:
#Part C: RF and SVM

#Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=300, random_state=26)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)

print("Random Forest Classifier")
print(f"Macro-average F1 score: {f1_score(y_test, y_pred_rf, average='macro')}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf, zero_division=0))

#Support Vector Machine Classifier
svm_clf = SVC(kernel='linear')
svm_clf.fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

print("SVM Classifier with Linear Kernel")
print(f"Macro-average F1 score: {f1_score(y_test, y_pred_svm, average='macro')}")
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))

Random Forest Classifier
Macro-average F1 score: 0.44849276102645497
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.73      0.97      0.83      1205
                 Labour       0.74      0.45      0.56       579
       Liberal Democrat       0.00      0.00      0.00        67
Scottish National Party       0.88      0.26      0.40       170

               accuracy                           0.73      2021
              macro avg       0.59      0.42      0.45      2021
           weighted avg       0.72      0.73      0.69      2021



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVM Classifier with Linear Kernel
Macro-average F1 score: 0.5846137591595653
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.82      0.92      0.87      1205
                 Labour       0.72      0.68      0.70       579
       Liberal Democrat       0.83      0.07      0.14        67
Scottish National Party       0.78      0.53      0.63       170

               accuracy                           0.79      2021
              macro avg       0.79      0.55      0.58      2021
           weighted avg       0.79      0.79      0.78      2021



In [None]:
#Part D 
ngram_vectorizer = TfidfVectorizer(stop_words='english', max_features=3000, ngram_range=(1, 3))
X_ngram = ngram_vectorizer.fit_transform(df_clean['speech'])
X_train_ngram, X_test_ngram, y_train_ngram, y_test_ngram = train_test_split(X_ngram, y, stratify=y, random_state=26, test_size=0.30)

#Random Forest Classifier with N-grams
rf_clf_ngram = RandomForestClassifier(n_estimators=300, random_state=26)
rf_clf_ngram.fit(X_train_ngram, y_train_ngram)
y_pred_rf_ngram = rf_clf_ngram.predict(X_test_ngram)
print("Random Forest Classifier with N-grams")
print("Classification Report:")
print(classification_report(y_test_ngram, y_pred_rf_ngram, zero_division=0))

#Support Vector Machine Classifier with N-grams
svm_clf_ngram = SVC(kernel='linear')
svm_clf_ngram.fit(X_train_ngram, y_train_ngram)
y_pred_svm_ngram = svm_clf_ngram.predict(X_test_ngram)
print("SVM Classifier with N-grams")
print("Classification Report:")
print(classification_report(y_test_ngram, y_pred_svm_ngram))

Random Forest Classifier with N-grams
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.74      0.96      0.84      1446
                 Labour       0.76      0.50      0.61       695
       Liberal Democrat       0.00      0.00      0.00        81
Scottish National Party       0.85      0.37      0.51       204

               accuracy                           0.75      2426
              macro avg       0.59      0.46      0.49      2426
           weighted avg       0.73      0.75      0.72      2426



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVM Classifier with N-grams
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.83      0.92      0.87      1446
                 Labour       0.72      0.72      0.72       695
       Liberal Democrat       1.00      0.02      0.05        81
Scottish National Party       0.76      0.53      0.63       204

               accuracy                           0.80      2426
              macro avg       0.83      0.55      0.57      2426
           weighted avg       0.80      0.80      0.78      2426



In [17]:
#Part E
def custom_tokenizer(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        if token.is_alpha and not token.is_stop:
            tokens.append(token.lemma_.lower())
    # Count frequencies
    token_counts = Counter(tokens)
    tokens = [t for t in tokens if token_counts[t] > 1]
    
    return tokens

custom_vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer, max_features=3000)
X_custom = custom_vectorizer.fit_transform(df_clean['speech'])
X_train_custom, X_test_custom, y_train_custom, y_test_custom = train_test_split(X_custom, y, stratify=y, random_state=26)

rf_clf_custom = RandomForestClassifier(n_estimators=300)
rf_clf_custom.fit(X_train_custom, y_train_custom)
y_pred_rf_custom = rf_clf_custom.predict(X_test_custom)
print("Random Forest Classifier with Custom Tokenizer")
print("Classification Report:")
print(classification_report(y_test_custom, y_pred_rf_custom, zero_division=0))

svm_clf_custom = SVC(kernel='linear')
svm_clf_custom.fit(X_train_custom, y_train_custom)
y_pred_svm_custom = svm_clf_custom.predict(X_test_custom)
print("SVM Classifier with Custom Tokenizer")
print("Classification Report:")
print(classification_report(y_test_custom, y_pred_svm_custom))



Random Forest Classifier with Custom Tokenizer
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.72      0.97      0.82      1205
                 Labour       0.72      0.43      0.54       579
       Liberal Democrat       0.00      0.00      0.00        67
Scottish National Party       0.81      0.28      0.42       170

               accuracy                           0.72      2021
              macro avg       0.56      0.42      0.44      2021
           weighted avg       0.70      0.72      0.68      2021

SVM Classifier with Custom Tokenizer
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.79      0.91      0.84      1205
                 Labour       0.69      0.62      0.65       579
       Liberal Democrat       1.00      0.03      0.06        67
Scottish National Party       0.75      0.43      0.55       170

               accur