In [1]:
import pandas as pd

# Load the Hansard speeches; the path is relative to the working directory.
df = pd.read_csv('p2-texts/hansard40000.csv')

# Fold 'Labour (Co-op)' into 'Labour' so both are counted as one party.
df['party'] = df['party'].replace('Labour (Co-op)', 'Labour')

# Keep only rows belonging to the four most common parties. 'Speaker' is
# dropped from the counts before ranking, so it can never occupy one of the
# four slots. errors='ignore' stops this cell raising KeyError when the data
# contains no 'Speaker' rows at all.
top_parties = (
    df['party']
    .value_counts()
    .drop('Speaker', errors='ignore')
    .nlargest(4)
    .index
)
df = df[df['party'].isin(top_parties)]

# Keep only rows whose 'speech_class' is exactly 'Speech'.
df = df[df['speech_class'] == 'Speech']

# Keep only speeches that are at least 1000 characters long.
df = df[df['speech'].str.len() >= 1000]

print(df.shape)

(8084, 8)


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# TF-IDF features: English stop words removed, vocabulary capped at 3000 terms.
# NOTE(review): the vectorizer is fitted on every speech before the split
# below, so its vocabulary and IDF weights also reflect the test documents.
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X = vectorizer.fit_transform(df['speech'])
y = df['party']

# Stratified 75/25 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=26,
)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report

# Random forest: 300 trees, fixed seed.
rf = RandomForestClassifier(n_estimators=300, random_state=26)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# zero_division=0 scores a class that is never predicted as 0 (the same value
# the default setting falls back to) without emitting UndefinedMetricWarning.
print('RandomForest Macro F1 Score:', f1_score(y_test, rf_pred, average='macro', zero_division=0))
print('RandomForest Classification Report:\n', classification_report(y_test, rf_pred, zero_division=0))

# Linear SVM trained and scored on the same split for a like-for-like comparison.
svm = LinearSVC(random_state=26)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)

print('SVM Macro F1 Score:', f1_score(y_test, svm_pred, average='macro', zero_division=0))
print('SVM Classification Report:\n', classification_report(y_test, svm_pred, zero_division=0))

RandomForest Macro F1 Score: 0.4456041754705794
RandomForest Classification Report:
                          precision    recall  f1-score   support

           Conservative       0.72      0.97      0.83      1205
                 Labour       0.75      0.45      0.56       579
       Liberal Democrat       0.00      0.00      0.00        67
Scottish National Party       0.84      0.25      0.39       170

               accuracy                           0.73      2021
              macro avg       0.58      0.42      0.45      2021
           weighted avg       0.72      0.73      0.69      2021



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


SVM Macro F1 Score: 0.6438622039189352
SVM Classification Report:
                          precision    recall  f1-score   support

           Conservative       0.84      0.91      0.87      1205
                 Labour       0.73      0.71      0.72       579
       Liberal Democrat       0.94      0.22      0.36        67
Scottish National Party       0.70      0.56      0.62       170

               accuracy                           0.80      2021
              macro avg       0.80      0.60      0.64      2021
           weighted avg       0.80      0.80      0.79      2021



In [4]:
# Rebuild the TF-IDF features with unigrams, bigrams and trigrams; the
# vocabulary is still capped at 3000 terms and English stop words are removed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=3000,
    ngram_range=(1, 3)
)

# `y` is the party label series defined in the earlier feature-extraction cell.
X = vectorizer.fit_transform(df['speech'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=26, stratify=y
)

# Random forest on the n-gram features: 300 trees, fixed seed.
rf = RandomForestClassifier(n_estimators=300, random_state=26)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# zero_division=0 scores a class that is never predicted as 0 (the same value
# the default setting falls back to) without emitting UndefinedMetricWarning.
print('RandomForest with N-grams Macro F1 Score:', f1_score(y_test, rf_pred, average='macro', zero_division=0))
print('RandomForest with N-grams Classification Report:\n', classification_report(y_test, rf_pred, zero_division=0))

# Linear SVM on the same n-gram features and split.
svm = LinearSVC(random_state=26)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print('SVM with N-grams Macro F1 Score:', f1_score(y_test, svm_pred, average='macro', zero_division=0))
print('SVM with N-grams Classification Report:\n', classification_report(y_test, svm_pred, zero_division=0))

RandomForest with N-grams Macro F1 Score: 0.4837657895594431
RandomForest with N-grams Classification Report:
                          precision    recall  f1-score   support

           Conservative       0.74      0.97      0.84      1205
                 Labour       0.76      0.47      0.58       579
       Liberal Democrat       0.00      0.00      0.00        67
Scottish National Party       0.87      0.36      0.51       170

               accuracy                           0.74      2021
              macro avg       0.59      0.45      0.48      2021
           weighted avg       0.73      0.74      0.71      2021



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


SVM with N-grams Macro F1 Score: 0.6470757532450477
SVM with N-grams Classification Report:
                          precision    recall  f1-score   support

           Conservative       0.85      0.91      0.88      1205
                 Labour       0.74      0.73      0.74       579
       Liberal Democrat       0.76      0.19      0.31        67
Scottish National Party       0.75      0.59      0.66       170

               accuracy                           0.81      2021
              macro avg       0.78      0.61      0.65      2021
           weighted avg       0.81      0.81      0.80      2021

