In [None]:
import pandas as pd
import csv

In [61]:
def read_and_clean_file(path):
    df = pd.read_csv(path)

    # rename the ‘Labour (Co-op)’ value in ‘party’ column to ‘Labour’
    df["party"] = df["party"].replace("Labour (Co-op)", "Labour")
    #print(df.shape)

    # remove any rows where the value of the ‘party’ column is not one of the four most common party names, and remove the ‘Speaker’ value
    df = df[df["party"] != "Speaker"]
    top4_parties = df["party"].value_counts().index[:4]
    df = df[df["party"].isin(top4_parties)]
    #print(df.shape)

    # remove any rows where the value in the ‘speech_class’ column is not ‘Speech’
    df = df[df["speech_class"] != "Speaker"]
    #print(df.shape)

    #remove any rows where the text in the ‘speech’ column is less than 1000 characters long.
    df = df[df["speech"].str.len() >= 1000]
    #print(df.shape)

    return df



In [63]:
df = read_and_clean_file("p2-texts/hansard40000.csv")
df["party"].value_counts()

party
Conservative               4819
Labour                     2317
Scottish National Party     679
Liberal Democrat            269
Name: count, dtype: int64

In [64]:
#2a
df.shape

(8084, 8)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [65]:
def Tfidfvectorizer_split_data(df, ngram: str):
    X = df["speech"]
    y = df["party"]
    if ngram == "tri-gram":
        vectorizer = TfidfVectorizer(stop_words = "english", max_features = 3000, ngram_range=(1,3))
    else :
        vectorizer = TfidfVectorizer(stop_words = "english", max_features = 3000)
    X_vector = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, stratify=y, random_state=26)
    return X_train, X_test, y_train, y_test

In [66]:
#2b
X_train, X_test, Y_train, Y_test = Tfidfvectorizer_split_data(df, "default")
print("X Train set shape:", X_train.shape)
print("X Test set shape:", X_test.shape)
print("Y Train set shape:", Y_train.shape)
print("Y Test set shape:", Y_test.shape)

X Train set shape: (6467, 3000)
X Test set shape: (1617, 3000)
Y Train set shape: (6467,)
Y Test set shape: (1617,)


In [58]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report 

In [67]:
def Random_Forest_and_SVM(X_train, X_test, y_train, y_test):
    results = {}
    #models
    random_forest = RandomForestClassifier(n_estimators=300, random_state=26)
    svm = SVC(kernel='linear', random_state=26)

    #Training models
    print("Training Random Forest model")
    random_forest.fit(X_train, y_train)
    print("Training SVM model")
    svm.fit(X_train, y_train)

    #predictions
    random_forest_pred = random_forest.predict(X_test)
    svm_pred = svm.predict(X_test)

    #macro-average f1 score
    random_forest_f1 = f1_score(y_test, random_forest_pred, average='macro')
    results["Random Forest"] = random_forest_f1
    svm_f1 = f1_score(y_test, svm_pred, average='macro')
    results["SVM"] = svm_f1

    #classification report 
    random_forest_report = classification_report(y_test, random_forest_pred)
    svm_report = classification_report(y_test, svm_pred)

    print("Random Forest Model")
    print(f"Macro-average f1 score: {random_forest_f1}")
    print("Classification Report:")
    print(random_forest_report)

    print("\n")
    print("SVM Model")
    print(f"Macro-average f1 score: {svm_f1}")
    print("Classification Report:")
    print(svm_report)

    return results

In [68]:
#2c
Random_Forest_and_SVM(X_train, X_test, Y_train, Y_test)

Training Random Forest model
Training SVM model
Random Forest Model
Macro-average f1 score: 0.45469001950616234
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.72      0.98      0.83       964
                 Labour       0.75      0.44      0.56       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       0.87      0.29      0.43       136

               accuracy                           0.73      1617
              macro avg       0.59      0.43      0.45      1617
           weighted avg       0.72      0.73      0.69      1617



SVM Model
Macro-average f1 score: 0.5933446121140653
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.83      0.92      0.87       964
                 Labour       0.74      0.71      0.72       463
       Liberal Democrat       1.00      0.07      0.14        54
Sco

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'Random Forest': 0.45469001950616234, 'SVM': 0.5933446121140653}

In [54]:
#2d
X_train, X_test, Y_train, Y_test = Tfidfvectorizer_split_data(df, "tri-gram")
Random_Forest_and_SVM(X_train, X_test, Y_train, Y_test)

Training Random Forest model
Training SVM model
Random Forest Model
Macro-average f1 score: 0.6748975043716996
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.76      0.97      0.85       964
                 Labour       0.80      0.49      0.61       463
Scottish National Party       0.95      0.40      0.57       136

               accuracy                           0.78      1563
              macro avg       0.84      0.62      0.67      1563
           weighted avg       0.79      0.78      0.75      1563



SVM Model
Macro-average f1 score: 0.7955796360349998
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.86      0.92      0.89       964
                 Labour       0.79      0.73      0.76       463
Scottish National Party       0.84      0.65      0.74       136

               accuracy                           0.84      1563
   

{'Random Forest': 0.6748975043716996, 'SVM': 0.7955796360349998}