In [3]:
import pandas as pd
import csv

In [5]:
def read_and_clean_file(path):
    df = pd.read_csv(path)

    # rename the ‘Labour (Co-op)’ value in ‘party’ column to ‘Labour’
    df["party"] = df["party"].replace("Labour (Co-op)", "Labour")
    #print(df.shape)

    # remove any rows where the value of the ‘party’ column is not one of the four most common party names, and remove the ‘Speaker’ value
    df = df[df["party"] != "Speaker"]
    top4_parties = df["party"].value_counts().index[:4]
    df = df[df["party"].isin(top4_parties)]
    #print(df.shape)

    # remove any rows where the value in the ‘speech_class’ column is not ‘Speech’
    df = df[df["speech_class"] != "Speaker"]
    #print(df.shape)

    #remove any rows where the text in the ‘speech’ column is less than 1000 characters long.
    df = df[df["speech"].str.len() >= 1000]
    #print(df.shape)

    return df



In [6]:
df = read_and_clean_file("p2-texts/hansard40000.csv")
df["party"].value_counts()

party
Conservative               4819
Labour                     2317
Scottish National Party     679
Liberal Democrat            269
Name: count, dtype: int64

In [None]:
#2a
df.shape

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
def Tfidfvectorizer_split_data(df, ngram: str):
    X = df["speech"]
    y = df["party"]
    # Below condition will consider unigrams, bi-grams and tri-grams features - d part
    if ngram == "tri-gram":
        vectorizer = TfidfVectorizer(stop_words = "english", max_features = 3000, ngram_range=(1,3))
    # This condition will be slected for b part
    else :
        vectorizer = TfidfVectorizer(stop_words = "english", max_features = 3000)
    X_vector = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, stratify=y, random_state=26)
    return X_train, X_test, y_train, y_test

In [12]:
#2b
X_train, X_test, Y_train, Y_test = Tfidfvectorizer_split_data(df, "default")
print("X Train set shape:", X_train.shape)
print("X Test set shape:", X_test.shape)
print("Y Train set shape:", Y_train.shape)
print("Y Test set shape:", Y_test.shape)

X Train set shape: (6467, 3000)
X Test set shape: (1617, 3000)
Y Train set shape: (6467,)
Y Test set shape: (1617,)


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report 

In [15]:
def Random_Forest_and_SVM(X_train, X_test, y_train, y_test):
    results = {}
    #models
    random_forest = RandomForestClassifier(n_estimators=300, random_state=26)
    svm = SVC(kernel='linear', random_state=26)

    #Training models
    print("Training Random Forest model")
    random_forest.fit(X_train, y_train)
    print("Training SVM model")
    svm.fit(X_train, y_train)

    #predictions
    random_forest_pred = random_forest.predict(X_test)
    svm_pred = svm.predict(X_test)

    #macro-average f1 score
    random_forest_f1 = f1_score(y_test, random_forest_pred, average='macro')
    results["Random Forest"] = random_forest_f1
    svm_f1 = f1_score(y_test, svm_pred, average='macro')
    results["SVM"] = svm_f1

    #classification report 
    random_forest_report = classification_report(y_test, random_forest_pred)
    svm_report = classification_report(y_test, svm_pred)

    print("Random Forest Model")
    print(f"Macro-average f1 score: {random_forest_f1}")
    print("Classification Report:")
    print(random_forest_report)

    print("\n")
    print("SVM Model")
    print(f"Macro-average f1 score: {svm_f1}")
    print("Classification Report:")
    print(svm_report)

    return results

In [16]:
#2c
Random_Forest_and_SVM(X_train, X_test, Y_train, Y_test)

Training Random Forest model
Training SVM model
Random Forest Model
Macro-average f1 score: 0.45469001950616234
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.72      0.98      0.83       964
                 Labour       0.75      0.44      0.56       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       0.87      0.29      0.43       136

               accuracy                           0.73      1617
              macro avg       0.59      0.43      0.45      1617
           weighted avg       0.72      0.73      0.69      1617



SVM Model
Macro-average f1 score: 0.5933446121140653
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.83      0.92      0.87       964
                 Labour       0.74      0.71      0.72       463
       Liberal Democrat       1.00      0.07      0.14        54
Sco

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'Random Forest': 0.45469001950616234, 'SVM': 0.5933446121140653}

In [17]:
#2d
X_train, X_test, Y_train, Y_test = Tfidfvectorizer_split_data(df, "tri-gram")
Random_Forest_and_SVM(X_train, X_test, Y_train, Y_test)

Training Random Forest model
Training SVM model
Random Forest Model
Macro-average f1 score: 0.47930475175651455
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.74      0.96      0.83       964
                 Labour       0.75      0.48      0.58       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       0.84      0.35      0.50       136

               accuracy                           0.74      1617
              macro avg       0.58      0.45      0.48      1617
           weighted avg       0.72      0.74      0.71      1617



SVM Model
Macro-average f1 score: 0.5854220473255666
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.84      0.92      0.88       964
                 Labour       0.75      0.73      0.74       463
       Liberal Democrat       1.00      0.04      0.07        54
Sco

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'Random Forest': 0.47930475175651455, 'SVM': 0.5854220473255666}

Implement a new custom tokenizer and pass it to the tokenizer argument of Tfidfvectorizer. You can use this function in any way you like to try to achieve
the best classification performance while keeping the number of features to nomore than 3000, and using the same three classifiers as above. Print the classification report for the best performing classifier using your tokenizer.

In [18]:
import spacy
import re
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2000000

In [23]:
def custom_tokenizer(text):

    # Process text with spaCy
    doc = nlp(text)
    
    tokens = []
    for token in doc:
        if not(token.is_space or token.like_num or token.is_stop):
             if token.pos_ in ["NOUN", "VERB", "ADJ", "ADV"]:
                  if token.is_alpha:
                      tokens.append(token.lemma_.lower())
    return tokens

def Tfidfvectorizer_customtokeniser_split_data(df):
    X = df["speech"]
    y = df["party"]
    vectorizer = TfidfVectorizer(max_features = 3000, ngram_range=(1,3),tokenizer=custom_tokenizer,min_df=20,max_df=0.7)
    X_vector = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, stratify=y, random_state=26)
    return X_train, X_test, y_train, y_test



In [24]:
X_train, X_test, y_train, y_test = Tfidfvectorizer_customtokeniser_split_data(df)
Random_Forest_and_SVM(X_train, X_test, y_train, y_test)



Training Random Forest model
Training SVM model
Random Forest Model
Macro-average f1 score: 0.4333299494722551
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.72      0.98      0.83       964
                 Labour       0.73      0.45      0.56       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       0.91      0.21      0.35       136

               accuracy                           0.73      1617
              macro avg       0.59      0.41      0.43      1617
           weighted avg       0.72      0.73      0.68      1617



SVM Model
Macro-average f1 score: 0.5733794251858857
Classification Report:
                         precision    recall  f1-score   support

           Conservative       0.82      0.93      0.87       964
                 Labour       0.72      0.69      0.70       463
       Liberal Democrat       1.00      0.07      0.14        54
Scot

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'Random Forest': 0.4333299494722551, 'SVM': 0.5733794251858857}