In [None]:
import pandas as pd
import csv

In [None]:
def read_and_clean_file(path):
    """Load the Hansard speeches CSV and apply the cleaning steps for part 2a.

    Steps, in order:
      1. Relabel the party value "Labour (Co-op)" as "Labour".
      2. Drop rows whose party is "Speaker", then keep only the rows that
         belong to the four most frequent remaining parties.
      3. Keep only rows whose ``speech_class`` is exactly "Speech".
      4. Keep only rows whose ``speech`` text is at least 1000 characters.

    The shape of the cleaned frame is printed before it is returned.

    Args:
        path: Anything ``pandas.read_csv`` accepts (a file path or a
            file-like object). The data must provide the columns
            ``party``, ``speech_class`` and ``speech``.

    Returns:
        pandas.DataFrame: The rows that survive every filter.
    """
    df = pd.read_csv(path)

    # Rename the 'Labour (Co-op)' value in the 'party' column to 'Labour'.
    df["party"] = df["party"].replace("Labour (Co-op)", "Labour")

    # 'Speaker' is removed before counting so it cannot occupy one of the
    # four "most common party" slots.
    df = df[df["party"] != "Speaker"]
    top4_parties = df["party"].value_counts().index[:4]
    df = df[df["party"].isin(top4_parties)]

    # Keep only rows classed as 'Speech'; every other speech_class value is
    # discarded. The comparison must be against "Speech" -- "Speaker" is a
    # party label, not a speech class.
    df = df[df["speech_class"] == "Speech"]

    # Remove rows whose speech text is shorter than 1000 characters.
    df = df[df["speech"].str.len() >= 1000]

    print(df.shape)

    return df



In [None]:
# Load the Hansard CSV and apply the cleaning steps; later cells read this `df`.
df = read_and_clean_file("p2-texts/hansard40000.csv")
# Show the cleaned frame as the cell's output.
df
#df["party"].value_counts()

In [None]:
#2a
# Part 2a answer: (rows, columns) of the cleaned DataFrame.
df.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
def Tfidfvectorizer_split_data(df, ngram: str):
    """Turn the speeches into TF-IDF features and split them into train/test sets.

    Args:
        df: Table-like object exposing a "speech" column (the documents) and
            a "party" column (the labels).
        ngram: Pass "3-gram" to use unigrams, bigrams and trigrams as
            features (part d). Any other value keeps the vectorizer's
            default n-gram range (part b).

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` in the order produced
        by ``train_test_split``.
    """
    speeches = df["speech"]
    parties = df["party"]

    # Settings shared by parts b and d: omit English stop words and cap the
    # vocabulary at 3000 features.
    tfidf_options = {"stop_words": "english", "max_features": 3000}
    if ngram == "3-gram":
        # Part d: also consider bigrams and trigrams as features.
        tfidf_options["ngram_range"] = (1, 3)

    # NOTE(review): the vectorizer is fitted on every speech before the
    # split below, so test documents also contribute to the fitted
    # vocabulary/weights. Kept as-is to preserve the reported numbers.
    features = TfidfVectorizer(**tfidf_options).fit_transform(speeches)

    # Stratified 80/20 split on party, random seed 26.
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, parties, test_size=0.2, stratify=parties, random_state=26
    )
    return train_features, test_features, train_labels, test_labels

In [None]:
#2b
# Part 2b: vectorize with the default-n-gram branch, split, and report the shape of each piece.
# The four names bound here are reused by the 2c cell.
X_train, X_test, Y_train, Y_test = Tfidfvectorizer_split_data(df, "default")
print("X Train set shape:", X_train.shape)
print("X Test set shape:", X_test.shape)
print("Y Train set shape:", Y_train.shape)
print("Y Test set shape:", Y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report 

In [None]:
def Random_Forest_and_SVM(X_train, X_test, y_train, y_test):
    """Train a random forest and a linear-kernel SVM, then report test-set scores.

    Both models are fitted on the training split. For each model the
    macro-averaged F1 score and the classification report on the test split
    are printed.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        dict: Macro-averaged F1 score per model, keyed "Random Forest" and
        "SVM" (inserted in that order).
    """
    # One entry per classifier; the dict order drives both the training and
    # the reporting order, so adding a model is a one-line change.
    models = {
        "Random Forest": RandomForestClassifier(n_estimators=300, random_state=26),
        "SVM": SVC(kernel='linear', random_state=26),
    }

    # Fit every model first so all progress messages precede the reports.
    for name, model in models.items():
        print(f"Training {name} model \n")
        model.fit(X_train, y_train)

    results = {}
    for position, (name, model) in enumerate(models.items()):
        predictions = model.predict(X_test)
        results[name] = f1_score(y_test, predictions, average='macro')
        report = classification_report(y_test, predictions)

        if position:
            # Blank separator between consecutive model reports.
            print("\n")
        print(f"{name} Model:")
        print(f"Macro-average f1 score: {results[name]}")
        print("Classification Report:")
        print(report)

    return results

In [None]:
#2c
# Part 2c: train and evaluate both classifiers on the split produced in the 2b cell.
Random_Forest_and_SVM(X_train, X_test, Y_train, Y_test)

In [None]:
#2d
# Part 2d: rebuild the features with unigrams, bigrams and trigrams, then rerun both classifiers.
# This rebinds X_train / X_test / Y_train / Y_test from the 2b cell.
X_train, X_test, Y_train, Y_test = Tfidfvectorizer_split_data(df, "3-gram")
Random_Forest_and_SVM(X_train, X_test, Y_train, Y_test)

# Part e
Implement a new custom tokenizer and pass it to the `tokenizer` argument of `TfidfVectorizer`. You can use this function in any way you like to try to achieve
the best classification performance while keeping the number of features to no more than 3000, and using the same three classifiers as above. Print the classification report for the best-performing classifier using your tokenizer.

In [None]:
import spacy
import re
# Load spaCy's "en_core_web_sm" pipeline once; custom_tokenizer calls this shared `nlp` object.
nlp = spacy.load("en_core_web_sm")
# Raise the pipeline's max_length setting to 2,000,000 -- presumably so very long speeches
# are not rejected by spaCy's input-length check; confirm against the longest speech.
nlp.max_length = 2000000

In [None]:
def custom_tokenizer(text):
    """Tokenize ``text`` with the shared spaCy pipeline into lower-cased lemmas.

    Only tokens whose ``is_alpha`` flag is set are kept; all other tokens
    are dropped.

    Args:
        text: Raw document string.

    Returns:
        list[str]: Lower-cased lemma of each kept token, in document order.
    """
    parsed = nlp(text)
    return [word.lemma_.lower() for word in parsed if word.is_alpha]

def Tfidfvectorizer_customtokeniser_split_data(df, ngram:str):
    """Build TF-IDF features with ``custom_tokenizer`` and split them into train/test sets.

    Progress messages are printed before and after each stage.

    Args:
        df: Table-like object exposing a "speech" column (the documents) and
            a "party" column (the labels).
        ngram: Pass "3-gram" to use unigrams, bigrams and trigrams; any
            other value keeps the vectorizer's default n-gram range.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` in the order produced
        by ``train_test_split``.
    """
    speeches = df["speech"]
    parties = df["party"]

    print("Starting TfidfVectorizer \n")

    # Settings common to both modes: at most 3000 features, the custom
    # tokenizer, and the min_df / max_df document-frequency cut-offs.
    tfidf_options = {
        "max_features": 3000,
        "tokenizer": custom_tokenizer,
        "min_df": 20,
        "max_df": 0.7,
    }
    if ngram != "3-gram":
        print("Using default n-gram range : (1,1) \n")
    else:
        print("Using n-gram range : (1,3) - uni-gram, bi-gram and tri-gram \n")
        tfidf_options["ngram_range"] = (1, 3)
    vectorizer = TfidfVectorizer(**tfidf_options)

    print("Vectorizing data with custom tokenizer \n")
    features = vectorizer.fit_transform(speeches)
    print("Vectorization completed \n")

    print("Splitting the data into training and testing sets \n")
    # Stratified 80/20 split on party, random seed 26.
    pieces = train_test_split(
        features, parties, test_size=0.2, stratify=parties, random_state=26
    )
    train_features, test_features, train_labels, test_labels = pieces
    print("Data Split completed \n")
    return train_features, test_features, train_labels, test_labels

In [None]:
import time 
# Part e, run 1: custom tokenizer with the default n-gram range.
# The timer spans vectorizing, splitting, training and evaluation together.
start_time = time.time()
# Rebinds X_train / X_test from the earlier cells and binds lowercase y_train / y_test.
X_train, X_test, y_train, y_test = Tfidfvectorizer_customtokeniser_split_data(df, "default")
Random_Forest_and_SVM(X_train, X_test, y_train, y_test)
end_time = time.time()
# Elapsed wall-clock time in seconds (difference of two time.time() readings).
duration = end_time - start_time
print(f"Duration for default n-gram: {duration}")
print("\n")

# Part e, run 2: same pipeline with unigrams, bigrams and trigrams.
# Separate X1_* / y1_* names keep run 1's split available afterwards.
start_time = time.time()
print("For uni-gram, bi-gram and tri-gram")
X1_train, X1_test, y1_train, y1_test = Tfidfvectorizer_customtokeniser_split_data(df, "3-gram")
Random_Forest_and_SVM(X1_train, X1_test, y1_train, y1_test)
end_time = time.time()
duration = end_time - start_time
print(f"Duration for 3-gram: {duration}")
