In [51]:
import re

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report


In [52]:
def clean_hansard(filepath):
    """Load the Hansard CSV and keep only long speeches from the main parties.

    The filters are applied in this order:
      1. 'Labour (Co-op)' is relabelled as 'Labour'.
      2. Only rows whose party is one of the four most frequent labels are kept.
      3. Rows labelled 'Speaker' are dropped.
      4. Only rows whose speech_class is 'Speech' are kept.
      5. Only speeches of at least 1000 characters are kept.

    The shape of the resulting frame is printed, then the frame is returned.
    """
    speeches = pd.read_csv(filepath)
    speeches['party'] = speeches['party'].replace({'Labour (Co-op)': 'Labour'})

    # Frequencies are counted after the Co-op merge, so Labour is one label.
    top_parties = speeches['party'].value_counts().nlargest(4).index.tolist()
    speeches = speeches.loc[speeches['party'].isin(top_parties)]

    # 'Speaker' ends up among the four most frequent labels; drop it.
    speeches = speeches.loc[speeches['party'] != 'Speaker']

    speeches = speeches.loc[speeches["speech_class"] == "Speech"]
    speeches = speeches.loc[speeches["speech"].str.len() >= 1000]

    n_rows, n_columns = speeches.shape
    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_columns}")

    return speeches

# Path of the Hansard extract; being relative, it is resolved against the
# notebook's working directory.
HANSARD_CSV = "p2-texts/hansard40000.csv"
df = clean_hansard(HANSARD_CSV)


Number of rows: 7815
Number of columns: 8


In [None]:
# TF-IDF vectorizer that drops common English stop words and keeps only the
# 3000 most frequent terms as features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)

# Vectorise the speeches; the party of each speech is the classification target.
# NOTE(review): the vectorizer is fitted on every speech before the split, so the
# vocabulary and IDF weights also reflect the test speeches. Fit on the training
# split only if a strict hold-out evaluation is required.
features = vectorizer.fit_transform(df['speech'])
labels = df["party"]

# 80/20 split, stratified by label so that party proportions are preserved in
# both splits, with a random seed of 26 for reproducibility.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=26, stratify=labels
)

# Sanity checks: split sizes and the class balance of the training labels.
print(features_train.shape, features_test.shape)
print(labels_train.value_counts(normalize=True))

(6252, 3000) (1563, 3000)
party
Conservative               0.616603
Labour                     0.296545
Scottish National Party    0.086852
Name: proportion, dtype: float64


The dataset is imbalanced: in the training split the class proportions are Conservative: 0.616603, Labour: 0.296545, Scottish National Party: 0.086852.

In [54]:
def random_forest_and_linearsvm_performance(x_train, x_test, y_train, y_test):
    """Train a Random Forest and a linear-kernel SVM and print their evaluation.

    Both classifiers are fitted on (x_train, y_train) and evaluated on
    (x_test, y_test). For each one the macro-averaged F1 score and the full
    classification report are printed, Random Forest first.

    Args:
        x_train: feature matrix used to fit the classifiers.
        x_test: feature matrix the fitted classifiers predict on.
        y_train: labels aligned with the rows of x_train.
        y_test: true labels aligned with the rows of x_test.

    Returns:
        None; the results are only printed.
    """
    classifiers = (
        # 300 trees in the Random Forest, with a random seed of 26.
        ("Random Forest classifier", RandomForestClassifier(n_estimators=300, random_state=26)),
        ("SVM linear classifier", SVC(kernel="linear", random_state=26)),
    )

    for title, model in classifiers:
        # Use the arguments, not the notebook-level globals, so the function
        # evaluates whatever split the caller passes in.
        model.fit(x_train, y_train)
        predictions = model.predict(x_test)

        macro_f1 = f1_score(y_test, predictions, average="macro")
        report = classification_report(y_test, predictions)

        print(f"{title}\nf1 score: {macro_f1}\nClassification report:\n{report}")

# Evaluate both classifiers on the current train/test split.
random_forest_and_linearsvm_performance(
    x_train=features_train,
    x_test=features_test,
    y_train=labels_train,
    y_test=labels_test,
)


Random Forest classifier
f1 score: 0.6269718149483269
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.74      0.98      0.84       964
                 Labour       0.79      0.43      0.56       463
Scottish National Party       0.96      0.32      0.48       136

               accuracy                           0.76      1563
              macro avg       0.83      0.58      0.63      1563
           weighted avg       0.78      0.76      0.73      1563

SVM linear classifier
f1 score: 0.7871964473639963
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.85      0.92      0.89       964
                 Labour       0.79      0.71      0.75       463
Scottish National Party       0.83      0.65      0.73       136

               accuracy                           0.83      1563
              macro avg       0.83      0.76      0.79      1563

In [55]:
"""Prints the classification report adjusting the parameters of the Tfidfvectorizer so that unigrams, bi-grams and
tri-grams are considered as features."""

# Modified vectorizer to include unigrams (1 word), bigrams (2 word) and trigrams (3 word) sequences.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000, ngram_range=(1,3))
features = vectorizer.fit_transform(df['speech'])
labels = df['party']

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size= 0.2, random_state= 26, stratify= labels)

random_forest_and_linearsvm_performance(features_train, features_test, labels_train, labels_test)


Random Forest classifier
f1 score: 0.6748975043716996
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.76      0.97      0.85       964
                 Labour       0.80      0.49      0.61       463
Scottish National Party       0.95      0.40      0.57       136

               accuracy                           0.78      1563
              macro avg       0.84      0.62      0.67      1563
           weighted avg       0.79      0.78      0.75      1563

SVM linear classifier
f1 score: 0.7955796360349998
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.86      0.92      0.89       964
                 Labour       0.79      0.73      0.76       463
Scottish National Party       0.84      0.65      0.74       136

               accuracy                           0.84      1563
              macro avg       0.83      0.77      0.80      1563

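Adding bigrams and trigrams slightly improved performance for both classifiers: the Random Forest macro-F1 rose from 0.627 to 0.675 (accuracy 0.76 to 0.78), and the linear SVM macro-F1 rose from 0.787 to 0.796 (accuracy 0.83 to 0.84).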

In [None]:
""" Implement a new custom tokenizer and pass it to the tokenizer argument of Tfidfvectorizer. 
Try to achieve the best classification performance with same number of features (3000) and the same three classifiers. 
Print the classification report for the best performing classifier using your tokenizer."""

def custom_tokenizer(text):
    text = text.lower()
    
    return tokens

' def custom_tokenizer(text):\n    text = text.lower(|)\n\n    return tokens '

For the custom tokenizer, I will first try cleaning the text, because unwanted characters such as \n, \n\n, ', ., and -- were observed when parsing the speeches (I may try NLTK first and then spaCy).
I will also try n-grams of up to 2 words.
I will try removing rare tokens.
I will also try removing very frequent words that are likely used by every party and so may not be distinctive.