In [36]:
import pandas as pd
import nltk
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

nlp = spacy.load("en_core_web_sm")



[nltk_data] Downloading package punkt to /Users/jimena/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jimena/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/jimena/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
# Clean the dataset and print its shape
    
df = pd.read_csv("p2-texts/hansard40000.csv")
df['party'] = df['party'].replace({'Labour (Co-op)': 'Labour'})
# print(df['party'].unique())

# Remove 'Speaker'
df = df[df['party'] != 'Speaker']
# print(df['party'].value_counts())

party_counts = df['party'].value_counts()
main_parties = party_counts.nlargest(4).index.tolist()
df = df[df['party'].isin(main_parties)]
# print(df['party'].value_counts())

df = df[df["speech_class"] == "Speech"]
speech_lengths = df["speech"].str.len()
speeches = speech_lengths >= 1000
df = df[speeches]

rows, columns = df.shape
print(f"Number of rows: {rows}")
print(f"Number of columns: {columns}")



Number of rows: 8084
Number of columns: 8


In [23]:
# Vectorizer that removes common English words and limits the number of features to the 3000 most frequent terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)

# Vectorise the speeches
features = vectorizer.fit_transform(df['speech'])
labels = df["party"]

# Stratified by labels to ensure class proportions whithin parties, with a random seed of 26.
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size= 0.2, random_state= 26, stratify= labels)

# print(features_train.shape, features_test.shape)
# print(labels_train.value_counts(normalize=True))   

The dataset is imbalanced (Conservative: 0.596103, Labour: 0.286686, Scottish National Party: 0.083965, Liberal Democrat 0.033246)

In [24]:
def random_forest_and_linearsvm_performance(x_train, x_test, y_train, y_test):
    """Trains RandomForest and SVM classifiers and prints macro-average f1 score and classification reports"""

    # 300 trees in the Random Forest, with a random seed of 26.
    random_forest = RandomForestClassifier(n_estimators=300, random_state=26)
    random_forest.fit(features_train, labels_train)
    rf_predict = random_forest.predict(features_test)

    # Added zero_division to handle 0.0 precision warning 
    random_forest_f1 = f1_score(labels_test, rf_predict, average="macro")
    random_forest_report = classification_report(labels_test, rf_predict, zero_division=0)

    svm = SVC(kernel="linear", random_state=26)
    svm.fit(features_train, labels_train)
    svm_predict = svm.predict(features_test)

    svm_f1 = f1_score(labels_test, svm_predict, average="macro")
    svm_report = classification_report(labels_test, svm_predict)

    print(f"Random Forest classifier\nf1 score: {random_forest_f1}\nClassification report:\n{random_forest_report}")
    print(f"SVM linear classifier\nf1 score: {svm_f1}\nClassification report:\n{svm_report}")

random_forest_and_linearsvm_performance(features_train, features_test, labels_train, labels_test)


Random Forest classifier
f1 score: 0.45422557608349956
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.72      0.98      0.83       964
                 Labour       0.75      0.44      0.55       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       0.89      0.29      0.43       136

               accuracy                           0.73      1617
              macro avg       0.59      0.43      0.45      1617
           weighted avg       0.72      0.73      0.69      1617

SVM linear classifier
f1 score: 0.5933446121140653
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.83      0.92      0.87       964
                 Labour       0.74      0.71      0.72       463
       Liberal Democrat       1.00      0.07      0.14        54
Scottish National Party       0.78      0.54      0.64       136

Liberal Democrat is not being predicted with the Random Forest classifier. Getting a warning message solved with zero_division.

Random Forest classifier
f1 score: 0.45422557608349956
SVM linear classifier
f1 score: 0.5933446121140653

In [25]:
"""Prints the classification report adjusting the parameters of the Tfidfvectorizer so that unigrams, bi-grams and
tri-grams are considered as features."""

# Modified vectorizer to include unigrams (1 word), bigrams (2 word) and trigrams (3 word) sequences.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000, ngram_range=(1,3))
features = vectorizer.fit_transform(df['speech'])
labels = df['party']

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size= 0.2, random_state= 26, stratify= labels)

random_forest_and_linearsvm_performance(features_train, features_test, labels_train, labels_test)


Random Forest classifier
f1 score: 0.47930475175651455
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.74      0.96      0.83       964
                 Labour       0.75      0.48      0.58       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       0.84      0.35      0.50       136

               accuracy                           0.74      1617
              macro avg       0.58      0.45      0.48      1617
           weighted avg       0.72      0.74      0.71      1617

SVM linear classifier
f1 score: 0.5854220473255666
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.84      0.92      0.88       964
                 Labour       0.75      0.73      0.74       463
       Liberal Democrat       1.00      0.04      0.07        54
Scottish National Party       0.78      0.56      0.65       136

Random Forest classifier
f1 score: 0.47930475175651455
SVM linear classifier
f1 score: 0.5854220473255666
Adding bigrams and trigrams improved F1 score for RF, not accuracy. Wrost F1 score for SVM.

In [49]:
""" Implement a new custom tokenizer and pass it to the tokenizer argument of Tfidfvectorizer. 
Try to achieve the best classification performance with same number of features (3000) and the same three classifiers. 
Print the classification report for the best performing classifier using your tokenizer."""

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def custom_tokenizer_nltk(text):
    tokens = word_tokenize(text)
    clean_tokens = []
    for token in tokens:
       token = token.lower()
       if token.isalpha() and token not in stop_words:
        lemma = lemmatizer.lemmatize(token)
        if len(token) > 1:
           clean_tokens.append(lemma)
    return clean_tokens
        
# text = "Hello, world! This is a test"
# print(custom_tokenizer_nltk(text))

In [None]:
texts = df['speech']

tokenized_texts = texts.apply(custom_tokenizer_nltk)

# print(tokenized_texts.head())

63     [less, two, week, since, government, launched,...
99     [delighted, announce, last, friday, reached, a...
100    [thank, secretary, state, advance, sight, stat...
101    [right, hon, lady, congratulation, securing, i...
104    [congratulate, secretary, state, recognise, al...
Name: speech, dtype: object


In [50]:
vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer_nltk, 
    preprocessor=None, lowercase=False, stop_words=None, 
    max_features=3000, ngram_range=(1, 3)
    )

features = vectorizer.fit_transform(df['speech'])
labels = df["party"]

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size= 0.2, random_state= 26, stratify= labels)

random_forest_and_linearsvm_performance(features_train, features_test, labels_train, labels_test)



Random Forest classifier
f1 score: 0.5051579755563204
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.73      0.96      0.83       964
                 Labour       0.75      0.48      0.58       463
       Liberal Democrat       1.00      0.06      0.11        54
Scottish National Party       0.90      0.35      0.50       136

               accuracy                           0.74      1617
              macro avg       0.85      0.46      0.51      1617
           weighted avg       0.76      0.74      0.71      1617

SVM linear classifier
f1 score: 0.6252689802421786
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.83      0.93      0.87       964
                 Labour       0.74      0.70      0.72       463
       Liberal Democrat       0.89      0.15      0.25        54
Scottish National Party       0.80      0.55      0.65       136


unigrams
Random Forest classifier
f1 score: 0.43523748230145454
SVM linear classifier
f1 score: 0.5899504098866261
bigrams
Random Forest classifier
f1 score: 0.4960422798007194
SVM linear classifier
f1 score: 0.6227610514154456
trigrams
Random Forest classifier
f1 score: 0.5051579755563204 
# down to 0.4900020900816393 if vectorizer token len >2 insted of >1 
# down to 0.4762973139164108 if not applying lemmatization to tokenizer 
SVM linear classifier
f1 score: 0.6252689802421786
# down to 0.6095657098971895 if tokenizer len >2 insted of >1
# down to 0.5829744192780167 if not applying lemmatization to tokenizer

In [28]:
def custom_tokenizer_spacy(text):
    doc = nlp(text)
    clean_tokens = []
    for token in doc:
        if (token.is_alpha and 
            not token.is_stop and     
            len(token.lemma_) > 1): 
            clean_tokens.append(token.lemma_.lower())
    return clean_tokens

# text = "Hello, world! This is a test"
# print(custom_tokenizer_spacy(text))

In [29]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(), preprocessor=None, lowercase=False)
features = vectorizer.fit_transform(df['speech'])
labels = df['party']

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size= 0.2, random_state= 26, stratify= labels)

random_forest_and_linearsvm_performance(features_train, features_test, labels_train, labels_test)



Random Forest classifier
f1 score: 0.3177120531855518
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.66      1.00      0.80       964
                 Labour       0.82      0.28      0.42       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       1.00      0.03      0.06       136

               accuracy                           0.68      1617
              macro avg       0.62      0.33      0.32      1617
           weighted avg       0.71      0.68      0.60      1617

SVM linear classifier
f1 score: 0.5628320204715067
Classification report:
                         precision    recall  f1-score   support

           Conservative       0.82      0.94      0.88       964
                 Labour       0.75      0.70      0.72       463
       Liberal Democrat       1.00      0.02      0.04        54
Scottish National Party       0.83      0.49      0.62       136


Lower performance and less efficient using spaCy.
Random Forest classifier
f1 score: 0.3177120531855518
SVM linear classifier
f1 score: 0.5628320204715067

After running diffrent values, the reducuction of very common words doesn't make any changes. But performance improves for the SVM classifier removing tokens present in fewer that 3 documments and decreces for the Random Forest classifier.

For the custom tokenizer will try first to clean the text as in the parsing unwanted characters like  \n, \n\n, ', ., -- were observed (may try with NLTK and then with spaCy).
Will try also with up to 2 ngrams.
Will try removing rare tokens.
Will try removing very frequent words that may be used by every party and may not be distinctive.