# Part Two

## Functions

### Libraries

In [87]:
# Libraries
from pathlib import Path
import pandas as pd
import os
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from nltk import word_tokenize, sent_tokenize
import re
import contractions
import spacy


### Read data

In [88]:
def read_speeches_csv(path=Path.cwd() / "texts" / "p2-texts"):
    '''
    Function to load csv files into a pandas data frame

    Args:
        Function defaults to a specific location to search for the files unless otherwise specified

    Returns
        Pandas data frame
    '''
    # Extract file name
    file = os.listdir(path)[0]
    file_load = os.path.join(path, file)

    # Read data
    df = pd.read_csv(file_load)
    return df


### Clean data

In [89]:
def speeches_clean(df):
    '''
    Function that takes a data frame containing speeches, and performs custom cleaning tasks on it
    Custom cleaning tasks are:
        - Column 'party': replaces all entries 'Labour (Co-op)' with 'Labour'
        - Column 'party': removes all values where entry is 'Speaker'
        - Column 'party': only keeps the rows of the four most common parties
                          Finds the frequency count for each party, and keep the top 4 only
        - Column 'speech_class': removes all rows where value is NOT 'Speech'
        - Column 'speech': removes any entries where the length of the speech is less than 1000 characters

    Args: 
        df: Pandas data frame

    Returns:
        A Pandas data frame, cleaned
    '''
    # (a).i Clean Labour (Co-op) values
    df_cleaned = df.replace('Labour (Co-op)', 'Labour')

    # (a).ii Remove rows where 'party' == 'Speaker'
    '''Note: Remove speaker rows first, otherwise this will interfere with finding the most common parties'''
    df_cleaned = df_cleaned[df_cleaned['party'] != 'Speaker']

    # (a).ii Remove rows where the value of 'party' is not one of the 4 most common parties
    parties_count = df_cleaned['party'].value_counts().sort_values(ascending=False)
    # # Extract the name of the 4 most common parties 
    top4_parties = parties_count.index[:4].tolist()
    # # Filter to top 4 most common parties
    df_cleaned2 = df_cleaned[df_cleaned['party'].isin(top4_parties)]

    # (a).iii Remove rows where value in 'speech_class' is not 'Speech
    df_cleaned2 = df_cleaned2[df_cleaned2['speech_class'] == 'Speech']

    # (a).iv Remove rows where the text in speech columns is less than 1000
    df_out = df_cleaned2[df_cleaned2['speech'].str.len() >= 1000]

    return df_out


### Machine Learning Pipeline

In [None]:
def ml_pipeline(**kwargs):
    '''
    Function which processes and build ML models given the speeches data and prepares the data to be fed into ML models:
    The pipeline:
        Splits into train, test sets
        Vectorises the data
        Trains a RandomForest Model
        Trains a Linear SVM classifer
        Extracts the CLassification Report for each model
        Macro-Average F1 Score
    
    Arguments can be passed as key value pairs. Some arguments are mandatory whilst other are optionals. When optional arguments are not provided
    the function will use defaul values
    Ars:
        data (mandatory): A cleaned pandas data frame
        ngram (optional): A tuple containing the ngram to consider to pass in the TfidVectorizer function
                          default value: (1,1) unigrams
        stop_words (optional): A string containing the value for the stop_words argument for TfidVectorizer. If set ti 'english', stop words would be removed
                               default value: None - Stop words would not be removed
        class_weights (optional): Balances the weight for each class in the model depending on frequency counts

    '''
    # Extract input parameters
    input_dict = kwargs

    # Extract data from input
    df = input_dict.get('data')
    ngram = input_dict.get('ngram', (1,1))
    stop_words = input_dict.get('stop_words', None)
    tokenizer = input_dict.get('tokenizer', None)
    class_weight = input_dict.get('class_weight', None)

    # Tokenizer print object 
    if tokenizer is not None:
        token_print = tokenizer.__name__
    else:
        token_print = tokenizer
    print("\nArguments:")
    print(f"\tNgram: {ngram}\n\tStop words: {stop_words}\n\tTokenizer: {token_print}\n\tClass Weights: {class_weight}")

    # (b) Generate object that splits data using stratified sampling, and random seed of 26
    splitter_obj = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 26) 
    # Split data
    for train_index, test_index in splitter_obj.split(df, df['party']):
        train = df.iloc[train_index]
        test = df.iloc[test_index]

    # (b) Split target in both training and testing set
    y_train, y_test = train['party'], test['party']

    # (b) Create vectorised data for x objects
    '''
    Fixed Arguments (required by asignment promt)
        Max features: set to 3000
        Stop words: set to Enlish

    Flexible Arguments
        stop_words: defined by parameters when function is called (Set to default value if not specified)
        ngram: defined by parameters when function is called (Set to default value if not specified)
    '''
    vectorizer = TfidfVectorizer(max_features = 3000, 
                                 stop_words=stop_words, 
                                 ngram_range = ngram,
                                 tokenizer = tokenizer)
    x_train = vectorizer.fit_transform(train['speech'])
    x_test = vectorizer.transform(test['speech'])

    # (c) Train random forest
    random_forest = RandomForestClassifier(n_estimators=300, n_jobs = -1, class_weight=class_weight) 
    random_forest.fit(x_train, y_train)
    random_forest_y_predict = random_forest.predict(x_test)

    # (c) Train SVM
    svm = LinearSVC(class_weight=class_weight)
    svm.fit(x_train, y_train)
    svm_y_predict = svm.predict(x_test)

    # Get label names
    target_names = y_test.unique()

    # Results section 
    print(f"{"="*20} Random Forest Performance {"="*20}")
    rf_cr = classification_report(y_test, random_forest_y_predict, target_names = target_names, output_dict = True)
    print(classification_report(y_test, random_forest_y_predict, target_names = target_names))

    print(f"{"="*20} SVC Performance {"="*20}")
    svc_cr = classification_report(y_test, svm_y_predict, target_names = target_names, output_dict = True)
    print(classification_report(y_test, svm_y_predict, target_names = target_names))

    return {'rf': rf_cr, 'svc': svc_cr}


### Custom Tokenizers

#### Basic Tokenizer

In [31]:
def my_tokenizer_basic(text):
    '''
    Basic tokenizer:
        Removes special break characters, such as \n, \t etc
        Removes any extra white spaces 
        Uses nltk word tokenizer to split the words into objects
        Only keeps alphabetical objects, ignores numeric and punctuation marks
        It keeps enligh stop words
    '''
    # Clean the text. Remove special characters, such as \n, \t etc and extra white spaces
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalpha()]

In [32]:
'''Test tokenizer
    - How does it handle contractions: Don't, dont, let's, lets
    - How does it handle name objects: Prime Minister, Chris, The Conservative Party
    - Special characters: \n\t
'''
test_text = "test tokenizer\n\t. contractions!, SUCH as dont, won't, co-operate and punctuation? how the tokenizer handles these? #tokenizing #ml. Also, we check for Prime Minister, the speaker, mr Speaker and see how these are treated too, along with numeric values 1000 1,000 01/01/2020"

print(my_tokenizer_basic(test_text))

['test', 'tokenizer', 'contractions', 'such', 'as', 'dont', 'wo', 'and', 'punctuation', 'how', 'the', 'tokenizer', 'handles', 'these', 'tokenizing', 'ml', 'also', 'we', 'check', 'for', 'prime', 'minister', 'the', 'speaker', 'mr', 'speaker', 'and', 'see', 'how', 'these', 'are', 'treated', 'too', 'along', 'with', 'numeric', 'values']


#### Sentence Tokenizer

The function below tries a sentence tokenizer. By capturing the full embedded meaning of a sentence instead of a word itself, it is hoped that the model has access to more contextual information and might be able to better predict party based on speech. It is possible that sentences can carry more contextual information than specific words

In [33]:
def my_tokenizer_sentence(text):
    '''
    Sentence tokenizer:
        Removes special break characters, such as \n, \t etc
        Removes any extra white spaces 
        Uses nltk sentence tokenizer to split the senteces into objects
    '''
    # Clean the text. Remove special characters, such as \n, \t etc and extra white spaces
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return sent_tokenize(text)

In [34]:
# Test tokenizer
print(my_tokenizer_sentence(test_text))

['test tokenizer .', "contractions!, SUCH as dont, won't, co-operate and punctuation?", 'how the tokenizer handles these?', '#tokenizing #ml.', 'Also, we check for Prime Minister, the speaker, mr Speaker and see how these are treated too, along with numeric values 1000 1,000 01/01/2020']


From the examples above it seems that we need a tokenizer that provides cleaner data. 

The tokenizer below attempts to solve the following issues:

- Words contractions:
    Words like `won't` would be stored in a different vector than `will` and `not`, despite having the same meaning. The tokenizer below uses the `contraction` library to expand these contractions into a uniform output. Anytime `won't` is encountered, it gets transformed to `will not`. This consistency across the corpus might provide more accurate representation and improve the performance of the model.

- Named entities:
    Names entities like `Prime Minister` would be stored as two vectors, one for `prime` and one for `minister`. If in the corpus the word `prime` appears in a different context, such as a discussion on `prime TV shows`, the word `prime` would get a frequency count of 2, despite these two words having very different meaning given their context. The tokenizer below uses `Spacy` `en_core_web_lg` to detect all name entities, and when found, these are collapsed into a single word, i.e., `primeminister`. The objective is that each time `Prime Minister` appears, it is encoded as the entity of `Prime Minister` and not two separate words. 
    This should also help with stop words, as `The Church` would become `thechurch` as it refers to a specific entity, thus increasing the resolution and context of stop words as well. 

- Standard cleaning:
    This tokenizer also applies standard cleaning, such as converting all words to lower case, removes punctuation marks and digits. 

- Stop words:
    Stop words are not removed as these might be significant and perhaps could be part of named entities.

Unfortunately, this complex tokenization did not add any predictive power to our models. In fact, it performed worse than some simpler tokenizers. 



In [35]:

nlp = spacy.load("en_core_web_sm")

def my_tokenizer_spacy(text):
    

    # clean special chatacters and remove extra spaces 
    text_trimmed = re.sub(r'\s+', ' ', text)
    # Remove extra spaces and transform to lower case
    text_trimmed = text_trimmed.strip()
    # Do initial simple split
    token_iter = text_trimmed.split()

    # fix contractions only for words with '
    fixed_contractions = [] 
    for word in token_iter:
        if "'" in word:
            fixed_contractions.append(contractions.fix(word))
        else:
            fixed_contractions.append(word)
    # Join back to string
    text_string = " ".join(fixed_contractions)

    # Pass spacy parser
    doc = nlp(text_string)
    tokenized = []
    processed_token_indices = []
    # First, save named entities for accuracy (see text above for explanation)
    '''Loop using indeces, and save index number to not double count objects in tokenizer'''
    for ent in doc.ents:
        # Join named entities 
        ent_clean = re.sub(r"[^\w\s]", "", ent.text).replace(" ", "").lower()
        if ent_clean.isalpha():
            tokenized.append(ent_clean)
        for token in ent:
            processed_token_indices.append(token.i)

    # Loop over document to extract words, without double counting the already seen values
    for token in doc:
        if token.i not in processed_token_indices:
        # Clean punctuation marks in words (if any)
            cleaned_token = re.sub(r"[^\w\s]", "", token.text)
            # Then only append letters
            if cleaned_token.isalpha():
                tokenized.append(cleaned_token.lower())

    return tokenized




In [36]:
from spacy import tokenizer
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
my_custom_tokenizer = Tokenizer(nlp.vocab)

def my_tokenizer_contractions_clean(text):
        
    # clean special chatacters and remove extra spaces 
    text_trimmed = re.sub(r'\s+', ' ', text)
    # Remove extra spaces and transform to lower case
    text_trimmed = text_trimmed.strip()
    # Do initial simple split
    token_iter = text_trimmed.split()

    # fix contractions only for words with '
    fixed_contractions = [] 
    for word in token_iter:
        if "'" in word:
            fixed_contractions.append(contractions.fix(word))
        else:
            fixed_contractions.append(word)
    # Join back to string
    text_string = " ".join(fixed_contractions)

    tokenized = my_custom_tokenizer(text_string)
    tokenized_out = []
    for token in tokenized:
        # Clean punctuation marks in words (if any)
        cleaned_token = re.sub(r"[^\w\s]", "", token.text)

        if cleaned_token.isalpha():
            tokenized_out.append(cleaned_token.lower())

    return tokenized_out

text = "test tokenizer\n\t. contractions!, SUCH as don't, won't, co-operate and punctuation? how the tokenizer handles these? #tokenizing #ml. Also, we check for U.K. Prime Minister, the speaker, mr Speaker and see how these are treated too, along with numeric values 1000 1,000 01/01/2020"
print(my_tokenizer_contractions_clean(text))

['test', 'tokenizer', 'contractions', 'such', 'as', 'do', 'not', 'will', 'not', 'cooperate', 'and', 'punctuation', 'how', 'the', 'tokenizer', 'handles', 'these', 'tokenizing', 'ml', 'also', 'we', 'check', 'for', 'uk', 'prime', 'minister', 'the', 'speaker', 'mr', 'speaker', 'and', 'see', 'how', 'these', 'are', 'treated', 'too', 'along', 'with', 'numeric', 'values']


In [82]:



nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


def my_tokenizer_lemma(text):
        
    # clean special chatacters and remove extra spaces 
    text_trimmed = re.sub(r'\s+', ' ', text)
    # Remove extra spaces and transform to lower case
    text_trimmed = text_trimmed.strip()

    # tokenized = my_custom_tokenizer(text_string)
    doc = nlp(text_trimmed)
    tokenized_out = []

    for token in doc:
        if token.text.isalpha():
            tokenized_out.append(token.lemma_.lower())

    return tokenized_out

text = "test tokenizer\n\t. contractions!, SUCH as don't, won't, co-operate and punctuation? how the tokenizer handles these? #tokenizing #ml. Also, we check for U.K. Prime Minister, the speaker, mr Speaker and see how these are treated too, along with numeric values 1000 1,000 01/01/2020"
print(my_tokenizer_lemma(text))

['test', 'tokenizer', 'contraction', 'such', 'as', 'do', 'will', 'co', 'operate', 'and', 'punctuation', 'how', 'the', 'tokenizer', 'handle', 'these', 'tokenize', 'ml', 'also', 'we', 'check', 'for', 'prime', 'minister', 'the', 'speaker', 'mr', 'speaker', 'and', 'see', 'how', 'these', 'be', 'treat', 'too', 'along', 'with', 'numeric', 'value']


## Program / Execution

### Load and clean data

In [38]:
 # Load speeches data frame
df = read_speeches_csv()
# Clean data frame
df_cleaned = speeches_clean(df)
# Print dimensions
print(df_cleaned.shape)

(8084, 8)


See the class distribution below. It appears that the dataset is imbalanced, in the sense that there is a vast majority of entries for the Conservative party, and a very small proportion of entries for Liberal Democrat. This can have an impact on the classifiers, and should be addressed.

SK_Learn provides the class_weight argument, which can be passed to our models. By providing the value 'balanced', the model builds a dictionary where the weights are proportional to the class (similar idea to stratified sampling, in the way that we don't want to over-represent a particular party simply because it appears more often)

In [39]:
print(df_cleaned['party'].value_counts())

party
Conservative               4819
Labour                     2317
Scottish National Party     679
Liberal Democrat            269
Name: count, dtype: int64


Once we found the best performing tokenizer, we can adjust and test various hyperparameters to improve the performance even more

In [74]:
from sklearn.model_selection import GridSearchCV

#  (b) Generate object that splits data using stratified sampling, and random seed of 26
splitter_obj = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 26) 
# Split data
for train_index, test_index in splitter_obj.split(df_cleaned, df_cleaned['party']):
    train = df_cleaned.iloc[train_index]
    test = df_cleaned.iloc[test_index]
# (b) Split target in both training and testing set
y_train, y_test = train['party'], test['party']
# (b) Create vectorised data for x objects
'''
Max features set to 3000
stop_words, ngram = defined by parameters when function is called
'''

vectorizer = TfidfVectorizer(max_features = 3000, 
                             stop_words='english', 
                             ngram_range = (1,3),
                             tokenizer = my_tokenizer_lemma)
x_train = vectorizer.fit_transform(train['speech'])
x_test = vectorizer.transform(test['speech'])




In [76]:
c_initial = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tuned = [0.60, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68]
param_grid = [{'C': c_initial,
               'max_iter': [5000]}]

svc = LinearSVC(class_weight='balanced')

grid_search = GridSearchCV(estimator = svc,
                           param_grid = param_grid,
                           scoring = 'f1_macro',
                           cv = 5,
                           verbose = 2,
                           n_jobs = -1)

grid_search.fit(x_train, y_train)

print("\n=== All macro F1 scores ===")
for mean, params in zip(grid_search.cv_results_['mean_test_score'], grid_search.cv_results_['params']):
    print(f"C={params['C']}, macro F1: {mean:.3f}")

print("Best params:", grid_search.best_params_)
print("Best macro F1 (CV):", grid_search.best_score_)

# Evaluate best model on your held-out test:
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(x_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.6s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.7s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.7s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.5s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.5s
[CV] END ...............................C=0.1, max_iter=5000; total time=   0.7s
[CV] END ...............................C=0.1, ma

### Train and test ML models

In [42]:
# Dictionary to record the Macro Avg F1 score for each tested model
f1_results = {}

#### Model set 1:

Train a Random Forest Model and SVM linear Kernel model:

    Remove English stop word: Yes
    Ngram: unigram only
    Tokenizer: Default
    

In [43]:
section_c = ml_pipeline(data = df_cleaned, stop_words = 'english')
# Save results into a dictionary
f1_results['f1_ma_rf_unigram'] =  round(section_c['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_unigram'] = round(section_c['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 1)
	Stop words: english
	Tokenizer: None
	Class Weights: None
                         precision    recall  f1-score   support

           Conservative       0.72      0.98      0.83       964
Scottish National Party       0.76      0.45      0.56       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.87      0.25      0.39       136

               accuracy                           0.73      1617
              macro avg       0.59      0.42      0.45      1617
           weighted avg       0.72      0.73      0.69      1617

                         precision    recall  f1-score   support

           Conservative       0.85      0.92      0.88       964
Scottish National Party       0.76      0.73      0.74       463
                 Labour       1.00      0.22      0.36        54
       Liberal Democrat       0.72      0.57      0.64       136

               accuracy                           0.81      1617

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [44]:
section_c_balanced = ml_pipeline(data = df_cleaned, stop_words = 'english', class_weight = 'balanced')
# Save results into a dictionary
f1_results['f1_ma_rf_unigram_balanced'] =  round(section_c_balanced['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_unigram_balanced'] = round(section_c_balanced['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 1)
	Stop words: english
	Tokenizer: None
	Class Weights: balanced
                         precision    recall  f1-score   support

           Conservative       0.70      0.98      0.82       964
Scottish National Party       0.83      0.36      0.50       463
                 Labour       1.00      0.02      0.04        54
       Liberal Democrat       0.86      0.41      0.56       136

               accuracy                           0.72      1617
              macro avg       0.85      0.44      0.48      1617
           weighted avg       0.76      0.72      0.68      1617

                         precision    recall  f1-score   support

           Conservative       0.87      0.88      0.88       964
Scottish National Party       0.74      0.74      0.74       463
                 Labour       0.66      0.39      0.49        54
       Liberal Democrat       0.65      0.70      0.67       136

               accuracy                           0.81      

#### Model Set 2

Train a Random Forest Model and SVM linear Kernel model:

    Remove English stop word: Yes
    Ngram: unigram, bi-gram and tri-grams
    Tokenizer: Default
    

In [45]:
section_d = ml_pipeline(data = df_cleaned, ngram = (1,3), stop_words = 'english')
# Save results into a dictionary
f1_results['f1_ma_rf_uni_bi_trigrams'] =  round(section_d['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams'] = round(section_d['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 3)
	Stop words: english
	Tokenizer: None
	Class Weights: None
                         precision    recall  f1-score   support

           Conservative       0.74      0.97      0.84       964
Scottish National Party       0.78      0.49      0.60       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.82      0.36      0.50       136

               accuracy                           0.75      1617
              macro avg       0.58      0.45      0.48      1617
           weighted avg       0.73      0.75      0.71      1617

                         precision    recall  f1-score   support

           Conservative       0.85      0.92      0.88       964
Scottish National Party       0.75      0.73      0.74       463
                 Labour       0.90      0.17      0.28        54
       Liberal Democrat       0.80      0.63      0.70       136

               accuracy                           0.82      1617

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [46]:
section_d_balanced = ml_pipeline(data = df_cleaned, ngram = (1,3), stop_words = 'english', class_weight = 'balanced')
# Save results into a dictionary
f1_results['f1_ma_rf_uni_bi_trigrams_balanced'] =  round(section_d_balanced['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_balanced'] = round(section_d_balanced['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 3)
	Stop words: english
	Tokenizer: None
	Class Weights: balanced
                         precision    recall  f1-score   support

           Conservative       0.71      0.96      0.82       964
Scottish National Party       0.80      0.40      0.53       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.83      0.49      0.62       136

               accuracy                           0.73      1617
              macro avg       0.58      0.46      0.49      1617
           weighted avg       0.72      0.73      0.69      1617

                         precision    recall  f1-score   support

           Conservative       0.87      0.90      0.89       964
Scottish National Party       0.75      0.74      0.75       463
                 Labour       0.53      0.37      0.43        54
       Liberal Democrat       0.70      0.69      0.70       136

               accuracy                           0.82      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Custom Tokenizers

**Basic Tokenizer**

`my_tokenizer_basic`

- Removes special break characters, such as \n, \t etc
- Removes any extra white spaces 
- Uses `nltk` word tokenizer to split the words into objects
- Only keeps alphabetical objects, ignores numeric and punctuation marks
- It keeps English stop words

From the metrics below, we can see that this tokenizer did not improve performance on the model. 



In [47]:
section_e_basic_t =  ml_pipeline(data = df_cleaned, ngram = (1,3), tokenizer = my_tokenizer_basic, class_weight='balanced')
f1_results['f1_ma_rf_uni_bi_trigrams_basictoken'] =  round(section_e_basic_t['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_basictoken'] = round(section_e_basic_t['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 3)
	Stop words: None
	Tokenizer: my_tokenizer_basic
	Class Weights: balanced




                         precision    recall  f1-score   support

           Conservative       0.71      0.98      0.82       964
Scottish National Party       0.84      0.37      0.51       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.82      0.46      0.59       136

               accuracy                           0.73      1617
              macro avg       0.59      0.45      0.48      1617
           weighted avg       0.73      0.73      0.68      1617

                         precision    recall  f1-score   support

           Conservative       0.88      0.88      0.88       964
Scottish National Party       0.73      0.74      0.73       463
                 Labour       0.50      0.33      0.40        54
       Liberal Democrat       0.68      0.72      0.70       136

               accuracy                           0.81      1617
              macro avg       0.70      0.67      0.68      1617
           weighted 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


**Sentence Tokenizer**

Using a sentence tokenizer produced worse results than previous tokenizers. A potential reason is that splitting the text by sentences instead of words generates a lower proportion of unique values. There will be less repeated sentences than repeated words, therefore this reduces the dimensionality and data availability for the model to actually learn from the data, given that with less repetition of token_x, label_x pairs, it becomes more difficult to the model to generalize

In [48]:
section_e_sentence_t =  ml_pipeline(data = df_cleaned, ngram = (1,1), tokenizer = my_tokenizer_sentence, class_weight='balanced')
f1_results['f1_ma_rf_uni_bi_trigrams_sentencetoken'] =  round(section_e_sentence_t['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_sentencetoken'] = round(section_e_sentence_t['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 1)
	Stop words: None
	Tokenizer: my_tokenizer_sentence
	Class Weights: balanced




                         precision    recall  f1-score   support

           Conservative       0.75      0.43      0.54       964
Scottish National Party       0.34      0.67      0.45       463
                 Labour       0.04      0.06      0.05        54
       Liberal Democrat       0.24      0.17      0.20       136

               accuracy                           0.46      1617
              macro avg       0.34      0.33      0.31      1617
           weighted avg       0.57      0.46      0.47      1617

                         precision    recall  f1-score   support

           Conservative       0.63      0.88      0.74       964
Scottish National Party       0.42      0.12      0.19       463
                 Labour       0.07      0.07      0.07        54
       Liberal Democrat       0.29      0.16      0.21       136

               accuracy                           0.58      1617
              macro avg       0.35      0.31      0.30      1617
           weighted 

**Spacy/Contractions Tokenizer**

In [49]:
section_e_spacy_t =  ml_pipeline(data = df_cleaned, ngram = (1,3), tokenizer = my_tokenizer_spacy, class_weight='balanced')
f1_results['f1_ma_rf_uni_bi_trigrams_spacytoken'] =  round(section_e_spacy_t['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_spacytoken'] = round(section_e_spacy_t['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 3)
	Stop words: None
	Tokenizer: my_tokenizer_spacy
	Class Weights: balanced




                         precision    recall  f1-score   support

           Conservative       0.70      0.98      0.81       964
Scottish National Party       0.82      0.34      0.48       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.81      0.41      0.55       136

               accuracy                           0.72      1617
              macro avg       0.58      0.43      0.46      1617
           weighted avg       0.72      0.72      0.67      1617

                         precision    recall  f1-score   support

           Conservative       0.88      0.88      0.88       964
Scottish National Party       0.73      0.73      0.73       463
                 Labour       0.50      0.33      0.40        54
       Liberal Democrat       0.67      0.72      0.70       136

               accuracy                           0.81      1617
              macro avg       0.69      0.67      0.68      1617
           weighted 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [86]:
section_e_contractcleaned_t =  ml_pipeline(data = df_cleaned, ngram = (1,1), tokenizer = my_tokenizer_contractions_clean, class_weight='balanced')#, stop_words = 'english')
f1_results['f1_ma_rf_uni_bi_trigrams_contractcleantoken'] =  round(section_e_contractcleaned_t['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_contractcleantoken'] = round(section_e_contractcleaned_t['svc']['macro avg']['f1-score'], 2)

#)1,3) stop words removed: svc.56
#(1,2) stop words removed: svc.70
#(1,1) stop words removed: svc.70

#(1,3) no stop words removed: svc.68
#(1,2) no stop words removed: svc.67
#(1,1) no stop words removed: svc.70





Arguments:
	Ngram: (1, 1)
	Stop words: None
	Tokenizer: my_tokenizer_contractions_clean
	Class Weights: balanced




                         precision    recall  f1-score   support

           Conservative       0.68      0.99      0.80       964
Scottish National Party       0.88      0.29      0.43       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.88      0.31      0.46       136

               accuracy                           0.70      1617
              macro avg       0.61      0.40      0.42      1617
           weighted avg       0.73      0.70      0.64      1617

                         precision    recall  f1-score   support

           Conservative       0.88      0.90      0.89       964
Scottish National Party       0.77      0.73      0.75       463
                 Labour       0.57      0.44      0.50        54
       Liberal Democrat       0.64      0.71      0.67       136

               accuracy                           0.82      1617
              macro avg       0.71      0.70      0.70      1617
           weighted 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [83]:

section_e_lemma_t =  ml_pipeline(data = df_cleaned, ngram = (1,3), tokenizer = my_tokenizer_lemma, class_weight='balanced', stop_words = 'english')
f1_results['f1_ma_rf_uni_bi_trigrams_lemmatoken'] =  round(section_e_lemma_t['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_lemmatoken'] = round(section_e_lemma_t['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 3)
	Stop words: english
	Tokenizer: my_tokenizer_lemma
	Class Weights: balanced




                         precision    recall  f1-score   support

           Conservative       0.71      0.97      0.82       964
Scottish National Party       0.82      0.38      0.52       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.86      0.48      0.61       136

               accuracy                           0.73      1617
              macro avg       0.59      0.46      0.49      1617
           weighted avg       0.73      0.73      0.69      1617

                         precision    recall  f1-score   support

           Conservative       0.87      0.90      0.89       964
Scottish National Party       0.77      0.74      0.75       463
                 Labour       0.53      0.44      0.48        54
       Liberal Democrat       0.71      0.71      0.71       136

               accuracy                           0.82      1617
              macro avg       0.72      0.70      0.71      1617
           weighted 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [84]:
for key, value in f1_results.items():
    print(f"F1 Macro Avg Score {key:<50} Value: {value:6.2f}")

F1 Macro Avg Score f1_ma_rf_unigram                                   Value:   0.45
F1 Macro Avg Score f1_ma_svc_unigram                                  Value:   0.66
F1 Macro Avg Score f1_ma_rf_unigram_balanced                          Value:   0.48
F1 Macro Avg Score f1_ma_svc_unigram_balanced                         Value:   0.70
F1 Macro Avg Score f1_ma_rf_uni_bi_trigrams                           Value:   0.48
F1 Macro Avg Score f1_ma_svc_uni_bi_trigrams                          Value:   0.65
F1 Macro Avg Score f1_ma_rf_uni_bi_trigrams_balanced                  Value:   0.49
F1 Macro Avg Score f1_ma_svc_uni_bi_trigrams_balanced                 Value:   0.69
F1 Macro Avg Score f1_ma_rf_uni_bi_trigrams_basictoken                Value:   0.48
F1 Macro Avg Score f1_ma_svc_uni_bi_trigrams_basictoken               Value:   0.68
F1 Macro Avg Score f1_ma_rf_uni_bi_trigrams_sentencetoken             Value:   0.31
F1 Macro Avg Score f1_ma_svc_uni_bi_trigrams_sentencetoken            Value: