# Part Two

## Functions

### Libraries

In [121]:
# Libraries
from pathlib import Path
import pandas as pd
import os
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from nltk import word_tokenize
import re
import contractions
import spacy
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer


### Read data

In [88]:
def read_speeches_csv(path=Path.cwd() / "texts" / "p2-texts"):
    '''
    Function to load csv files into a pandas data frame

    Args:
        Function defaults to a specific location to search for the files unless otherwise specified

    Returns
        Pandas data frame
    '''
    # Extract file name
    file = os.listdir(path)[0]
    file_load = os.path.join(path, file)

    # Read data
    df = pd.read_csv(file_load)
    return df


### Clean data

In [119]:
def speeches_clean(df):
    '''
    Function that takes a data frame containing speeches, and performs custom cleaning tasks on it
    Custom cleaning tasks are:
        - Column 'party': replaces all entries 'Labour (Co-op)' with 'Labour'
        - Column 'party': removes all values where entry is 'Speaker'
        - Column 'party': only keeps the rows of the four most common parties
                          Finds the frequency count for each party, and keep the top 4 only
        - Column 'speech_class': removes all rows where value is NOT 'Speech'
        - Column 'speech': removes any entries where the length of the speech is less than 1000 characters

    Args: 
        df: Pandas data frame

    Returns:
        A Pandas data frame, cleaned
    '''
    # (a).i Clean Labour (Co-op) values
    df_cleaned = df.replace('Labour (Co-op)', 'Labour')

    # (a).ii Remove rows where 'party' == 'Speaker'
    '''Note: Remove speaker rows first, otherwise this will interfere with finding the most common parties'''
    df_cleaned = df_cleaned[df_cleaned['party'] != 'Speaker']

    # (a).ii Remove rows where the value of 'party' is not one of the 4 most common parties
    parties_count = df_cleaned['party'].value_counts().sort_values(ascending=False)
    # # Extract the name of the 4 most common parties 
    top4_parties = parties_count.index[:4].tolist()
    # # Filter to top 4 most common parties
    df_cleaned2 = df_cleaned[df_cleaned['party'].isin(top4_parties)]

    # (a).iii Remove rows where value in 'speech_class' is not 'Speech
    df_cleaned2 = df_cleaned2[df_cleaned2['speech_class'] == 'Speech']

    # (a).iv Remove rows where the text in speech columns is less than 1000
    df_out = df_cleaned2[df_cleaned2['speech'].str.len() >= 1000]

    return df_out


### Machine Learning Pipeline

In [120]:
def ml_pipeline(**kwargs):
    '''
    Function which processes and build ML models given the speeches data and prepares the data to be fed into ML models:
    The pipeline:
        Splits into train, test sets
        Vectorises the data
        Trains a RandomForest Model
        Trains a Linear SVM classifer
        Extracts the Classification Report for each model
        Extracts the macro-Average F1 Score
    
    Arguments can be passed as key value pairs. Some arguments are mandatory whilst other are optionals. When optional arguments are not provided
    the function will use defaul values
    Ars:
        data (mandatory): A cleaned pandas data frame
        ngram (optional): A tuple containing the ngram to consider to pass in the TfidVectorizer function
                          default value: (1,1) unigrams
        class_weights (optional): Balances the weight for each class in the model depending on frequency counts
        verbose (optional): A booles object, T/F to determine whether the user wants extra information printed out or not

    The function prints:
        The classification report of each model, which contains the Macro Avg F1 value

    Returns:
        A dictionary with the classification report for the Random Forest and SVC models 
        Trained random forest and SVC models

    '''
    # Extract input parameters
    input_dict = kwargs

    # Extract data from input
    df = input_dict.get('data')
    ngram = input_dict.get('ngram', (1,1))
    tokenizer = input_dict.get('tokenizer', None)
    class_weight = input_dict.get('class_weight', None)
    verbose = input_dict.get('verbose', False)
    best_model = input_dict.get('best_model', False)

    if verbose:
        # Tokenizer print object 
        if tokenizer is not None:
            token_print = tokenizer.__name__
        else:
            token_print = tokenizer
        print("\nArguments:")
        print(f"\tNgram: {ngram}\n\tTokenizer: {token_print}\n\tClass Weights: {class_weight}")

    # (b) Generate object that splits data using stratified sampling, and random seed of 26
    splitter_obj = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 26) 
    # Split data
    for train_index, test_index in splitter_obj.split(df, df['party']):
        train = df.iloc[train_index]
        test = df.iloc[test_index]

    # (b) Split target in both training and testing set
    y_train, y_test = train['party'], test['party']

    # (b) Create vectorised data for x objects
    vectorizer = TfidfVectorizer(max_features = 3000,    # Parameter set as specified in assignment prompt
                                 stop_words='english',   # Parameter set as specified in assignment prompt 
                                 ngram_range = ngram,    # Adjustable parameter
                                 tokenizer = tokenizer)  # Adjustable parameter
    x_train = vectorizer.fit_transform(train['speech'])
    x_test = vectorizer.transform(test['speech'])

    '''=== Section C ==='''
    # (c) Train random forest
    random_forest = RandomForestClassifier(n_estimators=300,           # Parameter set as specified in assignment prompt
                                           n_jobs = -1,                # Parameter set for efficiency and faster processing (uses all CPUs available)
                                           class_weight=class_weight)  # Adjustable parameter - assigns weights to classes depending on class frequency, important when data is not balanced 
    random_forest.fit(x_train, y_train)
    random_forest_y_predict = random_forest.predict(x_test)

    # (c) Train SVM
    svm = LinearSVC(class_weight=class_weight)   # Adjustable parameter - assigns weights to classes depending on class frequency, important when data is not balanced  
    svm.fit(x_train, y_train)
    svm_y_predict = svm.predict(x_test)

    # Get label names
    target_names = y_test.unique()

    # Results section 
    # # Random Forest
    rf_cr = classification_report(y_test, random_forest_y_predict, target_names = target_names, output_dict = True, zero_division = 0)
    f1_ma_rf = round(rf_cr['macro avg']['f1-score'], 2)

    # # SVM Classifier
    svc_cr = classification_report(y_test, svm_y_predict, target_names = target_names, output_dict = True, zero_division = 0)
    f1_ma_svc = round(svc_cr['macro avg']['f1-score'], 2)

    if best_model:
        # Find the best model
        svm_best = f1_ma_svc > f1_ma_rf
        # print based on test above
        if svm_best:
               print(f"{"="*20} SVC Performance {"="*20}")
               print(classification_report(y_test, svm_y_predict, target_names = target_names, zero_division = 0))
               print(f"F1 Macro Average Score: {f1_ma_svc}\n")
        else: 
               print(f"{"="*20} Random Forest Performance {"="*20}")
               print(classification_report(y_test, random_forest_y_predict, target_names = target_names, zero_division = 0))
               print(f"F1 Macro Average Score: {f1_ma_rf}\n")
    else:

        print(f"{"="*20} Random Forest Performance {"="*20}")
        print(classification_report(y_test, random_forest_y_predict, target_names = target_names, zero_division = 0))
        print(f"F1 Macro Average Score: {f1_ma_rf}\n")

    
        print(f"{"="*20} SVC Performance {"="*20}")
        print(classification_report(y_test, svm_y_predict, target_names = target_names, zero_division = 0))
        print(f"F1 Macro Average Score: {f1_ma_svc}\n")

    return {'rf': rf_cr, 'svc': svc_cr}, svm, random_forest 


### Custom Tokenizers

A few different tokenizers implementations were tested to find the best performing one. See tokenizer functions below

#### Basic Tokenizer

In [None]:
def my_tokenizer_basic(text):
    '''
    Basic tokenizer:
        Removes special break characters, such as \n, \t etc
        Removes any extra white spaces 
        Uses nltk word tokenizer to split the words into objects
        Only keeps alphabetical objects, ignores numeric and punctuation marks
    '''
    # Clean the text. Remove special characters, such as \n, \t etc and extra white spaces
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalpha()]

From the examples above it seems that we need a tokenizer that provides cleaner data. 

The tokenizer below attempts to solve the following issues:

- Words contractions:
    Words like `won't` would be stored in a different vector than `will` and `not`, despite having the same meaning. The tokenizer below uses the `contraction` library to expand these contractions into a uniform output. Anytime `won't` is encountered, it gets transformed to `will not`. This consistency across the corpus might provide more accurate representation and improve the performance of the model.

- Named entities:
    Names entities like `Prime Minister` would be stored as two vectors, one for `prime` and one for `minister`. If in the corpus the word `prime` appears in a different context, such as a discussion on `prime TV shows`, the word `prime` would get a frequency count of 2, despite these two words having very different meaning given their context. The tokenizer below uses `Spacy` `en_core_web_lg` to detect all name entities, and when found, these are collapsed into a single word, i.e., `primeminister`. The objective is that each time `Prime Minister` appears, it is encoded as the entity of `Prime Minister` and not two separate words. 
    This should also help with stop words, as `The Church` would become `thechurch` as it refers to a specific entity, thus increasing the resolution and context of stop words as well. 

- Standard cleaning:
    This tokenizer also applies standard cleaning, such as converting all words to lower case, removes punctuation marks and digits. 

- Stop words:
    Stop words are not removed as these might be significant and perhaps could be part of named entities.

Unfortunately, this complex tokenization did not add any predictive power to our models. In fact, it performed worse than some simpler tokenizers. 



In [None]:

nlp = spacy.load("en_core_web_sm")

def my_tokenizer_spacy(text):
    '''
    Tokenizer using SpaCy

    - Applies the same cleaning steps as basic_tokenizer()
    - It uses the contractions library to amend any contractions found in the texts. For example, it transforms "cant't" into "can not", "I've" into "I have", etc
    - Searches for Name Entity objects (using SpaCy) and joins them into a single string so Named Entities get their own vector representation
        For example, the text may contain "United Kingdom" and "united, we will build a better country". In this case the word "united" would get a count of 2
        but the meaning of the actual token "united" is different given the context. Using name entity labels, the tokenizer below joins "United Kingdom" into
        "unitedkingdom" so this can be stored as a unique vector, separate to "united" thus helping to capture as much context as possible. 
    - Terms that are not named entities are then cleaned and stored into the list of token (along with the transformed named entities)
    - Numbers and punctuation marks are excluded from the output
    '''

    # clean special chatacters and remove extra spaces 
    text_trimmed = re.sub(r'\s+', ' ', text)
    # Remove extra spaces and transform to lower case
    text_trimmed = text_trimmed.strip()
    # Do initial simple split
    token_iter = text_trimmed.split()

    # fix contractions only for words with '
    fixed_contractions = [] 
    for word in token_iter:
        if "'" in word:
            fixed_contractions.append(contractions.fix(word))
        else:
            fixed_contractions.append(word)
    # Join back to string
    text_string = " ".join(fixed_contractions)

    # Pass spacy parser
    doc = nlp(text_string)
    tokenized = []
    processed_token_indices = []
    # First, save named entities for accuracy (see text above for explanation)
    '''Loop using indeces, and save index number to not double count objects in tokenizer'''
    for ent in doc.ents:
        # Join named entities 
        ent_clean = re.sub(r"[^\w\s]", "", ent.text).replace(" ", "").lower()
        if ent_clean.isalpha():
            tokenized.append(ent_clean)
        for token in ent:
            processed_token_indices.append(token.i)

    # Loop over document to extract words, without double counting the already seen values
    for token in doc:
        if token.i not in processed_token_indices:
        # Clean punctuation marks in words (if any)
            cleaned_token = re.sub(r"[^\w\s]", "", token.text)
            # Then only append letters
            if cleaned_token.isalpha():
                tokenized.append(cleaned_token.lower())

    return tokenized




In [122]:
# Download English library object from SpaCy
nlp = spacy.load("en_core_web_sm")
# Create custom tokenizer using the vocabulary in the nlp object
my_custom_tokenizer = Tokenizer(nlp.vocab)

def my_tokenizer_contractions_clean(text):
        
    # clean special chatacters and remove extra spaces 
    text_trimmed = re.sub(r'\s+', ' ', text)
    # Remove extra spaces and transform to lower case
    text_trimmed = text_trimmed.strip()
    # Do initial simple split
    token_iter = text_trimmed.split()

    # fix contractions only for words with '
    fixed_contractions = [] 
    for word in token_iter:
        if "'" in word:
            fixed_contractions.append(contractions.fix(word))
        else:
            fixed_contractions.append(word)
    # Join back to string
    text_string = " ".join(fixed_contractions)

    tokenized = my_custom_tokenizer(text_string)
    tokenized_out = []
    for token in tokenized:
        # Clean punctuation marks in words (if any)
        cleaned_token = re.sub(r"[^\w\s]", "", token.text)

        if cleaned_token.isalpha():
            tokenized_out.append(cleaned_token.lower())

    return tokenized_out

text = "test tokenizer\n\t. contractions!, SUCH as don't, won't, co-operate and punctuation? how the tokenizer handles these? #tokenizing #ml. Also, we check for U.K. Prime Minister, the speaker, mr Speaker and see how these are treated too, along with numeric values 1000 1,000 01/01/2020"
print(my_tokenizer_contractions_clean(text))

['test', 'tokenizer', 'contractions', 'such', 'as', 'do', 'not', 'will', 'not', 'cooperate', 'and', 'punctuation', 'how', 'the', 'tokenizer', 'handles', 'these', 'tokenizing', 'ml', 'also', 'we', 'check', 'for', 'uk', 'prime', 'minister', 'the', 'speaker', 'mr', 'speaker', 'and', 'see', 'how', 'these', 'are', 'treated', 'too', 'along', 'with', 'numeric', 'values']


In [None]:

def my_tokenizer_lemma(text):
        
    # clean special chatacters and remove extra spaces 
    text_trimmed = re.sub(r'\s+', ' ', text)
    # Remove extra spaces and transform to lower case
    text_trimmed = text_trimmed.strip()

    # tokenized = my_custom_tokenizer(text_string)
    doc = nlp(text_trimmed)
    tokenized_out = []

    for token in doc:
        if token.text.isalpha():
            tokenized_out.append(token.lemma_.lower())

    return tokenized_out

text = "test tokenizer\n\t. contractions!, SUCH as don't, won't, co-operate and punctuation? how the tokenizer handles these? #tokenizing #ml. Also, we check for U.K. Prime Minister, the speaker, mr Speaker and see how these are treated too, along with numeric values 1000 1,000 01/01/2020"
print(my_tokenizer_lemma(text))

['test', 'tokenizer', 'contraction', 'such', 'as', 'do', 'will', 'co', 'operate', 'and', 'punctuation', 'how', 'the', 'tokenizer', 'handle', 'these', 'tokenize', 'ml', 'also', 'we', 'check', 'for', 'prime', 'minister', 'the', 'speaker', 'mr', 'speaker', 'and', 'see', 'how', 'these', 'be', 'treat', 'too', 'along', 'with', 'numeric', 'value']


## Program / Execution

### Load and clean data

In [90]:
 # Load speeches data frame
df = read_speeches_csv()
# Clean data frame
df_cleaned = speeches_clean(df)
# Print dimensions
print(df_cleaned.shape)

(8084, 8)


In [74]:
from sklearn.model_selection import GridSearchCV

#  (b) Generate object that splits data using stratified sampling, and random seed of 26
splitter_obj = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 26) 
# Split data
for train_index, test_index in splitter_obj.split(df_cleaned, df_cleaned['party']):
    train = df_cleaned.iloc[train_index]
    test = df_cleaned.iloc[test_index]
# (b) Split target in both training and testing set
y_train, y_test = train['party'], test['party']
# (b) Create vectorised data for x objects
'''
Max features set to 3000
stop_words, ngram = defined by parameters when function is called
'''

vectorizer = TfidfVectorizer(max_features = 3000, 
                             stop_words='english', 
                             ngram_range = (1,3),
                             tokenizer = my_tokenizer_lemma)
x_train = vectorizer.fit_transform(train['speech'])
x_test = vectorizer.transform(test['speech'])




In [76]:
c_initial = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tuned = [0.60, 0.61, 0.62, 0.63, 0.64, 0.65, 0.66, 0.67, 0.68]
param_grid = [{'C': c_initial,
               'max_iter': [5000]}]

svc = LinearSVC(class_weight='balanced')

grid_search = GridSearchCV(estimator = svc,
                           param_grid = param_grid,
                           scoring = 'f1_macro',
                           cv = 5,
                           verbose = 2,
                           n_jobs = -1)

grid_search.fit(x_train, y_train)

print("\n=== All macro F1 scores ===")
for mean, params in zip(grid_search.cv_results_['mean_test_score'], grid_search.cv_results_['params']):
    print(f"C={params['C']}, macro F1: {mean:.3f}")

print("Best params:", grid_search.best_params_)
print("Best macro F1 (CV):", grid_search.best_score_)

# Evaluate best model on your held-out test:
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(x_test)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END .............................C=0.001, max_iter=5000; total time=   0.5s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.6s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.7s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.7s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.5s
[CV] END ..............................C=0.01, max_iter=5000; total time=   0.5s
[CV] END ...............................C=0.1, max_iter=5000; total time=   0.7s
[CV] END ...............................C=0.1, ma

### Train and test ML models

In [92]:
# Dictionary to record the Macro Avg F1 score for each tested model
f1_results = {}

#### Section C

##### Model set 1:

Train a Random Forest Model and SVM linear Kernel model:

- Ngram: unigram only
- Tokenizer: Default
    

In [None]:
section_c, svm_model_c, rf_model_c  = ml_pipeline(data = df_cleaned)
# Save results into a dictionary
f1_results['f1_ma_rf_unigram'] =  round(section_c['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_unigram'] = round(section_c['svc']['macro avg']['f1-score'], 2)

                         precision    recall  f1-score   support

           Conservative       0.73      0.98      0.84       964
Scottish National Party       0.78      0.46      0.58       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.90      0.32      0.47       136

               accuracy                           0.74      1617
              macro avg       0.60      0.44      0.47      1617
           weighted avg       0.73      0.74      0.70      1617

F1 Macro Average Score: 0.47

                         precision    recall  f1-score   support

           Conservative       0.85      0.92      0.88       964
Scottish National Party       0.76      0.73      0.74       463
                 Labour       1.00      0.22      0.36        54
       Liberal Democrat       0.72      0.57      0.64       136

               accuracy                           0.81      1617
              macro avg       0.83      0.61      0.66 

#### Section D

##### Model Set 2

Train a Random Forest Model and SVM linear Kernel model:

- Ngram: unigram, bi-gram and tri-grams
- Tokenizer: Default
    

In [None]:
section_d,  svm_model_d, rf_model_d = ml_pipeline(data = df_cleaned, ngram = (1,3))
# Save results into a dictionary
f1_results['f1_ma_rf_uni_bi_trigrams'] =  round(section_d['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams'] = round(section_d['svc']['macro avg']['f1-score'], 2)

                         precision    recall  f1-score   support

           Conservative       0.71      0.97      0.82       964
Scottish National Party       0.81      0.40      0.53       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.82      0.49      0.61       136

               accuracy                           0.73      1617
              macro avg       0.59      0.46      0.49      1617
           weighted avg       0.73      0.73      0.69      1617

F1 Macro Average Score: 0.49

                         precision    recall  f1-score   support

           Conservative       0.87      0.90      0.89       964
Scottish National Party       0.75      0.74      0.75       463
                 Labour       0.53      0.37      0.43        54
       Liberal Democrat       0.70      0.69      0.70       136

               accuracy                           0.82      1617
              macro avg       0.71      0.67      0.69 

#### Section E

**Basic Tokenizer**

`my_tokenizer_basic`

- Removes special break characters, such as \n, \t etc
- Removes any extra white spaces 
- Uses `nltk` word tokenizer to split the words into objects
- Only keeps alphabetical objects, ignores numeric and punctuation marks
- It keeps English stop words

From the metrics below, we can see that this tokenizer did not improve performance on the model. 



In [47]:
section_e_basic_t =  ml_pipeline(data = df_cleaned, ngram = (1,3), tokenizer = my_tokenizer_basic, class_weight='balanced')
f1_results['f1_ma_rf_uni_bi_trigrams_basictoken'] =  round(section_e_basic_t['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_basictoken'] = round(section_e_basic_t['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 3)
	Stop words: None
	Tokenizer: my_tokenizer_basic
	Class Weights: balanced




                         precision    recall  f1-score   support

           Conservative       0.71      0.98      0.82       964
Scottish National Party       0.84      0.37      0.51       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.82      0.46      0.59       136

               accuracy                           0.73      1617
              macro avg       0.59      0.45      0.48      1617
           weighted avg       0.73      0.73      0.68      1617

                         precision    recall  f1-score   support

           Conservative       0.88      0.88      0.88       964
Scottish National Party       0.73      0.74      0.73       463
                 Labour       0.50      0.33      0.40        54
       Liberal Democrat       0.68      0.72      0.70       136

               accuracy                           0.81      1617
              macro avg       0.70      0.67      0.68      1617
           weighted 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


**Spacy/Contractions Tokenizer**

In [49]:
section_e_spacy_t =  ml_pipeline(data = df_cleaned, ngram = (1,3), tokenizer = my_tokenizer_spacy, class_weight='balanced')
f1_results['f1_ma_rf_uni_bi_trigrams_spacytoken'] =  round(section_e_spacy_t['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_spacytoken'] = round(section_e_spacy_t['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 3)
	Stop words: None
	Tokenizer: my_tokenizer_spacy
	Class Weights: balanced




                         precision    recall  f1-score   support

           Conservative       0.70      0.98      0.81       964
Scottish National Party       0.82      0.34      0.48       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.81      0.41      0.55       136

               accuracy                           0.72      1617
              macro avg       0.58      0.43      0.46      1617
           weighted avg       0.72      0.72      0.67      1617

                         precision    recall  f1-score   support

           Conservative       0.88      0.88      0.88       964
Scottish National Party       0.73      0.73      0.73       463
                 Labour       0.50      0.33      0.40        54
       Liberal Democrat       0.67      0.72      0.70       136

               accuracy                           0.81      1617
              macro avg       0.69      0.67      0.68      1617
           weighted 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
section_e_mytoken, svm_model_e_mytoken, rf_model__e_mytoken =  ml_pipeline(data = df_cleaned, 
                                                                           ngram = (1,1), 
                                                                           tokenizer = my_tokenizer_contractions_clean, 
                                                                           class_weight='balanced', 
                                                                           best_model = True)

f1_results['f1_ma_rf_uni_bi_trigrams_contractcleantoken'] =  round(section_e_mytoken['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_contractcleantoken'] = round(section_e_mytoken['svc']['macro avg']['f1-score'], 2)

#)1,3) stop words removed: svc.56
#(1,2) stop words removed: svc.70
#(1,1) stop words removed: svc.70

#(1,3) no stop words removed: svc.68
#(1,2) no stop words removed: svc.67
#(1,1) no stop words removed: svc.70






                         precision    recall  f1-score   support

           Conservative       0.85      0.92      0.88       964
Scottish National Party       0.75      0.74      0.74       463
                 Labour       1.00      0.24      0.39        54
       Liberal Democrat       0.73      0.54      0.62       136

               accuracy                           0.81      1617
              macro avg       0.83      0.61      0.66      1617
           weighted avg       0.81      0.81      0.80      1617

F1 Macro Average Score: 0.66



In [83]:

section_e_lemma_t =  ml_pipeline(data = df_cleaned, ngram = (1,3), tokenizer = my_tokenizer_lemma, class_weight='balanced', stop_words = 'english')
f1_results['f1_ma_rf_uni_bi_trigrams_lemmatoken'] =  round(section_e_lemma_t['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams_lemmatoken'] = round(section_e_lemma_t['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 3)
	Stop words: english
	Tokenizer: my_tokenizer_lemma
	Class Weights: balanced




                         precision    recall  f1-score   support

           Conservative       0.71      0.97      0.82       964
Scottish National Party       0.82      0.38      0.52       463
                 Labour       0.00      0.00      0.00        54
       Liberal Democrat       0.86      0.48      0.61       136

               accuracy                           0.73      1617
              macro avg       0.59      0.46      0.49      1617
           weighted avg       0.73      0.73      0.69      1617

                         precision    recall  f1-score   support

           Conservative       0.87      0.90      0.89       964
Scottish National Party       0.77      0.74      0.75       463
                 Labour       0.53      0.44      0.48        54
       Liberal Democrat       0.71      0.71      0.71       136

               accuracy                           0.82      1617
              macro avg       0.72      0.70      0.71      1617
           weighted 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [84]:
for key, value in f1_results.items():
    print(f"F1 Macro Avg Score {key:<50} Value: {value:6.2f}")

F1 Macro Avg Score f1_ma_rf_unigram                                   Value:   0.45
F1 Macro Avg Score f1_ma_svc_unigram                                  Value:   0.66
F1 Macro Avg Score f1_ma_rf_unigram_balanced                          Value:   0.48
F1 Macro Avg Score f1_ma_svc_unigram_balanced                         Value:   0.70
F1 Macro Avg Score f1_ma_rf_uni_bi_trigrams                           Value:   0.48
F1 Macro Avg Score f1_ma_svc_uni_bi_trigrams                          Value:   0.65
F1 Macro Avg Score f1_ma_rf_uni_bi_trigrams_balanced                  Value:   0.49
F1 Macro Avg Score f1_ma_svc_uni_bi_trigrams_balanced                 Value:   0.69
F1 Macro Avg Score f1_ma_rf_uni_bi_trigrams_basictoken                Value:   0.48
F1 Macro Avg Score f1_ma_svc_uni_bi_trigrams_basictoken               Value:   0.68
F1 Macro Avg Score f1_ma_rf_uni_bi_trigrams_sentencetoken             Value:   0.31
F1 Macro Avg Score f1_ma_svc_uni_bi_trigrams_sentencetoken            Value: