# Part Two

## Functions

### Libraries

In [1]:
# Libraries
from pathlib import Path
import pandas as pd
import os
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC
from nltk import word_tokenize
import re
from transformers import BertTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


### Read data

In [2]:
def read_speeches_csv(path=Path.cwd() / "texts" / "p2-texts"):
    '''
    Function to load csv files into pandas data frames

    Args:
        Function defaults to a specific location to search for the files unless otherwise specified

    Returns
        Pandas data frame
    '''
    # Extract file name
    file = os.listdir(path)[0]
    file_load = os.path.join(path, file)

    # Read data
    df = pd.read_csv(file_load)
    return df


### Clean data

In [3]:
def speeches_clean(df):
    '''
    Function that takes a data frame containing speeches, and performs custom cleaning tasks on it
    Custom cleaning tasks are:
        - Column 'party': replaces all entries 'Labour (Co-op)' with 'Labour'
        - Column 'party': removes all values where entry is 'Speaker'
        - Column 'party': only keeps the rows of the four most common parties
            Find the frequency count for each party, and keep the top 4 only
        - Column 'speech_class': removes all rows where value is NOT 'Speech'
        - Column 'speech': removes any entries where the length of the speech is less than 1000 characters

    Args: 
        df: Pandas data frame

    Returns:
        A Pandas data frame, cleaned
    '''
    # (a).i Clean Labour (Co-op) values
    df_cleaned = df.replace('Labour (Co-op)', 'Labour')

    # (a).ii Remove rows where 'party' == 'Speaker'
    '''Note: Remove speaker rows first, otherwise this will interfere with finding the most common parties'''
    df_cleaned = df_cleaned[df_cleaned['party'] != 'Speaker']

    # (a).ii Remove rows where the value of 'party' is not one of the 4 most common parties
    parties_count = df_cleaned['party'].value_counts().sort_values(ascending=False)
    # # Extract the name of the 4 most common parties 
    top4_parties = parties_count.index[:4].tolist()
    # # Filter to top 4 most common parties
    df_cleaned2 = df_cleaned[df_cleaned['party'].isin(top4_parties)]

    # (a).iii Remove rows where value in 'speech_class' is not 'Speech
    df_cleaned2 = df_cleaned2[df_cleaned2['speech_class'] == 'Speech']

    return df_cleaned2


### Machine Learning Pipeline

In [4]:
def ml_pipeline(**kwargs):
    '''
    Function which processes and build ML models given the speeches data and prepares the data to be fed into ML models:
    The pipeline:
        Splits into train, test sets
        Vectorises the data
        Trains a RandomForest Model
        Trains a Linear SVM classifer
        Extracts the CLassification Report for each model
        Macro-Average F1 Score
    
    Arguments can be passed as key value pairs. Some arguments are mandatory whilst other are optionals. When optional arguments are not provided
    the function will use defaul values
    Ars:
        data (mandatory): A cleaned pandas data frame
        ngram (optional): a tuple containing the ngram to consider to pass in the TfidVectorizer function
            default value: (1,1) unigrams
        stop_words (optional): A string containing the value for the stop_words argument for TfidVectorizer. If set ti 'english', stop words would be removed
            default value: None - Stop words would not be removed

    '''
    # Extract input parameters
    input_dict = kwargs

    # Extract data from input
    df = input_dict.get('data')
    ngram = input_dict.get('ngram', (1,1))
    stop_words = input_dict.get('stop_words', None)
    tokenizer = input_dict.get('tokenizer', None)

    # Tokenizer print: 
    if tokenizer is not None:
        token_print = tokenizer.__name__
    else:
        token_print = tokenizer
    print("\nArguments:")
    print(f"\tNgram: {ngram}\n\tStop words: {stop_words}\n\tTokenizer: {token_print}\n")

    # (b) Generate object that splits data using stratified sampling, and random seed of 26
    splitter_obj = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 26) 
    # Split data
    for train_index, test_index in splitter_obj.split(df, df['party']):
        train = df.iloc[train_index]
        test = df.iloc[test_index]

    # (b) Split target in both training and testing set
    y_train, y_test = train['party'], test['party']

    # (b) Create vectorised data for x objects
    '''
    Max features set to 3000
    stop_words, ngram = defined by parameters when function is called
    '''
    vectorizer = TfidfVectorizer(max_features = 3000, 
                                 stop_words=stop_words, 
                                 ngram_range = ngram,
                                 tokenizer = tokenizer)
    x_train = vectorizer.fit_transform(train['speech'])
    x_test = vectorizer.transform(test['speech'])

    # (c) Train random forest
    random_forest = RandomForestClassifier(n_estimators=10, n_jobs = -1) # TODO Set to small number for training, bring back to 300 for real testing
    random_forest.fit(x_train, y_train)
    random_forest_y_predict = random_forest.predict(x_test)

    # (c) Train SVM
    svm = LinearSVC()
    svm.fit(x_train, y_train)
    svm_y_predict = svm.predict(x_test)

    # Get label names
    target_names = y_test.unique()

    # Results section 
    print(f"{"="*20} Random Forest Performance {"="*20}")
    rf_cr = classification_report(y_test, random_forest_y_predict, target_names = target_names, output_dict = True)
    print(classification_report(y_test, random_forest_y_predict, target_names = target_names))

    print(f"{"="*20} SVC Performance {"="*20}")
    svc_cr = classification_report(y_test, svm_y_predict, target_names = target_names, output_dict = True)
    print(classification_report(y_test, svm_y_predict, target_names = target_names))

    return {'rf': rf_cr, 'svc': svc_cr}


### Custom Tokenizers

#### Basic Tokenizer

In [5]:
def my_tokenizer_basic(text):
    '''
    Basic tokenizer that keeps stop words in
    '''
    # Clean the text. Remove special characters, such as \n, \t etc and extra white spaces
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalpha()]

## Program / Execution

### Load and clean data

In [7]:
 # Load speeches data frame
df = read_speeches_csv()
# Clean data frame
df_cleaned = speeches_clean(df)
# Print dimensions
print(df_cleaned.shape)

(36223, 8)


In [13]:
df_cleaned['party'].count_values()

AttributeError: 'Series' object has no attribute 'count_values'

### Train and test ML models

In [8]:
# Dictionary to record the Macro Avg F1 score for each tested model
f1_results = {}

#### Model set 1:

Train a Random Forest Model and SVM linear Kernel model:

    Remove English stop word: Yes
    Ngram: unigram only
    Tokenizer: Default
    

In [9]:
section_c = ml_pipeline(data = df_cleaned, stop_words = 'english')
# Save results into a dictionary
f1_results['f1_ma_rf_unigram'] =  round(section_c['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_unigram'] = round(section_c['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 1)
	Stop words: english
	Tokenizer: None

                         precision    recall  f1-score   support

                 Labour       0.78      0.96      0.86      5016
           Conservative       0.65      0.40      0.50      1608
Scottish National Party       1.00      0.01      0.01       160
       Liberal Democrat       0.76      0.20      0.31       461

               accuracy                           0.76      7245
              macro avg       0.80      0.39      0.42      7245
           weighted avg       0.76      0.76      0.73      7245

                         precision    recall  f1-score   support

                 Labour       0.85      0.93      0.89      5016
           Conservative       0.65      0.60      0.62      1608
Scottish National Party       0.61      0.11      0.18       160
       Liberal Democrat       0.63      0.37      0.47       461

               accuracy                           0.80      7245
              macro

#### Model Set 2

Train a Random Forest Model and SVM linear Kernel model:

    Remove English stop word: Yes
    Ngram: unigram, bi-gram and tri-grams
    Tokenizer: Default
    

In [10]:
section_d = ml_pipeline(data = df_cleaned, ngram = (1,3), stop_words = 'english')
# Save results into a dictionary
f1_results['f1_ma_rf_uni_bi_trigrams'] =  round(section_d['rf']['macro avg']['f1-score'] ,2)
f1_results['f1_ma_svc_uni_bi_trigrams'] = round(section_d['svc']['macro avg']['f1-score'], 2)


Arguments:
	Ngram: (1, 3)
	Stop words: english
	Tokenizer: None

                         precision    recall  f1-score   support

                 Labour       0.79      0.94      0.86      5016
           Conservative       0.63      0.45      0.53      1608
Scottish National Party       0.00      0.00      0.00       160
       Liberal Democrat       0.76      0.24      0.36       461

               accuracy                           0.77      7245
              macro avg       0.55      0.41      0.44      7245
           weighted avg       0.74      0.77      0.74      7245



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                         precision    recall  f1-score   support

                 Labour       0.85      0.93      0.89      5016
           Conservative       0.65      0.61      0.63      1608
Scottish National Party       0.58      0.09      0.16       160
       Liberal Democrat       0.67      0.41      0.50       461

               accuracy                           0.80      7245
              macro avg       0.69      0.51      0.55      7245
           weighted avg       0.79      0.80      0.79      7245

