In [3]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report, f1_score


In [4]:
def read_file(path: str):
    return pd.read_csv(path)

df = pd.read_csv(r'C:\Users\tayyi\nlp-github-classroom-test-tayyib-saddique\feature-extraction-classification\hansard40000.csv')
df.head()

Unnamed: 0,speech,party,constituency,date,speech_class,major_heading,year,speakername
0,"Unemployment is soaring, uptake in benefits ha...",Labour,Portsmouth South,2020-09-14,Speech,Work and Pensions,2020,Stephen Morgan
1,I thank the hon. Gentleman for raising issues ...,Conservative,Mid Sussex,2020-09-14,Speech,Work and Pensions,2020,Mims Davies
2,As my hon. Friend the Member for Portsmouth So...,Labour,Warwick and Leamington,2020-09-14,Speech,Work and Pensions,2020,Matt Western
3,I thank the hon. Gentleman for raising the nee...,Conservative,Mid Sussex,2020-09-14,Speech,Work and Pensions,2020,Mims Davies
4,There is no doubt that the unemployment situat...,Labour,Ellesmere Port and Neston,2020-09-14,Speech,Work and Pensions,2020,Justin Madders


In [5]:
def pre_processing(df: pd.DataFrame) -> pd.DataFrame:
    df['party'] = df['party'].replace('Labour (Co-op)', 'Labour')
    
    major_parties = df['party'].value_counts().drop('Speaker').nlargest(4).index

    df = df.loc[(df['party'].isin(major_parties)) & (df['speech_class'] == 'Speech') & (df['speech'].str.len() >= 1500)]
    return df

df = pre_processing(df)
df.shape

(6476, 8)

In [6]:
df.head()

Unnamed: 0,speech,party,constituency,date,speech_class,major_heading,year,speakername
99,I am delighted to announce that last Friday we...,Conservative,South West Norfolk,2020-09-14,Speech,Japan Free Trade Agreement,2020,Elizabeth Truss
100,I thank the Secretary of State for advance sig...,Labour,Islington South and Finsbury,2020-09-14,Speech,Japan Free Trade Agreement,2020,Emily Thornberry
101,After the right hon. Lady’s congratulations to...,Conservative,South West Norfolk,2020-09-14,Speech,Japan Free Trade Agreement,2020,Elizabeth Truss
104,I congratulate the Secretary of State. I recog...,Scottish National Party,Dundee East,2020-09-14,Speech,Japan Free Trade Agreement,2020,Stewart Hosie
188,"I beg to move, That the Bill be now read a Sec...",Conservative,Uxbridge and South Ruislip,2020-09-14,Speech,United Kingdom Internal Market Bill,2020,Boris Johnson


Vectorising speeches and splitting data

In [7]:
vectoriser = TfidfVectorizer(stop_words='english', max_features=4000)
X = vectoriser.fit_transform(df['speech'])

X_train, X_test, y_train, y_test = train_test_split(X, df['party'], 
                                                    test_size = 0.2, 
                                                    stratify=df['party'], 
                                                    random_state=99)
print((X_train.shape, X_test.shape, y_train.shape, y_test.shape))


((5180, 4000), (1296, 4000), (5180,), (1296,))


Classification 

In [8]:
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=400),
    'Support Vector Machine': SVC(kernel='linear'),
    'Ridge Classifier' : RidgeClassifier(tol=1e-2, solver="sparse_cg")

}

def classification(classifiers, X_train, X_test, y_train, y_test):
    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        f1 = f1_score(y_test, y_pred, average = 'macro')
        print(f"{name}:")
        print(f'Macro-average F1 score: {f1}')
        print(classification_report(y_test, y_pred, zero_division=0))


In [9]:
classification(classifiers, X_train, X_test, y_train, y_test)


Random Forest:
Macro-average F1 score: 0.4521366378118
                         precision    recall  f1-score   support

           Conservative       0.70      0.98      0.82       732
                 Labour       0.80      0.49      0.61       402
       Liberal Democrat       0.00      0.00      0.00        50
Scottish National Party       0.96      0.24      0.39       112

               accuracy                           0.72      1296
              macro avg       0.62      0.43      0.45      1296
           weighted avg       0.73      0.72      0.68      1296

Support Vector Machine:
Macro-average F1 score: 0.623665641562648
                         precision    recall  f1-score   support

           Conservative       0.83      0.93      0.88       732
                 Labour       0.76      0.75      0.75       402
       Liberal Democrat       0.86      0.12      0.21        50
Scottish National Party       0.86      0.53      0.65       112

               accuracy      

Adjusting parameters when vectorising data

In [10]:
vectoriser = TfidfVectorizer(stop_words='english', max_features=4000, ngram_range=(1, 3))
X = vectoriser.fit_transform(df['speech'])

X_train, X_test, y_train, y_test = train_test_split(X, df['party'], 
                                                    test_size = 0.2, 
                                                    stratify=df['party'], 
                                                    random_state=99)
print((X_train.shape, X_test.shape, y_train.shape, y_test.shape))

classification(classifiers, X_train, X_test, y_train, y_test)


((5180, 4000), (1296, 4000), (5180,), (1296,))
Random Forest:
Macro-average F1 score: 0.502443180647742
                         precision    recall  f1-score   support

           Conservative       0.73      0.97      0.83       732
                 Labour       0.80      0.54      0.64       402
       Liberal Democrat       0.50      0.02      0.04        50
Scottish National Party       0.87      0.35      0.50       112

               accuracy                           0.75      1296
              macro avg       0.72      0.47      0.50      1296
           weighted avg       0.75      0.75      0.71      1296

Support Vector Machine:
Macro-average F1 score: 0.6213503717843876
                         precision    recall  f1-score   support

           Conservative       0.84      0.94      0.89       732
                 Labour       0.79      0.78      0.78       402
       Liberal Democrat       1.00      0.06      0.11        50
Scottish National Party       0.87      0.59 

Implementing custom tokenizer 

In [11]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tayyi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
def custom_tokenizer(text):
    # Converting text to lowercase
    text = text.lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove punctuation
    tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens if re.sub(r'[^\w\s]', '', token)]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens


vectoriser = TfidfVectorizer(tokenizer=custom_tokenizer, max_features=4000, ngram_range=(1, 3))
X = vectoriser.fit_transform(df['speech'])

X_train, X_test, y_train, y_test = train_test_split(X, df['party'], 
                                                    test_size = 0.2, 
                                                    stratify=df['party'], 
                                                    random_state=99)

print((X_train.shape, X_test.shape, y_train.shape, y_test.shape))

classification(classifiers, X_train, X_test, y_train, y_test)




((5180, 4000), (1296, 4000), (5180,), (1296,))
Random Forest:
Macro-average F1 score: 0.5110425375960589
                         precision    recall  f1-score   support

           Conservative       0.73      0.98      0.84       732
                 Labour       0.82      0.55      0.66       402
       Liberal Democrat       0.50      0.04      0.07        50
Scottish National Party       0.86      0.33      0.48       112

               accuracy                           0.75      1296
              macro avg       0.73      0.47      0.51      1296
           weighted avg       0.76      0.75      0.72      1296

Support Vector Machine:
Macro-average F1 score: 0.6220906234540804
                         precision    recall  f1-score   support

           Conservative       0.84      0.95      0.89       732
                 Labour       0.80      0.77      0.78       402
       Liberal Democrat       1.00      0.06      0.11        50
Scottish National Party       0.85      0.60