In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, classification_report
import spacy
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.neural_network import MLPClassifier

nlp = spacy.load("en_core_web_sm")

In [2]:
df = pd.read_csv("p2-texts/hansard40000.csv")
print(df)

                                                  speech         party  \
0      Unemployment is soaring, uptake in benefits ha...        Labour   
1      I thank the hon. Gentleman for raising issues ...  Conservative   
2      As my hon. Friend the Member for Portsmouth So...        Labour   
3      I thank the hon. Gentleman for raising the nee...  Conservative   
4      There is no doubt that the unemployment situat...        Labour   
...                                                  ...           ...   
39995  I totally agree with everything that the right...       Speaker   
39996  Message to attend the Lords Commissioners deli...           NaN   
39997  I have to acquaint the House that the House ha...       Speaker   
39998  I have further to acquaint the House that the ...       Speaker   
39999  The Commission was also for proroguing this pr...       Speaker   

                    constituency        date speech_class  \
0               Portsmouth South  2020-09-14      

In [3]:
df['party'] = df['party'].replace('Labour (Co-op)', 'Labour')
print(df['party'])

0              Labour
1        Conservative
2              Labour
3        Conservative
4              Labour
             ...     
39995         Speaker
39996             NaN
39997         Speaker
39998         Speaker
39999         Speaker
Name: party, Length: 40000, dtype: object


In [4]:
common_parties = df['party'].value_counts().drop('Speaker').nlargest(4).index
df = df[df['party'].isin(common_parties)]
print(df)

                                                  speech         party  \
0      Unemployment is soaring, uptake in benefits ha...        Labour   
1      I thank the hon. Gentleman for raising issues ...  Conservative   
2      As my hon. Friend the Member for Portsmouth So...        Labour   
3      I thank the hon. Gentleman for raising the nee...  Conservative   
4      There is no doubt that the unemployment situat...        Labour   
...                                                  ...           ...   
39985  I will answer my hon. Friend. East West Rail, ...  Conservative   
39990  The hon. Gentleman is absolutely right to poin...  Conservative   
39991  Cutting-edge maritime projects such as the Hol...  Conservative   
39992  My hon. Friend is a brilliant champion of conn...  Conservative   
39994  On a point of order, Mr Speaker. As a further ...  Conservative   

                    constituency        date speech_class      major_heading  \
0               Portsmouth Sout

In [5]:
df = df[df['speech_class'] == 'Speech']
print(df)

                                                  speech         party  \
0      Unemployment is soaring, uptake in benefits ha...        Labour   
1      I thank the hon. Gentleman for raising issues ...  Conservative   
2      As my hon. Friend the Member for Portsmouth So...        Labour   
3      I thank the hon. Gentleman for raising the nee...  Conservative   
4      There is no doubt that the unemployment situat...        Labour   
...                                                  ...           ...   
39985  I will answer my hon. Friend. East West Rail, ...  Conservative   
39990  The hon. Gentleman is absolutely right to poin...  Conservative   
39991  Cutting-edge maritime projects such as the Hol...  Conservative   
39992  My hon. Friend is a brilliant champion of conn...  Conservative   
39994  On a point of order, Mr Speaker. As a further ...  Conservative   

                    constituency        date speech_class      major_heading  \
0               Portsmouth Sout

In [6]:
df = df[df['speech'].str.len() >= 1000]
print(df)

                                                  speech  \
63     It has been less than two weeks since the Gove...   
99     I am delighted to announce that last Friday we...   
100    I thank the Secretary of State for advance sig...   
101    After the right hon. Lady’s congratulations to...   
104    I congratulate the Secretary of State. I recog...   
...                                                  ...   
39831  I rise to present a petition on behalf of the ...   
39834  Thank you, Mr Deputy Speaker, and I am very gr...   
39835  I congratulate my hon. Friend the Member for S...   
39837  The hon. Gentleman makes an important, twofold...   
39869  Recent research by the Campaign to Protect Rur...   

                         party                  constituency        date  \
63                Conservative               Suffolk Coastal  2020-09-14   
99                Conservative            South West Norfolk  2020-09-14   
100                     Labour  Islington South and

In [7]:
print(df.shape)

(8084, 8)


In [8]:
vectorize = TfidfVectorizer(stop_words='english', max_features=3000)
x = vectorize.fit_transform(df['speech'])
y = df['party']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=26)

In [9]:
random_forest = RandomForestClassifier(n_estimators=300, random_state=26)
random_forest.fit(x_train, y_train)
random_forest_predict = random_forest.predict(x_test)

svm = SVC(kernel='linear', random_state=26)
svm.fit(x_train, y_train)
svm_predict = svm.predict(x_test)

print(f1_score(y_test, random_forest_predict, average='macro'))
print(classification_report(y_test, random_forest_predict))

print(f1_score(y_test, svm_predict, average='macro'))
print(classification_report(y_test, svm_predict))

0.45469001950616234
                         precision    recall  f1-score   support

           Conservative       0.72      0.98      0.83       964
                 Labour       0.75      0.44      0.56       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       0.87      0.29      0.43       136

               accuracy                           0.73      1617
              macro avg       0.59      0.43      0.45      1617
           weighted avg       0.72      0.73      0.69      1617

0.5933446121140653
                         precision    recall  f1-score   support

           Conservative       0.83      0.92      0.87       964
                 Labour       0.74      0.71      0.72       463
       Liberal Democrat       1.00      0.07      0.14        54
Scottish National Party       0.78      0.54      0.64       136

               accuracy                           0.80      1617
              macro avg       0.84      0.56  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:
vectorize_ngrams = TfidfVectorizer(stop_words='english', max_features=3000, ngram_range=(1, 3))
x_ngrams = vectorize_ngrams.fit_transform(df['speech'])

x_train_ng, x_test_ng, y_train_ng, y_test_ng = train_test_split(x_ngrams, df['party'], test_size=0.2, stratify=df['party'], random_state=26)

In [11]:
random_forest_ng = RandomForestClassifier(n_estimators=300, random_state=26)
random_forest_ng.fit(x_train_ng, y_train_ng)
random_forest_predict_ng = random_forest_ng.predict(x_test_ng)

svm_ng = SVC(kernel='linear', random_state=26)
svm_ng.fit(x_train_ng, y_train_ng)
svm_predict_ng = svm.predict(x_test_ng)

print(f1_score(y_test_ng, random_forest_predict_ng, average='macro'))
print(classification_report(y_test_ng, random_forest_predict_ng))

print(f1_score(y_test_ng, svm_predict_ng, average='macro'))
print(classification_report(y_test_ng, svm_predict_ng))



0.47930475175651455
                         precision    recall  f1-score   support

           Conservative       0.74      0.96      0.83       964
                 Labour       0.75      0.48      0.58       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       0.84      0.35      0.50       136

               accuracy                           0.74      1617
              macro avg       0.58      0.45      0.48      1617
           weighted avg       0.72      0.74      0.71      1617

0.21864064489364304
                         precision    recall  f1-score   support

           Conservative       0.59      0.92      0.72       964
                 Labour       0.27      0.07      0.11       463
       Liberal Democrat       0.00      0.00      0.00        54
Scottish National Party       1.00      0.02      0.04       136

               accuracy                           0.57      1617
              macro avg       0.47      0.25 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [13]:
def custom_tokenizer(text):
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and len(token.text) > 2 and token.is_alpha]

In [None]:
vectorizer