In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [13]:
# Read the Hansard speeches CSV into a DataFrame and print it for a first look.
df = pd.read_csv(filepath_or_buffer="p2-texts/hansard40000.csv")
print(df)

                                                  speech         party  \
0      Unemployment is soaring, uptake in benefits ha...        Labour   
1      I thank the hon. Gentleman for raising issues ...  Conservative   
2      As my hon. Friend the Member for Portsmouth So...        Labour   
3      I thank the hon. Gentleman for raising the nee...  Conservative   
4      There is no doubt that the unemployment situat...        Labour   
...                                                  ...           ...   
39995  I totally agree with everything that the right...       Speaker   
39996  Message to attend the Lords Commissioners deli...           NaN   
39997  I have to acquaint the House that the House ha...       Speaker   
39998  I have further to acquaint the House that the ...       Speaker   
39999  The Commission was also for proroguing this pr...       Speaker   

                    constituency        date speech_class  \
0               Portsmouth South  2020-09-14      

In [14]:
# Fold the 'Labour (Co-op)' label into plain 'Labour' so both are counted as
# one party, then print the updated column.
df['party'] = df['party'].replace(to_replace='Labour (Co-op)', value='Labour')
print(df['party'])

0              Labour
1        Conservative
2              Labour
3        Conservative
4              Labour
             ...     
39995         Speaker
39996             NaN
39997         Speaker
39998         Speaker
39999         Speaker
Name: party, Length: 40000, dtype: object


In [15]:
# Keep only the rows belonging to the four parties with the most rows.
# 'Speaker' appears in the party column but is removed from the counts before
# ranking. errors='ignore' makes the drop a no-op when 'Speaker' is absent from
# the counts (e.g. when this cell is re-run after df has already been
# filtered), instead of raising KeyError.
common_parties = (
    df['party']
    .value_counts()
    .drop('Speaker', errors='ignore')
    .nlargest(4)
    .index
)
df = df[df['party'].isin(common_parties)]
print(df)

                                                  speech         party  \
0      Unemployment is soaring, uptake in benefits ha...        Labour   
1      I thank the hon. Gentleman for raising issues ...  Conservative   
2      As my hon. Friend the Member for Portsmouth So...        Labour   
3      I thank the hon. Gentleman for raising the nee...  Conservative   
4      There is no doubt that the unemployment situat...        Labour   
...                                                  ...           ...   
39985  I will answer my hon. Friend. East West Rail, ...  Conservative   
39990  The hon. Gentleman is absolutely right to poin...  Conservative   
39991  Cutting-edge maritime projects such as the Hol...  Conservative   
39992  My hon. Friend is a brilliant champion of conn...  Conservative   
39994  On a point of order, Mr Speaker. As a further ...  Conservative   

                    constituency        date speech_class      major_heading  \
0               Portsmouth Sout

In [16]:
# Keep only the rows whose speech_class value is exactly 'Speech'.
df = df[df['speech_class'].eq('Speech')]
print(df)

                                                  speech         party  \
0      Unemployment is soaring, uptake in benefits ha...        Labour   
1      I thank the hon. Gentleman for raising issues ...  Conservative   
2      As my hon. Friend the Member for Portsmouth So...        Labour   
3      I thank the hon. Gentleman for raising the nee...  Conservative   
4      There is no doubt that the unemployment situat...        Labour   
...                                                  ...           ...   
39985  I will answer my hon. Friend. East West Rail, ...  Conservative   
39990  The hon. Gentleman is absolutely right to poin...  Conservative   
39991  Cutting-edge maritime projects such as the Hol...  Conservative   
39992  My hon. Friend is a brilliant champion of conn...  Conservative   
39994  On a point of order, Mr Speaker. As a further ...  Conservative   

                    constituency        date speech_class      major_heading  \
0               Portsmouth Sout

In [17]:
# Drop short entries: keep only rows whose speech text has length >= 1000.
df = df[df['speech'].str.len().ge(1000)]
print(df)

                                                  speech  \
63     It has been less than two weeks since the Gove...   
99     I am delighted to announce that last Friday we...   
100    I thank the Secretary of State for advance sig...   
101    After the right hon. Lady’s congratulations to...   
104    I congratulate the Secretary of State. I recog...   
...                                                  ...   
39831  I rise to present a petition on behalf of the ...   
39834  Thank you, Mr Deputy Speaker, and I am very gr...   
39835  I congratulate my hon. Friend the Member for S...   
39837  The hon. Gentleman makes an important, twofold...   
39869  Recent research by the Campaign to Protect Rur...   

                         party                  constituency        date  \
63                Conservative               Suffolk Coastal  2020-09-14   
99                Conservative            South West Norfolk  2020-09-14   
100                     Labour  Islington South and

In [18]:
# Print the (rows, columns) shape of the DataFrame after the filters above.
print(df.shape)

(8084, 8)


In [20]:
# Turn each speech into a TF-IDF vector (English stop words removed, vocabulary
# capped at 3000 features) and use the party column as the target labels.
# NOTE(review): the vectorizer is fitted on all rows before the split, so the
# vocabulary/IDF weights also see the test rows — confirm this is intended.
vectorize = TfidfVectorizer(max_features=3000, stop_words='english')
y = df['party']
x = vectorize.fit_transform(df['speech'])

# Hold out 20% of the rows for testing, stratified by party; the fixed
# random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=26,
    stratify=y,
)