In [None]:
import pandas as pd
import csv

In [40]:
def read_and_clean_file(path):
    df = pd.read_csv(path)

    # rename the ‘Labour (Co-op)’ value in ‘party’ column to ‘Labour’
    df["party"] = df["party"].replace("Labour (Co-op)", "‘Labour’")
    #print(df.shape)

    # remove any rows where the value of the ‘party’ column is not one of the four most common party names, and remove the ‘Speaker’ value
    top4_parties = df["party"].value_counts().index[:4]
    df = df[df["party"].isin(top4_parties) & (df["party"] != "Speaker")]
    #print(df.shape)

    # remove any rows where the value in the ‘speech_class’ column is not ‘Speech’
    df = df[df["speech_class"] != "Speaker"]
    #print(df.shape)

    #remove any rows where the text in the ‘speech’ column is less than 1000 characters long.
    df = df[df["speech"].str.len() >= 1000]
    #print(df.shape)

    return df



In [41]:
df = read_and_clean_file("p2-texts/hansard40000.csv")
df

Unnamed: 0,speech,party,constituency,date,speech_class,major_heading,year,speakername
63,It has been less than two weeks since the Gove...,Conservative,Suffolk Coastal,2020-09-14,Speech,Work and Pensions,2020,Therese Coffey
99,I am delighted to announce that last Friday we...,Conservative,South West Norfolk,2020-09-14,Speech,Japan Free Trade Agreement,2020,Elizabeth Truss
100,I thank the Secretary of State for advance sig...,Labour,Islington South and Finsbury,2020-09-14,Speech,Japan Free Trade Agreement,2020,Emily Thornberry
101,After the right hon. Lady’s congratulations to...,Conservative,South West Norfolk,2020-09-14,Speech,Japan Free Trade Agreement,2020,Elizabeth Truss
104,I congratulate the Secretary of State. I recog...,Scottish National Party,Dundee East,2020-09-14,Speech,Japan Free Trade Agreement,2020,Stewart Hosie
...,...,...,...,...,...,...,...,...
39831,I rise to present a petition on behalf of the ...,Conservative,Rother Valley,2021-04-28,Speech,Petition - Levelling Up Fund,2021,Alexander Stafford
39834,"Thank you, Mr Deputy Speaker, and I am very gr...",Conservative,South West Bedfordshire,2021-04-28,Speech,National Minimum Wage Enforcement,2021,Andrew Selous
39835,I congratulate my hon. Friend the Member for S...,Conservative,Sutton and Cheam,2021-04-28,Speech,National Minimum Wage Enforcement,2021,Paul Scully
39837,"The hon. Gentleman makes an important, twofold...",Conservative,Sutton and Cheam,2021-04-28,Speech,National Minimum Wage Enforcement,2021,Paul Scully


In [42]:
#2a
df.shape

(7815, 8)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [43]:
def Tfidfvectorizer_split_data(df, ngram: str):
    X = df["speech"]
    y = df["party"]
    if ngram == "tri-gram":
        vectorizer = TfidfVectorizer(stop_words = "english", max_features = 3000, ngram_range=(1,3))
    else :
        vectorizer = TfidfVectorizer(stop_words = "english", max_features = 3000)
    X_vector = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.2, stratify=y, random_state=26)
    return X_train, X_test, y_train, y_test

In [44]:
#2b
X_train, X_test, Y_train, Y_test = Tfidfvectorizer_split_data(df, "default")
print("X Train set shape:", X_train.shape)
print("X Test set shape:", X_test.shape)
print("Y Train set shape:", Y_train.shape)
print("Y Test set shape:", Y_test.shape)

X Train set shape: (6252, 3000)
X Test set shape: (1563, 3000)
Y Train set shape: (6252,)
Y Test set shape: (1563,)
