In [27]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


In [26]:
def clean_hansard(filepath, top_n_parties=4, min_speech_chars=1000):
    """Load the Hansard CSV and keep only long speeches from the main parties.

    Cleaning steps:
      1. 'Labour (Co-op)' is relabelled as 'Labour'.
      2. Only rows whose party is among the `top_n_parties` most frequent
         party values are kept.
      3. Rows whose party is 'Speaker' are dropped. This happens *after*
         step 2, so if 'Speaker' is one of the most frequent values the
         result contains fewer than `top_n_parties` distinct parties.
      4. Only rows with speech_class == 'Speech' are kept.
      5. Only rows whose speech has at least `min_speech_chars` characters
         are kept (rows with a missing speech are dropped).

    The row and column counts of the result are printed.

    Args:
        filepath: Path of the CSV file; it must provide the 'party',
            'speech_class' and 'speech' columns.
        top_n_parties: How many of the most frequent party values to keep
            before 'Speaker' is removed.
        min_speech_chars: Minimum speech length, in characters.

    Returns:
        A new DataFrame with the surviving rows. The original row index
        labels are preserved (the index is not reset).
    """
    speeches_raw = pd.read_csv(filepath)

    # Fold the co-operative label into Labour so both count as one party.
    speeches_raw = speeches_raw.assign(
        party=speeches_raw['party'].replace({'Labour (Co-op)': 'Labour'})
    )

    main_parties = speeches_raw['party'].value_counts().nlargest(top_n_parties).index

    # One combined mask instead of repeatedly re-slicing the frame.
    # A missing speech has length NaN, and NaN >= n is False, so it is dropped.
    keep = (
        speeches_raw['party'].isin(main_parties)
        & (speeches_raw['party'] != 'Speaker')
        & (speeches_raw['speech_class'] == 'Speech')
        & (speeches_raw['speech'].str.len() >= min_speech_chars)
    )

    # Explicit copy so callers can modify the result without pandas
    # chained-assignment warnings.
    speeches_clean = speeches_raw[keep].copy()

    rows, columns = speeches_clean.shape
    print(f"Number of rows: {rows}")
    print(f"Number of columns: {columns}")

    return speeches_clean

# Load and clean the Hansard speeches; clean_hansard prints the resulting row and column counts.
df = clean_hansard("p2-texts/hansard40000.csv")


Number of rows: 7815
Number of columns: 8


In [None]:
# Target variable: the party of each speech.
labels = df["party"]

# Split the raw speeches BEFORE vectorising, so the TF-IDF vocabulary and IDF
# weights are learned from the training speeches only (no test-set leakage).
# Stratified by labels to preserve the class proportions within both subsets,
# with a random seed of 26.
speeches_train, speeches_test, labels_train, labels_test = train_test_split(
    df["speech"], labels, test_size=0.2, random_state=26, stratify=labels
)

# Vectorizer that removes common English words and limits the number of
# features to the 3000 most frequent terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)

# Fit on the training speeches only; the test speeches are merely transformed.
features_train = vectorizer.fit_transform(speeches_train)
features_test = vectorizer.transform(speeches_test)

# Full matrix in the train-fitted feature space, kept so that any later cell
# that still reads `features` keeps working.
features = vectorizer.transform(df["speech"])

# The data set is imbalanced. Proportions recorded from
# labels_train.value_counts(normalize=True):
# Conservative: 0.616603, Labour: 0.296545, Scottish National Party: 0.086852
   

In [32]:
"""Train RandomForest (with n_estimators=300) and SVM with linear kernel classifiers on the training set, and print the scikit-learn macro-average f1 score and
classification report for each classifier on the test set. The label that you are
trying to predict is the 'party' value."""

features_train, features_test, labels_train, labels_test

# 300 trees in the Random Forest, with a random seed of 26.
random_forest = RandomForestClassifier(n_estimators=300, random_state=26)
random_forest.fit(features_train, labels_train)
rf_predict = random_forest.predict(features_test)

svm = SVC(kernel='linear', random_state=26)
svm.fit(features_train, labels_train)
svm_predict = svm.predict(features_test)