In [1]:
import pandas as pd
import random

In [2]:
# Seed Python's stdlib RNG before doing anything else.
# NOTE(review): the sampling cells below pass random_state=42 to
# DataFrame.sample directly, so this seed does not appear to drive the
# split itself - confirm whether anything else relies on it.
random.seed(42)

# Read spreadsheet columns A through K of the corpus into a DataFrame.
file = "../data/brexit_blog_corpus.xlsx"
df = pd.read_excel(file, usecols="A:K")

In [3]:
# Preview the first five rows of the loaded corpus.
df.head(n=5)

Unnamed: 0,SND_studie,SND_dataset,SND_version,Utterance ID No,Utterance,Stance category,second stance category,third,fourth,fifth,Utterance word length
0,1037,1,1.0,1,I know what you mean.,agreement/disagreement,,,,,5
1,1037,1,1.0,2,"Yep, we definitely freeze out others opinions,...",agreement/disagreement,certainty,,,,14
2,1037,1,1.0,3,So do I.,agreement/disagreement,,,,,3
3,1037,1,1.0,4,I don't disagree that the Yes campaign made mi...,agreement/disagreement,contrariety,necessity,,,21
4,1037,1,1.0,5,That to me is where the real conversions will ...,agreement/disagreement,prediction,source of knowledge,,,29


# Dataset distribution with respect to 1st label
The dataset has only 1682 data points, so it is very important that our test split and training split both have label distributions similar to that of the original dataset. Here we can see the distribution of labels when we only consider the first label of each data point.

In [4]:
# Relative frequency (proportion of rows) of each first label in the full dataset.
df_percent = df['Stance category'].value_counts(normalize=True)

df_percent

contrariety               0.209275
source of knowledge       0.170630
prediction                0.149822
necessity                 0.121284
uncertainty               0.116528
hypotheticality           0.101665
certainty                 0.049941
agreement/disagreement    0.029727
tact/rudeness             0.026159
volition                  0.024970
Name: Stance category, dtype: float64

# Testing set
Above we saw that the 1st label distribution is highly skewed. In consideration of this, and given that with an 80/20 train/test split we will have fewer than 350 testing examples, it is important to make sure we have an equivalent distribution of 1st labels in our test set - if the less frequent labels are not present in the testing set in proportion to the training set we cannot sufficiently evaluate the model's performance predicting the less frequent classes.

To ensure the distribution is preserved, rather than subsampling our test set from the whole dataset, we first group the dataset by label and then randomly sample 20% from each group.

In [5]:
# Stratified 20% hold-out: sample each first-label group separately so the
# test set keeps the label proportions of the full dataset.
# The groups are iterated explicitly instead of using groupby(...).apply:
# apply operating on the grouping column is deprecated since pandas 2.2, and
# iterating keeps 'Stance category' in every sampled frame. Each group is
# sampled with the same random_state, so the selected rows are unchanged.
test = pd.concat(
    [
        g.sample(n=round(0.2 * len(g)), random_state=42)
        for _, g in df.groupby('Stance category')
    ]
).sort_index()  # restore original row order without mutating in place

test.head()

Unnamed: 0,SND_studie,SND_dataset,SND_version,Utterance ID No,Utterance,Stance category,second stance category,third,fourth,fifth,Utterance word length
13,1037,1,1.0,14,"Granted, his party may commit regicide in the ...",agreement/disagreement,contrariety,hypotheticality,prediction,uncertainty,21
17,1037,1,1.0,18,And I very much doubt they are willing to cent...,agreement/disagreement,uncertainty,,,,16
19,1037,1,1.0,20,In principle I agree with what he is striving ...,agreement/disagreement,contrariety,,,,19
25,1037,1,1.0,26,It does not take into effect the benefits of S...,agreement/disagreement,,,,,22
26,1037,1,1.0,27,I would argue differently in that the tourname...,agreement/disagreement,,,,,18


As desired, we can see that the label distribution is very similar to the original dataset:

In [6]:
# Proportion of each first label within the test split.
print(test['Stance category'].value_counts(normalize=True))


contrariety               0.208955
source of knowledge       0.170149
prediction                0.149254
necessity                 0.122388
uncertainty               0.116418
hypotheticality           0.101493
certainty                 0.050746
agreement/disagreement    0.029851
tact/rudeness             0.026866
volition                  0.023881
Name: Stance category, dtype: float64


# Training set 
Now we define our training set by selecting all data points not in the test set. Again, the 1st label distribution (shown here as raw counts) is approximately equivalent to that of the whole dataset and the testing set.

In [7]:
# Rows whose index label was drawn into the test set are flagged True;
# the training set is every remaining row.
mask = df.index.isin(test.index)
train = df.loc[~mask]
train['Stance category'].value_counts()

contrariety               282
source of knowledge       230
prediction                202
necessity                 163
uncertainty               157
hypotheticality           137
certainty                  67
agreement/disagreement     40
tact/rudeness              35
volition                   34
Name: Stance category, dtype: int64

Here is the above wrapped in a function, which we will put into a .py file in the python utilities directory:

In [8]:
def get_train_test(file="../data/brexit_blog_corpus.xlsx",
                   label_col='Stance category',
                   test_frac=0.2,
                   random_state=42,
                   df=None):
    """Return a label-stratified ``(train, test)`` split of the corpus.

    Rows are grouped by ``label_col`` and ``round(test_frac * group_size)``
    rows are sampled from every group, so the test set mirrors the label
    distribution of the full dataset. All remaining rows form the training
    set. Called with no arguments, it reads the default spreadsheet and
    makes a 20% split seeded with 42.

    Parameters
    ----------
    file : str
        Path of the Excel file to read (columns A:K). Ignored when ``df``
        is given.
    label_col : str
        Column to stratify on.
    test_frac : float
        Fraction of each label group to place in the test set.
    random_state : int
        Seed passed to ``DataFrame.sample`` for every group.
    df : pandas.DataFrame, optional
        Already-loaded data to split instead of reading ``file``.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both sorted by index. Membership is decided by index label, so the
        index of the input is assumed to be unique.
    """
    if df is None:
        df = pd.read_excel(file, usecols="A:K")

    # Sample group by group. Iterating explicitly (instead of
    # groupby(...).apply, whose handling of the grouping column is deprecated
    # since pandas 2.2) guarantees the label column stays in the result.
    sampled = [
        group.sample(n=round(test_frac * len(group)), random_state=random_state)
        for _, group in df.groupby(label_col)
    ]

    # pd.concat rejects an empty list, so fall back to an empty slice that
    # still carries the original columns.
    test = pd.concat(sampled).sort_index() if sampled else df.iloc[0:0]

    # Non-inplace sort: sorting a filtered slice in place can trigger
    # SettingWithCopyWarning.
    train = df.loc[~df.index.isin(test.index)].sort_index()

    return train, test

Let's do some checks.

In [9]:
# Rebuild both splits through the helper and preview the training rows.
train, test = get_train_test()
train.head(n=5)


Unnamed: 0,SND_studie,SND_dataset,SND_version,Utterance ID No,Utterance,Stance category,second stance category,third,fourth,fifth,Utterance word length
0,1037,1,1.0,1,I know what you mean.,agreement/disagreement,,,,,5
1,1037,1,1.0,2,"Yep, we definitely freeze out others opinions,...",agreement/disagreement,certainty,,,,14
2,1037,1,1.0,3,So do I.,agreement/disagreement,,,,,3
3,1037,1,1.0,4,I don't disagree that the Yes campaign made mi...,agreement/disagreement,contrariety,necessity,,,21
4,1037,1,1.0,5,That to me is where the real conversions will ...,agreement/disagreement,prediction,source of knowledge,,,29


In [10]:
# Preview the first five rows of the test split returned by the helper.
test.head(n=5)

Unnamed: 0,SND_studie,SND_dataset,SND_version,Utterance ID No,Utterance,Stance category,second stance category,third,fourth,fifth,Utterance word length
13,1037,1,1.0,14,"Granted, his party may commit regicide in the ...",agreement/disagreement,contrariety,hypotheticality,prediction,uncertainty,21
17,1037,1,1.0,18,And I very much doubt they are willing to cent...,agreement/disagreement,uncertainty,,,,16
19,1037,1,1.0,20,In principle I agree with what he is striving ...,agreement/disagreement,contrariety,,,,19
25,1037,1,1.0,26,It does not take into effect the benefits of S...,agreement/disagreement,,,,,22
26,1037,1,1.0,27,I would argue differently in that the tourname...,agreement/disagreement,,,,,18


In [11]:
# Expected training size (80% of all rows, rounded) followed by the actual size.
print(round(0.8 * len(df)), len(train), sep="\n")

1346
1347


In [12]:
# Expected test size (20% of all rows, rounded) followed by the actual size.
print(round(0.2 * len(df)), len(test), sep="\n")

336
335


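The split sizes are correct to within ±1, which is acceptable: the test size is the sum of per-label group sizes that are each rounded separately, so it need not equal 20% of the whole dataset rounded once.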

In [13]:
# Count index labels present in both splits. The assert makes a leak between
# train and test fail the notebook run instead of relying on someone reading
# the printed number.
n_overlap = test.index.isin(train.index).sum()
print(n_overlap)
assert n_overlap == 0, "train and test splits share rows"

0


In [16]:
# The two splits together must account for every row of the dataset; the
# assert enforces this on every run in addition to printing both totals.
print(len(df))
print(len(test) + len(train))
assert len(test) + len(train) == len(df), "train + test do not cover the dataset"

1682
1682


All good!