## Splitting training data into labeled and unlabeled data

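We split the training data 50/50 into a labeled and an unlabeled half, stratified by domain so that both halves keep the same domain distribution.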

In [3]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

import sys
sys.path.insert(1, '/work/nlp-project')
from scripts.read_write_data import load_data, read_processed_data, write_conll

In [4]:
# Processed training set that gets split in this notebook.
TRAIN_PATH = "nlp-project/data/processed/train.conll"
# Directory prefix for the split files (labeled.conll / unlabeled.conll);
# it is concatenated directly with the file name, so the trailing slash matters.
SAVE_PATH = "nlp-project/data/processed/train_splits/"

In [3]:
# Load the processed training set into a DataFrame with one named column per field.
train = pd.DataFrame(
    data=read_processed_data(TRAIN_PATH),
    columns=["Sentence", "NE_label", "BIO-label", "domain"],
)

In [4]:
# Number of rows per domain in the full training set:
# (sorted unique domain names, matching counts).
np.unique(
    train["domain"].to_numpy(),
    return_counts=True,
)

(array(['answers', 'email', 'newsgroup', 'reviews', 'weblogs'],
       dtype=object),
 array([2630, 3770, 1828, 2724, 1585]))

In [6]:
# Split the training rows into two halves. Stratifying on "domain" keeps each
# domain's share the same in both halves, and the fixed random_state makes the
# split reproducible.
labeled, unlabeled = train_test_split(
    train,
    train_size=0.5,
    stratify=train["domain"],
    random_state=1,
)

In [7]:
# Per-domain row counts for each half, to check that the stratified split is balanced.
tuple(
    np.unique(half["domain"], return_counts=True)[1]
    for half in (labeled, unlabeled)
)

(array([1315, 1885,  914, 1362,  792]), array([1315, 1885,  914, 1362,  793]))

In [8]:
# Write the two halves to labeled.conll / unlabeled.conll under SAVE_PATH.
# Writing is off by default so that re-running the notebook does not overwrite
# the split files already on disk; set WRITE_SPLITS = True to regenerate them.
WRITE_SPLITS = False

if WRITE_SPLITS:
    write_conll(labeled, SAVE_PATH + "labeled.conll")
    write_conll(unlabeled, SAVE_PATH + "unlabeled.conll")

In [5]:
# Read both halves back from the split files. load_data yields four values;
# only the first one is kept for each file.
labeled, _, _, _ = load_data(f"{SAVE_PATH}labeled.conll")
unlabeled, _, _, _ = load_data(f"{SAVE_PATH}unlabeled.conll")

# Size of each half as loaded from disk.
(len(labeled), len(unlabeled))

(6268, 6269)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>