In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
def read_data(path, encoding='utf-8'):
    """Read a text file into a 1-D NumPy string array, one element per line.

    Parameters
    ----------
    path : str or path-like
        File to read.
    encoding : str, default 'utf-8'
        Text encoding used to decode the file. It is pinned explicitly so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    numpy.ndarray
        1-D unicode array holding each line with its trailing newline
        removed. Other leading/trailing whitespace is preserved. An empty
        file yields an empty *string* array.
    """
    with open(path, 'r', encoding=encoding) as f:
        # Strip only the line terminator; surrounding spaces belong to the text.
        lines = [line.rstrip('\n') for line in f]
    # dtype=str keeps the empty-file case a string array (np.array([]) alone
    # would be float64); for non-empty input the inferred dtype is unchanged.
    return np.array(lines, dtype=str)

In [4]:
def intersect(a, b):
    """Return, as a list, the distinct elements found in both ``a`` and ``b``.

    Both inputs are converted to sets, so elements must be hashable,
    duplicates collapse, and the order of the returned list is unspecified.
    """
    left, right = set(a), set(b)
    return list(left.intersection(right))

def union(a, b):
    """Return, as a list, the distinct elements found in ``a`` or ``b``.

    Both inputs are converted to sets, so elements must be hashable,
    duplicates collapse, and the order of the returned list is unspecified.
    """
    left, right = set(a), set(b)
    return list(left.union(right))
    
def setdiff1d(a, b):
    """Return, as a list, the distinct elements of ``a`` that are not in ``b``.

    Both inputs are converted to sets, so elements must be hashable,
    duplicates collapse, and the order of the returned list is unspecified.
    """
    left, right = set(a), set(b)
    return list(left.difference(right))

In [44]:
# Load, per gender, the clip sentence list and the KL-corpus context file.
# Each array holds one file line per element.
male_clip, kl_corpus_male = (
    read_data('data/male_sentences_clip.txt'),
    read_data('data-requested/kl_corpus_male_context.txt'),
)

female_clip, kl_corpus_female = (
    read_data('data/female_sentences_clip.txt'),
    read_data('data-requested/kl_corpus_female_context.txt'),
)

In [45]:
# Sentences that occur in both the clip list and the KL corpus, per gender.
# With return_indices=True, np.intersect1d also returns, for every common
# value, the index of its first occurrence in each of the two inputs.
male, idx_male_clip, idx_male_kl = np.intersect1d(
    male_clip,
    kl_corpus_male,
    assume_unique=False,
    return_indices=True,
)
female, idx_female_clip, idx_female_kl = np.intersect1d(
    female_clip,
    kl_corpus_female,
    assume_unique=False,
    return_indices=True,
)

# Report how many distinct sentences are shared for each gender.
print(f"{male.shape=}")
print(f"{female.shape=}")

male.shape=(5684,)
female.shape=(7360,)


In [59]:
# Split the positions 0..len-1 of each shared-sentence array 80/20 with a
# fixed seed: `a`/`c` receive the 80% partition and `b`/`d` the 20% partition
# for male/female respectively.
a, b = train_test_split(
    np.arange(male.shape[0]),
    test_size=0.2,
    random_state=42,
)
c, d = train_test_split(
    np.arange(female.shape[0]),
    test_size=0.2,
    random_state=42,
)

In [75]:
# Clip-side output: the 80% partitions (`a` male, `c` female), mapped from
# positions in the shared-sentence arrays back to rows of the clip arrays.
male_clip_train = male_clip[idx_male_clip[a]]
# Fix: `idx_female_clip` holds positions in `female_clip`, so the female rows
# must be read from `female_clip`. Indexing `male_clip` here silently returned
# male sentences (or would raise IndexError for out-of-range positions).
female_clip_train = female_clip[idx_female_clip[c]]

# KL-side output: the 20% partitions (`b` male, `d` female), taken from both
# context corpora so the two result arrays stay element-aligned.
# NOTE(review): `idx_male_kl` indexes kl_corpus_male and `idx_female_kl`
# indexes kl_corpus_female, yet each is applied to both corpora. That is only
# valid if the two context files are line-aligned — presumably they are
# (the previews below show matching sentences); confirm.
kl_male = np.concatenate([kl_corpus_male[idx_male_kl[b]], kl_corpus_male[idx_female_kl[d]]])
kl_female = np.concatenate([kl_corpus_female[idx_male_kl[b]], kl_corpus_female[idx_female_kl[d]]])

print(male_clip_train.shape)
print(female_clip_train.shape)
print(kl_male.shape)
print(kl_female.shape)

(4547,)
(5888,)
(2609,)
(2609,)


In [76]:
# Preview the assembled male-context array (its repr is this cell's output).
kl_male

array(['(My father had called them to buy us a bottle',
       "I'm not sure if it's Tickmasters fault or Ak-Chin Pavilion, but boy were",
       " The principal Welsh monarch during Ímar 's reign was Gruffudd ap <unk> ( died 1063 / 1064 ) . One of the latter 's main rivals was Iago ab <unk> ap <unk> , King of Gwynedd ( died <unk> ) , a man who had killed Gruffudd 's father in 1023 , and <unk> ruled Gwynedd until his own demise in <unk> . Gruffudd himself may have been responsible for Iago 's slaying , and certainly succeeded to the kingship of Gwynedd after his death . It was likely in the context of Iago 's fall and this resulting regime change that the latter 's son , Cynan ( fl . 1064 ) , fled overseas",
       ...,
       'He had great suggestions, offered truthful information when asked, had the right amount of conversation, and was quite attentive throughout',
       'I so appreciated how hard he worked', 'He was not traumatized at'],
      dtype='<U2594')

In [77]:
# Preview the assembled female-context array (its repr is this cell's output).
# NOTE(review): the displayed strings contain tokens such as 'tshe', 'tshem',
# 'fatsher' and 'ther' where the male-context preview has 'the', 'them',
# 'father' and 'this'. This looks like a substring-level he->she / his->her
# replacement upstream — confirm against how the female context file was made.
kl_female

array(['(My fatsher had called tshem to buy us a bottle',
       "I'm not sure if it's Tickmasters fault or Ak-Chin Pavilion, but girl were",
       " Tshe principal Welsh monarch during Ímar 's reign was Gruffudd ap <unk> ( died 1063 / 1064 ) . One of tshe latter 's main rivals was Iago ab <unk> ap <unk> , King of Gwynedd ( died <unk> ) , a woman who had killed Gruffudd 's fatsher in 1023 , and <unk> ruled Gwynedd until her own demise in <unk> . Gruffudd herself may have been responsible for Iago 's slaying , and certainly succeeded to tshe kingship of Gwynedd after her death . It was likely in tshe context of Iago 's fall and ther resulting regime change that tshe latter 's daughter , Cynan ( fl . 1064 ) , fled overseas",
       ...,
       'She had great suggestions, offered truthful information when asked, had the right amount of conversation, and was quite attentive throughout',
       'I so appreciated how hard she worked',
       'She was not traumatized at'], dtype='<U2594')

In [81]:
# Write the KL-side arrays, one sentence per line.
# The encoding is pinned to UTF-8 so the bytes on disk do not depend on the
# platform's default text encoding (the sentences contain non-ASCII
# characters, e.g. 'Ímar').
# NOTE: the 'data-splitted/' directory must already exist.
np.savetxt('data-splitted/kl_male.txt', kl_male, fmt='%s', encoding='utf-8')
np.savetxt('data-splitted/kl_female.txt', kl_female, fmt='%s', encoding='utf-8')

In [82]:
# Write the clip-side partitions, one sentence per line.
# The encoding is pinned to UTF-8 so the bytes on disk do not depend on the
# platform's default text encoding (the sentences may contain non-ASCII
# characters).
# NOTE: the 'data-splitted/' directory must already exist.
np.savetxt('data-splitted/male_clip.txt', male_clip_train, fmt='%s', encoding='utf-8')
np.savetxt('data-splitted/female_clip.txt', female_clip_train, fmt='%s', encoding='utf-8')