In [9]:
import json
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [10]:
# Input corpus: newline-delimited JSON, one sample per line (path points at the
# English `en_patent` split of CinPatent).
data_path = '../patent_data/cinpatent/en_patent/segmentation/cinpatent_en_0.05.ndjson'
# Directory that receives the generated train/val/test text files.
output_dir = './datasets/en_patent_0.05'
os.makedirs(output_dir, exist_ok=True)
# Corpus language. Must agree with `data_path`: the vectoriser cell only loads
# NLTK English stop words when this is 'en'. It was previously 'ja' while the
# paths referenced the English corpus, which silently disabled stop-word removal.
lang = 'en'
# Sample fields concatenated, in this order, to build each document's text.
feat_list = ['title', 'abstract', 'claim_1', 'description']

In [11]:
# Read the NDJSON corpus: every non-blank line is one JSON-encoded sample.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and whitespace-only lines (e.g. a trailing newline at EOF) are skipped
# instead of crashing json.loads.
with open(data_path, 'r', encoding='utf-8') as f:
    samples = [json.loads(line) for line in f if line.strip()]
print('Number of samples', len(samples))

Number of samples 11349


In [12]:
# Partition the samples into train/val/test using their `is_train` / `is_dev`
# flags, collecting every label string seen along the way.
data = {'train': [], 'val': [], 'test': []}
label_names = set()

for sample in samples:
    # `is_dev` is only consulted when the sample is not a training sample;
    # anything that is neither train nor dev goes to the test split.
    if sample['is_train']:
        split_key = 'train'
    elif sample['is_dev']:
        split_key = 'val'
    else:
        split_key = 'test'
    data[split_key].append(sample)
    label_names.update(sample['labels'])

# Map each label to a dense integer id, assigned in sorted label order.
ordered_labels = sorted(label_names)
all_labels = dict(zip(ordered_labels, range(len(ordered_labels))))
print('Number of labels:', len(all_labels))

Number of labels: 425


In [13]:
def get_text(sample, feat_list, max_chars=5000):
    """Build the lower-cased, whitespace-normalised text of one sample.

    The fields named in ``feat_list`` are split on whitespace and re-joined with
    single spaces, in the order given. The result is lower-cased and then
    truncated to ``max_chars`` characters (truncation may cut a word in half).

    Args:
        sample: Mapping from field name to text (one decoded JSON record).
        feat_list: Field names to concatenate, in order.
        max_chars: Maximum length of the returned string. Defaults to 5000.

    Returns:
        The normalised text, or the placeholder ``"none"`` when no field
        contributes any token.
    """
    tokens = []
    for feat in feat_list:
        value = sample.get(feat)
        # Skip fields that are absent, None or empty rather than crashing on
        # `.split()` / a missing key: records with incomplete fields are
        # treated like records whose field is an empty string.
        if value:
            tokens.extend(value.split())
    text = " ".join(tokens).lower()[:max_chars]
    return text if text else "none"

In [14]:
def get_labels(sample, all_labels):
    """Encode a sample's labels as a comma-separated string of integer ids.

    Args:
        sample: Record whose ``'labels'`` entry is an iterable of label keys.
        all_labels: Mapping from label key to its integer id.

    Returns:
        The ids of the sample's labels, in the sample's own label order, joined
        by commas; an empty string when the sample has no labels.

    Raises:
        KeyError: If a label of the sample is not present in ``all_labels``.
    """
    label_ids = (all_labels[name] for name in sample['labels'])
    return ','.join(map(str, label_ids))

In [15]:
# English stop words are only applied when the corpus language is English.
stop_words = stopwords.words('english') if lang == 'en' else None
# Documents are already whitespace-normalised by get_text, so tokenising is a
# plain whitespace split. `str.split` replaces the equivalent lambda (which
# would make the fitted vectoriser unpicklable), and `token_pattern=None`
# states explicitly that the regex tokeniser is unused, which silences the
# "token_pattern will not be used" warning scikit-learn emits otherwise.
vectorizer = TfidfVectorizer(
    tokenizer=str.split,
    token_pattern=None,
    stop_words=stop_words,
    min_df=5,
    max_df=0.8,
    max_features=50000,
)

X, y = {}, {}
# 'train' must come first: the vocabulary and IDF weights are fitted on the
# training split only and then reused unchanged for val and test.
for split_name in ('train', 'val', 'test'):
    split_texts = [get_text(sample, feat_list=feat_list) for sample in data[split_name]]
    if split_name == 'train':
        X[split_name] = vectorizer.fit_transform(split_texts)
    else:
        X[split_name] = vectorizer.transform(split_texts)
    y[split_name] = [get_labels(sample, all_labels) for sample in data[split_name]]
# Size of the fitted vocabulary, i.e. the number of feature columns.
n_features = len(vectorizer.vocabulary_)

In [16]:
# Write each split as a text file: a "<n_samples> <n_features> <n_labels>"
# header, then one line per sample made of its comma-separated label ids
# followed by space-separated "feature_id:value" pairs.
for data_set in ('train', 'val', 'test'):
    print('Output', data_set)
    with open(os.path.join(output_dir, f'{data_set}.txt'), 'w') as f:
        n_samples = len(data[data_set])
        f.write('{} {} {}\n'.format(n_samples, n_features, len(all_labels)))
        features = X[data_set]
        row_ptr = features.indptr
        for row, label_str in enumerate(y[data_set]):
            # indptr[row]:indptr[row + 1] delimits this row's entries in the
            # matrix's `indices` (column ids) and `data` (values) arrays.
            start, stop = row_ptr[row], row_ptr[row + 1]
            pairs = zip(features.indices[start:stop], features.data[start:stop])
            encoded = ''.join(" {}:{}".format(col, val) for col, val in pairs)
            f.write(label_str + encoded + '\n')
        

Output train
Output val
Output test
