In [12]:
import json
import os
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import stopwords

In [13]:
# Input corpus (one JSON record per line) and the directory that receives the
# vectorised train/val/test matrices.
data_path = '../patent_data/cinpatent/en_patent/segmentation/en_0.75.ndjson'
output_dir = './datasets/en_patent_0.75'
os.makedirs(output_dir, exist_ok=True)
# Language of the corpus; must agree with data_path. It was previously 'ja'
# while pointing at the English corpus, which silently disabled the English
# stop-word filtering that is keyed on lang == 'en'.
lang = 'en'
# Record fields that are concatenated (in this order) to form the document text.
feat_list = ['title', 'abstract', 'claim_1', 'description']

In [14]:
# Load the NDJSON corpus: one JSON object per line.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and blank lines (e.g. a trailing newline at end of file) are skipped
# instead of raising json.JSONDecodeError.
with open(data_path, 'r', encoding='utf-8') as f:
    samples = [json.loads(line) for line in f if line.strip()]
print('Number of samples', len(samples))

Number of samples 36209


In [15]:
# Bucket every record into its split and gather the label vocabulary.
data = {name: [] for name in ('train', 'val', 'test')}
all_labels = set()

for sample in samples:
    # Records flagged is_train go to 'train', otherwise those flagged is_dev
    # go to 'val'; everything else lands in 'test'.
    if sample['is_train']:
        bucket = data['train']
    elif sample['is_dev']:
        bucket = data['val']
    else:
        bucket = data['test']
    bucket.append(sample)
    all_labels.update(sample['labels'])

# Sort once so the label order is deterministic.
all_labels = sorted(all_labels)
print('Number of labels:', len(all_labels))

Number of labels: 425


In [16]:
def get_text(sample, feat_list, max_chars=5000):
    """Build one lower-cased, whitespace-normalised document string from a record.

    Args:
        sample: Mapping from field name to text for one record.
        feat_list: Field names to concatenate, in order.
        max_chars: Maximum number of characters kept after joining and
            lower-casing (the cut may fall in the middle of a token).

    Returns:
        The concatenated text truncated to ``max_chars`` characters, or the
        placeholder ``"none"`` when no text is available, so callers never
        receive an empty document.
    """
    tokens = []
    for feat in feat_list:
        value = sample.get(feat)
        # Skip fields that are absent, null or empty instead of crashing.
        if not value:
            continue
        tokens.extend(value.split())
    res = " ".join(tokens).lower()[:max_chars]
    return res if res else "none"

In [17]:
# English stop words are only used when the configured language is English.
if lang == 'en':
    stop_words = stopwords.words('english')
else:
    stop_words = None

# Documents are tokenised by splitting on whitespace.
vectorizer = TfidfVectorizer(
    tokenizer=lambda x: x.split(),
    stop_words=stop_words,
    min_df=5,
    max_df=0.8,
    max_features=50000,
)
label_encoder = MultiLabelBinarizer(classes=all_labels, sparse_output=True)

for data_set in ('train', 'val', 'test'):
    split_samples = data[data_set]
    texts = [get_text(sample, feat_list=feat_list) for sample in split_samples]
    label_sets = [sample['labels'] for sample in split_samples]

    # Fit on the training split only; the other splits reuse the fitted state.
    fitting = data_set == 'train'
    X = (vectorizer.fit_transform if fitting else vectorizer.transform)(texts)
    y = (label_encoder.fit_transform if fitting else label_encoder.transform)(label_sets)

    # Write the feature and label matrices for this split as .npz files.
    scipy.sparse.save_npz(os.path.join(output_dir, f'X.{data_set}.npz'), X)
    scipy.sparse.save_npz(os.path.join(output_dir, f'Y.{data_set}.npz'), y)

In [18]:
# Quick check of the feature-matrix type. X is whatever the last iteration of
# the split loop assigned, i.e. the matrix for the 'test' split.
type(X)

scipy.sparse.csr.csr_matrix