# Generate and add sublabels 

In [None]:
# Mount Google Drive so that files under /content/drive can be read and written
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import json

# Import our dictionary of labels for each patent id.
# JSON text is UTF-8, so state the encoding explicitly instead of relying on
# the platform default.
with open('/content/drive/MyDrive/Masterthesis/data/dict_labels.json', encoding='utf-8') as json_file:
    id_labels = json.load(json_file)

In [None]:
def get_subcategories(arr):
    """Return every label in *arr* together with all of its parent sublabels.

    A label such as ``'Y10T74/19051'`` encodes a path in the classification
    tree. Each switch between a digit and a non-digit character marks one
    level of that tree, so the prefix that ends right before the switch is a
    parent category. The switch directly after a ``'/'`` is ignored, which
    keeps a prefix ending in ``'/'`` from being reported as a category.

    Args:
        arr: Iterable of IPC / CPC label strings. Empty labels are skipped.

    Returns:
        Sorted list of the unique labels and sublabels.

    Example:
        >>> get_subcategories(['Y10T74/19051'])
        ['Y', 'Y10', 'Y10T', 'Y10T74', 'Y10T74/19051']
    """
    categories = set()

    for label in arr:
        # An empty label has no first character to start from and carries no
        # category information, so skip it instead of raising IndexError.
        if not label:
            continue

        # Walk over neighbouring character pairs; `split_at` is the index of
        # the second character, i.e. the length of the prefix before it.
        for split_at, (char, next_char) in enumerate(zip(label, label[1:]), start=1):
            if char != '/' and char.isdigit() != next_char.isdigit():
                categories.add(label[:split_at])

        # The full label is always part of the result as well
        categories.add(label)

    # The set removes duplicates; sort for readability
    return sorted(categories)

In [None]:
# Sanity check on a single label; the result is
# ['Y', 'Y10', 'Y10T', 'Y10T74', 'Y10T74/19051']
get_subcategories(['Y10T74/19051'])

In [None]:
# Display the loaded dictionary of labels per patent id for inspection
id_labels

In [None]:
# Go through the labels of every patent and add their sublabels.
# A comprehension keeps the loop variables local, so the builtin `id` is not
# shadowed in the notebook's global namespace.
id_sublabels = {
    patent_id: get_subcategories(labels)
    for patent_id, labels in id_labels.items()
}

In [None]:
from itertools import chain

# Collect every distinct label that occurs for any patent, in sorted order
unique_labels = sorted(set().union(*id_sublabels.values()))

In [None]:
from sklearn.preprocessing import LabelBinarizer
import numpy as np

# Create a dictionary mapping each patent id to a multi-hot label vector
id_onehot = {}

# NOTE: LabelBinarizer emits a single column when fitted on only two classes,
# so the vectors match `unique_labels` only with more than two unique labels.
lb = LabelBinarizer()
lb.fit(unique_labels)
n_unique_labels = len(unique_labels)

for patent_id, sublabels in id_sublabels.items():
    if sublabels:
        # transform() yields one one-hot row per sublabel; the element-wise
        # maximum over the rows merges them into a single multi-hot vector.
        id_onehot[patent_id] = lb.transform(sublabels).max(axis=0)
    else:
        # Integer zeros keep the dtype consistent with the binarizer output,
        # so one unlabelled patent does not upcast the whole table to float.
        id_onehot[patent_id] = np.zeros(n_unique_labels, dtype=int)

In [None]:
import pandas as pd

# Build a dataframe with one row per patent id and one column per label
label_df = pd.DataFrame(id_onehot).transpose()
label_df.columns = unique_labels
label_df.head()

In [None]:
# Merge the label columns onto the distilled patents and write the result to csv
path = '/content/drive/MyDrive/Masterthesis/data/destilled_patents.csv'
# The index of label_df comes from JSON object keys, which are always strings.
# Read 'id' as str so both merge keys have the same type.
df = pd.read_csv(path, dtype={'id': str})
# Left join: keep every row of df, attaching label columns where the id matches
destilled_and_labels = pd.merge(df, label_df, how='left', left_on='id', right_index=True)
destilled_and_labels.to_csv('/content/drive/MyDrive/Masterthesis/data/destilled_and_labels.csv', index=False)

In [None]:
# Display the merged dataframe for inspection
destilled_and_labels