# POS and Casing One-Hot Encoding Notebook

This notebook focuses on transforming Part-of-Speech (POS) tags and word casing information into one-hot encoded vectors. The primary objective is to convert POS tags into 17-length one-hot encodings and word casing into 8-length one-hot encodings. This transformation is essential for the deep-learning model.
## Overview
The notebook includes the following key steps:
1. **Loading Data**: Import training data containing POS tags and casing information.
2. **One-Hot Encoding Transformation**: Convert POS tags into 17-length one-hot encoded vectors and casing information into 8-length one-hot encoded vectors.
3. **Handling Sentence Separation**: Ensure that the transformed data maintains the structure of individual sentences, addressing the issue where words were initially concatenated together without sentence separation.

## Problem Addressed
While implementing the one-hot encoding transformations, an issue was encountered where all words were concatenated into a single sequence without maintaining sentence boundaries. To resolve this, additional steps were taken to separate and correctly structure the transformed data by sentences.

## Goal
The main goal of this notebook is to prepare POS and casing information in a format that can be readily used for training deep learning models. By the end of this notebook, you will have lists where POS tags and casing information are represented as one-hot encoded vectors.

In [1]:
import json
import numpy as np

In [2]:
def load_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

training = load_json("full_training_set_CRF_tagged.json")
test = load_json("full_test_set_CRF_tagged.json")

In [3]:
print(training[0]['words'])

[{'informe': {'POS': 'NOUN', 'init_cap': False, 'alphanum': True, 'has_num': False, 'has_cap': False, 'has_dash': False, 'has_us': False, 'punctuation': False, 'suf_n': '', 'pre_n': 'in', 'bigram_before': [], 'bigram_after': ['informe', "d'"], 'before_pos': [], 'check_special_words': False}}, {"d'alta": {'POS': None, 'init_cap': False, 'alphanum': False, 'has_num': False, 'has_cap': False, 'has_dash': False, 'has_us': False, 'punctuation': True, 'suf_n': '', 'pre_n': '', 'bigram_before': [], 'bigram_after': None, 'before_pos': ['VERB'], 'check_special_words': False}}, {"d'hospitalitzacio": {'POS': None, 'init_cap': False, 'alphanum': False, 'has_num': False, 'has_cap': False, 'has_dash': False, 'has_us': False, 'punctuation': True, 'suf_n': '', 'pre_n': '', 'bigram_before': [], 'bigram_after': ["d'hospitalitzacio", 'motiu'], 'before_pos': [], 'check_special_words': False}}, {'motiu': {'POS': 'PROPN', 'init_cap': False, 'alphanum': True, 'has_num': False, 'has_cap': False, 'has_dash': F

In [4]:
pos_tags = set()
for i in range(len(training)):
    training[i]

In [5]:
# Crear un set para almacenar las etiquetas POS
pos_tags = set()

# Iterar sobre cada palabra y su etiqueta POS
for i in range(len(training)):
    for word_info in training[i]['words']:
        for word, attributes in word_info.items():
            pos_tag = attributes.get('POS')
            if pos_tag is not None:  # Verificar que la etiqueta POS no sea None
                pos_tags.add(pos_tag)

# Iterar sobre cada palabra y su etiqueta POS
for i in range(len(test)):
    for word_info in test[i]['words']:
        for word, attributes in word_info.items():
            pos_tag = attributes.get('POS')
            if pos_tag is not None:  # Verificar que la etiqueta POS no sea None
                pos_tags.add(pos_tag)

# Convertir el set a una lista
pos_tags = list(pos_tags)

# Crear un diccionario para mapear cada etiqueta POS a un índice
pos_to_index = {pos: idx for idx, pos in enumerate(pos_tags)}

# Función para convertir etiquetas POS a one-hot
def pos_to_one_hot(pos, pos_to_index):
    one_hot = [0] * len(pos_to_index)
    if pos in pos_to_index:
        one_hot[pos_to_index[pos]] = 1
    return one_hot

# Convertir todas las etiquetas POS en el conjunto de entrenamiento a one-hot
one_hot_encoded_pos_train = []

for i in range(len(training)):
    for word_info in training[i]['words']:
        for word, attributes in word_info.items():
            pos_tag = attributes.get('POS')
            if pos_tag is not None:
                one_hot_vector = pos_to_one_hot(pos_tag, pos_to_index)
                one_hot_encoded_pos_train.append(one_hot_vector)
            else:
                one_hot_encoded_pos_train.append([0] * len(pos_to_index))

# Verificar los vectores one-hot
print(one_hot_encoded_pos_train[0])


# Convertir todas las etiquetas POS en el conjunto de entrenamiento a one-hot
one_hot_encoded_pos_test = []

for i in range(len(test)):
    for word_info in test[i]['words']:
        for word, attributes in word_info.items():
            pos_tag = attributes.get('POS')
            if pos_tag is not None:
                one_hot_vector = pos_to_one_hot(pos_tag, pos_to_index)
                one_hot_encoded_pos_test.append(one_hot_vector)
            else:
                one_hot_encoded_pos_test.append([0]*len(pos_to_index))
                
# Verificar los vectores one-hot
print(one_hot_encoded_pos_test[0])


[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]


In [6]:
print(test[0]['words'])
print(pos_to_index)

[{'informe': {'POS': 'VERB', 'init_cap': False, 'alphanum': True, 'has_num': False, 'has_cap': False, 'has_dash': False, 'has_us': False, 'punctuation': False, 'suf_n': '', 'pre_n': 'in', 'bigram_before': [], 'bigram_after': ['informe', "d'alta"], 'before_pos': [], 'check_special_words': False}}, {"d'alta": {'POS': 'PROPN', 'init_cap': False, 'alphanum': False, 'has_num': False, 'has_cap': False, 'has_dash': False, 'has_us': False, 'punctuation': True, 'suf_n': '', 'pre_n': '', 'bigram_before': [], 'bigram_after': ["d'alta", "d'hospitalitzacio"], 'before_pos': ['VERB'], 'check_special_words': False}}, {"d'hospitalitzacio": {'POS': 'PROPN', 'init_cap': False, 'alphanum': False, 'has_num': False, 'has_cap': False, 'has_dash': False, 'has_us': False, 'punctuation': True, 'suf_n': '', 'pre_n': '', 'bigram_before': [['informe', "d'alta"]], 'bigram_after': ["d'hospitalitzacio", 'motiu'], 'before_pos': ['VERB', 'PROPN'], 'check_special_words': False}}, {'motiu': {'POS': 'PROPN', 'init_cap': F

In [7]:
from nltk import sent_tokenize
from nltk import word_tokenize

def word_extraction(document,n):
    "Receive a document and the number of the document. Then it returns the labels divided in sentences"    
    text = document['text']
    tagged_sentences = []
    tag_index = 0
    
    for sentence in sent_tokenize(text):
        if(any(char.isalpha() for char in sentence)):
            l = []
            for word in word_tokenize(sentence):
                 l.append(word)
                 tag_index += 1
            tagged_sentences.append(l)

    return tagged_sentences

In [8]:
l_words_train = []
for i in range(len(training)):
    l_words_train += word_extraction(training[i],i)

l_words_test = []
for i in range(len(test)):
    l_words_test += word_extraction(test[i],i)


In [9]:
print(l_words_train[0])
print(one_hot_encoded_pos_train[0])
c= 0
print(len(one_hot_encoded_pos_train))
for sentence in l_words_train:
    for word in sentence:
        c+=1
print(c)

['informe', "d'alta", "d'hospitalitzacio", 'motiu', "d'ingres", 'paciente', 'que', 'ingresa', 'de', 'forma', 'programada', 'para', 'realizacion', 'de', 'uretrotomia', 'interna', '.']
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
205114
205114


In [10]:
"""l_words_train = []
one_hot_encoded_pos_train = []
for sentence in training:
    words_in_sentence = []
    one_hot_in_sentence = []
    for word_info in sentence['words']:
        for word, attributes in word_info.items():
            words_in_sentence.append(word)
            pos_tag = attributes.get('POS')
            one_hot_vector = pos_to_one_hot(pos_tag, pos_to_index)
            one_hot_in_sentence.append(one_hot_vector)
    l_words_train.append(words_in_sentence)
    one_hot_encoded_pos_train.append(one_hot_in_sentence)

# Verificar las listas organizadas por oraciones
print(l_words_train[0])
print(one_hot_encoded_pos_train[0])

print(len(l_words_train[0]))
print(len(one_hot_encoded_pos_train[0]))

l_words_test = []
one_hot_encoded_pos_test = []
for sentence in test:
    words_in_sentence = []
    one_hot_in_sentence = []
    for word_info in sentence['words']:
        for word, attributes in word_info.items():
            words_in_sentence.append(word)
            pos_tag = attributes.get('POS')
            one_hot_vector = pos_to_one_hot(pos_tag, pos_to_index)
            one_hot_in_sentence.append(one_hot_vector)
    l_words_train.append(words_in_sentence)
    one_hot_encoded_pos_train.append(one_hot_in_sentence)
"""

"l_words_train = []\none_hot_encoded_pos_train = []\nfor sentence in training:\n    words_in_sentence = []\n    one_hot_in_sentence = []\n    for word_info in sentence['words']:\n        for word, attributes in word_info.items():\n            words_in_sentence.append(word)\n            pos_tag = attributes.get('POS')\n            one_hot_vector = pos_to_one_hot(pos_tag, pos_to_index)\n            one_hot_in_sentence.append(one_hot_vector)\n    l_words_train.append(words_in_sentence)\n    one_hot_encoded_pos_train.append(one_hot_in_sentence)\n\n# Verificar las listas organizadas por oraciones\nprint(l_words_train[0])\nprint(one_hot_encoded_pos_train[0])\n\nprint(len(l_words_train[0]))\nprint(len(one_hot_encoded_pos_train[0]))\n\nl_words_test = []\none_hot_encoded_pos_test = []\nfor sentence in test:\n    words_in_sentence = []\n    one_hot_in_sentence = []\n    for word_info in sentence['words']:\n        for word, attributes in word_info.items():\n            words_in_sentence.append(w

In [11]:
pos_train_separated = []
index = 0

for sentence in l_words_train:
    sentence_length = len(sentence)
    pos_train_separated.append(one_hot_encoded_pos_train[index:index+sentence_length])
    index += sentence_length

print(len(pos_train_separated[44]))
print(len(l_words_train[44]))


pos_test_separated = []
index = 0

for sentence in l_words_test:
    sentence_length = len(sentence)
    pos_test_separated.append(one_hot_encoded_pos_train[index:index+sentence_length])
    index += sentence_length

print(len(pos_test_separated[44]))
print(len(l_words_test[44]))

56
56
8
8


In [12]:
print(len(pos_test_separated))
print(len(pos_train_separated))

print(len(l_words_test))
print(len(l_words_train))



3211
11782
3211
11782


In [13]:
"""def save_json(data, file_path):
    with open(file_path, 'w') as f:
        json.dump(data, f)

save_json(pos_train_separated,'train_pos_onehot.json')
save_json(pos_test_separated,'test_pos_onehot.json')
"""

"def save_json(data, file_path):\n    with open(file_path, 'w') as f:\n        json.dump(data, f)\n\nsave_json(pos_train_separated,'train_pos_onehot.json')\nsave_json(pos_test_separated,'test_pos_onehot.json')\n"

In [14]:
def add_to_res(vector,attribute):
    if attribute == True:
        vector.append(1)
    else:
        vector.append(0)

    return vector

# Convertir todas las etiquetas POS en el conjunto de entrenamiento a one-hot
one_hot_encoded_casing_train = []

for i in range(len(training)):
    for word_info in training[i]['words']:
        for word, attributes in word_info.items():
            vector = []
            vector = add_to_res(vector,attributes.get('init_cap'))
            vector = add_to_res(vector,attributes.get('alpahnum'))
            vector = add_to_res(vector,attributes.get('has_num'))
            vector = add_to_res(vector,attributes.get('has_cap'))
            vector = add_to_res(vector,attributes.get('has_dash'))
            vector = add_to_res(vector,attributes.get('has_us'))
            vector = add_to_res(vector,attributes.get('punctuation'))
            vector = add_to_res(vector,attributes.get('check_special_words'))
            one_hot_encoded_casing_train.append(vector)

one_hot_encoded_casing_test = []

for i in range(len(test)):
    for word_info in test[i]['words']:
        for word, attributes in word_info.items():
            vector = []
            vector = add_to_res(vector,attributes.get('init_cap'))
            vector = add_to_res(vector,attributes.get('alpahnum'))
            vector = add_to_res(vector,attributes.get('has_num'))
            vector = add_to_res(vector,attributes.get('has_cap'))
            vector = add_to_res(vector,attributes.get('has_dash'))
            vector = add_to_res(vector,attributes.get('has_us'))
            vector = add_to_res(vector,attributes.get('punctuation'))
            vector = add_to_res(vector,attributes.get('check_special_words'))
            one_hot_encoded_casing_test.append(vector)


In [15]:
casing_train_separated = []
index = 0

for sentence in l_words_train:
    sentence_length = len(sentence)
    casing_train_separated.append(one_hot_encoded_casing_train[index:index+sentence_length])
    index += sentence_length

print(len(casing_train_separated[44]))
print(len(l_words_train[44]))


casing_test_separated = []
index = 0

for sentence in l_words_test:
    sentence_length = len(sentence)
    casing_test_separated.append(one_hot_encoded_casing_test[index:index+sentence_length])
    index += sentence_length

print(len(casing_test_separated[44]))
print(len(l_words_test[44]))

56
56
8
8


In [16]:
"""def save_json(data, file_path):
    with open(file_path, 'w') as f:
        json.dump(data, f)

save_json(casing_train_separated,'train_casing_onehot.json')
save_json(casing_test_separated,'test_casing_onehot.json')"""


"def save_json(data, file_path):\n    with open(file_path, 'w') as f:\n        json.dump(data, f)\n\nsave_json(casing_train_separated,'train_casing_onehot.json')\nsave_json(casing_test_separated,'test_casing_onehot.json')"