## Preprocessing the raw data 

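Run this notebook to generate the processed data and write it to the `nlp_project/data/processed/` folder. Each output file is only written if it does not already exist, so delete an existing file first if you want to regenerate it.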

In [None]:
import pandas as pd
import numpy as np
import codecs
import os

import sys
sys.path.append("nlp_project")
from read_write_data import read_raw_data, write_conll, read_processed_data

In [None]:
import gensim.models

# Load pre-trained word2vec vectors stored in the binary word2vec format.
# The resulting object is kept in the global GoogleEmbs and queried per word
# by word_embeddings.
# NOTE(review): the file name suggests a 50k-word subset of the GoogleNews
# vectors — confirm against how the .bin file was produced.
GoogleEmbs = gensim.models.KeyedVectors.load_word2vec_format(
                                'nlp_project/models/GoogleNews-50k.bin', binary=True)

# Simple progress marker, printed once the vectors are loaded.
print('loading finished')

In [None]:
# Folder holding the raw .conll input files (read through into_df).
DATA_PATH = "nlp_project/data/raw/"
# Folder that receives the processed train/dev/test .conll files.
SAVE_PATH = "nlp_project/data/processed/"
MAX_SIZE = 100  # sentences longer than this will be discarded (for padding purposes)

In [None]:
def into_df(filename):
    """Read one raw data file from ``DATA_PATH`` into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the file inside ``DATA_PATH``.

    Returns
    -------
    pandas.DataFrame
        Whatever ``read_raw_data`` yields for that file, wrapped in a
        DataFrame. The rest of this notebook reads column ``0`` as the
        sentences and column ``1`` as the matching tag sequences.
    """
    # os.path.join keeps the path valid whether or not DATA_PATH ends with a
    # separator, instead of relying on plain string concatenation.
    return pd.DataFrame(read_raw_data(os.path.join(DATA_PATH, filename)))

In [None]:
def transform_labels(df):
    """Add a binary ``isNE`` column derived from the tag sequences in column ``1``.

    Every entry of ``df[1]`` is iterated as a sequence of tags. A tag equal
    to the string ``"O"`` becomes ``0``; any other tag becomes ``1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``1`` holds one tag sequence per row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new ``isNE`` column added in place
        (one list of 0/1 flags per row, same length as the tag sequence).
    """
    # Build the whole column in one assignment. The previous chained write
    # (`df['isNE'].iat[i] = ...`) goes to a temporary Series when pandas
    # Copy-on-Write is active, which would silently leave the raw tags in the
    # column. Row labels may repeat here (frames are stacked without
    # re-indexing); that is fine because the mapped Series carries exactly
    # the same index as ``df``.
    df['isNE'] = df[1].map(
        lambda tags: [0 if tag == "O" else 1 for tag in tags]
    )
    return df

Function to add word embeddings to the dataframe:

In [None]:
def word_embeddings(df, embeddings_model=None):
    """Return a copy of ``df`` with a ``google_embeddings`` column.

    For every sentence in column ``0`` a list with one float vector per word
    is built. Words the model does not know (``get_vector`` raises
    ``KeyError``) get an all-zero vector of the model's dimensionality.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``0`` holds one sequence of words per row.
    embeddings_model : optional
        Object exposing ``get_vector(word)`` and ``vector_size``. Defaults to
        the notebook-level ``GoogleEmbs`` model when omitted.

    Returns
    -------
    pandas.DataFrame
        New frame (``df`` itself is not modified) with the extra
        ``google_embeddings`` column.
    """
    model = GoogleEmbs if embeddings_model is None else embeddings_model
    # Take the zero-vector size from the model rather than hard-coding 300,
    # so the fallback always matches the real vectors.
    dim = model.vector_size

    def embed(word):
        """Look up one word, falling back to zeros for unknown words."""
        try:
            return model.get_vector(word).astype(float)
        except KeyError:
            return np.zeros(dim, dtype=float)

    per_sentence = [[embed(word) for word in sentence] for sentence in df[0]]
    return df.assign(google_embeddings=per_sentence)

In [None]:
def restrict_size(df, max_size):
    """Keep only the rows whose column-``0`` entry has at most ``max_size`` items.

    A temporary ``length`` column is attached, used as the filter criterion,
    and removed again before returning. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column ``0`` holds sized objects (one per row).
    max_size : int
        Largest length that is still kept (inclusive).

    Returns
    -------
    pandas.DataFrame
        Filtered frame without the helper ``length`` column.
    """
    sizes = list(map(len, df[0]))
    return (
        df.assign(length=sizes)
        .loc[lambda frame: frame['length'] <= max_size]
        .drop(columns='length')
    )

Training sets:

In [None]:
# Load the per-domain training files and tag every row with its domain.
ans = into_df('en_ewt_nn_train_answers_only.conll').assign(domain='answers')
revs = into_df('en_ewt_nn_train_reviews_only.conll').assign(domain='reviews')
em = into_df('en_ewt_nn_train_email_only.conll').assign(domain='email')
news = into_df('en_ewt_nn_train_newsgroup_only.conll').assign(domain='newsgroup')
web = into_df('en_ewt_nn_train_weblogs_only.conll').assign(domain='weblogs')

# DataFrame.append was removed in pandas 2.0. pd.concat stacks the frames in
# the same order and, like the old call, keeps each frame's own row labels.
total_train = pd.concat([ans, revs, em, news, web])
total_train = transform_labels(total_train)
total_train = restrict_size(total_train, MAX_SIZE)

# Write the file only when it is missing; an existing file is never
# overwritten, so delete it manually to regenerate the training set.
save_path = os.path.join(SAVE_PATH, 'train.conll')
if not os.path.exists(save_path):
    write_conll(total_train, save_path)

Dev sets:

In [None]:
# Load the per-domain development files and tag every row with its domain.
ans = into_df('en_ewt_nn_answers_dev.conll').assign(domain='answers')
revs = into_df('en_ewt_nn_reviews_dev.conll').assign(domain='reviews')
em = into_df('en_ewt_nn_email_dev.conll').assign(domain='email')
news = into_df('en_ewt_nn_newsgroup_dev.conll').assign(domain='newsgroup')
web = into_df('en_ewt_nn_weblogs_dev.conll').assign(domain='weblogs')

# DataFrame.append was removed in pandas 2.0. pd.concat stacks the frames in
# the same order and, like the old call, keeps each frame's own row labels.
total_dev = pd.concat([ans, revs, em, news, web])
total_dev = transform_labels(total_dev)
total_dev = restrict_size(total_dev, MAX_SIZE)

# Write the file only when it is missing; an existing file is never
# overwritten, so delete it manually to regenerate the development set.
save_path = os.path.join(SAVE_PATH, 'dev.conll')
if not os.path.exists(save_path):
    write_conll(total_dev, save_path)

Test sets:

In [None]:
# Load the per-domain test files and tag every row with its domain.
ans = into_df('en_ewt_nn_answers_test.conll').assign(domain='answers')
revs = into_df('en_ewt_nn_reviews_test.conll').assign(domain='reviews')
em = into_df('en_ewt_nn_email_test.conll').assign(domain='email')
news = into_df('en_ewt_nn_newsgroup_test.conll').assign(domain='newsgroup')
web = into_df('en_ewt_nn_weblogs_test.conll').assign(domain='weblogs')

# DataFrame.append was removed in pandas 2.0. pd.concat stacks the frames in
# the same order and, like the old call, keeps each frame's own row labels.
total_test = pd.concat([ans, revs, em, news, web])
total_test = transform_labels(total_test)
total_test = restrict_size(total_test, MAX_SIZE)

# Write the file only when it is missing; an existing file is never
# overwritten, so delete it manually to regenerate the test set.
save_path = os.path.join(SAVE_PATH, 'test.conll')
if not os.path.exists(save_path):
    write_conll(total_test, save_path)

In [None]:
# Sanity check: count the items read back from the processed training file.
sum(
    1
    for _ in read_processed_data(os.path.join(SAVE_PATH, 'train.conll'))
)

Checking lengths to make sure the data was loaded correctly:

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>