In [None]:
import codecs
from dataclasses import dataclass
from typing import List, Dict, Any

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import Tensor
from torch import nn
import time

# Fix the seeds of Python's `random` module and of PyTorch so reruns are repeatable.
# NOTE(review): NumPy's global RNG is not seeded in this cell — add a seed if any
# NumPy randomness is introduced later.
import random
random.seed(0)
torch.manual_seed(0)

# from transformers import BertTokenizer, BertModel
# bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# bert_model = BertModel.from_pretrained("bert-base-cased")
import matplotlib.pyplot as plt

# Make the project checkout importable so the `scripts` and `models` packages resolve.
# NOTE(review): the absolute path ties this notebook to one machine layout — consider
# deriving it from a configurable project root.
import sys
sys.path.insert(1, '/work/nlp-project')
# Project helpers: CoNLL reader/writer, evaluation metrics and batching classes.
from scripts.read_write_data import read_processed_data, write_baseline_pred
from scripts.evaluation_functions import f1_score, tag_accuracy
from models.classes import DataIterator, Batch

# import gensim.models
# GoogleEmbs = gensim.models.KeyedVectors.load_word2vec_format(
#                                 '/work/nlp-project/models/GoogleNews-50k.bin', binary=True)

import matplotlib.pyplot as plt

In [None]:
# Training sentences used for the first paraphrasing experiments
# (the "labeled" train split, going by the path).
TRAIN_PATH = "nlp-project/data/processed/train_splits/labeled.conll"

# Read the file once; each item yielded by the reader has four fields, of which
# only the first two (tokens and their labels) are kept.
train_samples = [
    (tokens, tags) for tokens, tags, _, _ in read_processed_data(TRAIN_PATH)
]
x_train = [tokens for tokens, _ in train_samples]
y_train = [tags for _, tags in train_samples]


In [None]:
# Quick sanity check of the load: show the first ten training sentences.
x_train[:10]

In [None]:
# Load the PPDB paraphrase table (2.0, size "s", lexical — going by the file name).
# Later cells rely on its 'word1' and 'word2' columns.
# NOTE(review): with read_csv's default NA handling, literal entries such as "null"
# or "nan" are parsed as missing values; confirm whether that matters for this table.
ppdb_file=pd.read_csv('/work/nlp-project/models/ppdb-2.0-s-lexical.csv')
# Rich display of the raw table.
ppdb_file

In [None]:
# Keep a single row per (word1, word2) pair; the first occurrence wins.
ppdb_file = ppdb_file.drop_duplicates(
    subset=['word1', 'word2'],
    keep='first',  # pandas default, spelled out for the reader
)

In [None]:
# Normalise both word columns (string type, surrounding whitespace removed,
# lower-cased) and only then drop duplicate pairs: pairs that differ solely in
# case or padding become identical after normalisation and would otherwise be
# kept as separate rows.
# `.assign` builds a new frame instead of writing columns into the result of an
# earlier filter, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
ppdb_file = (
    ppdb_file
    .assign(
        word1=ppdb_file['word1'].astype(str).str.strip().str.lower(),
        word2=ppdb_file['word2'].astype(str).str.strip().str.lower(),
    )
    .drop_duplicates(subset=['word1', 'word2'])
)


In [None]:

def generate_paraphrases(sentence, ppdb_file, max_paraphrases=10):
    """Return paraphrases of ``sentence`` built by swapping single PPDB words.

    Each row of ``ppdb_file`` is tried in order. If ``word1`` occurs in the
    sentence as a whole word, every occurrence is replaced by ``word2``;
    otherwise the reverse direction (``word2`` -> ``word1``) is tried.

    Matching is case-sensitive and restricted to whole words, so a short entry
    such as "or" no longer rewrites the inside of "work".

    Args:
        sentence: The sentence to paraphrase, as a plain string.
        ppdb_file: DataFrame with string columns ``'word1'`` and ``'word2'``.
        max_paraphrases: Upper bound on the number of paraphrases returned.

    Returns:
        A list of distinct paraphrased sentences, in the order in which the
        matching PPDB rows were encountered. Candidates identical to the input
        sentence are left out.
    """
    import re  # local import: the notebook's import cell does not provide `re`

    def _swap(src, dst):
        """Replace whole-word occurrences of ``src`` by ``dst``; None if no hit."""
        if not (isinstance(src, str) and isinstance(dst, str)):
            return None  # e.g. NaN cells when the table was not normalised
        # Cheap substring test first so a regex is only built for plausible rows.
        if not src or src not in sentence:
            return None
        pattern = r'(?<!\w)' + re.escape(src) + r'(?!\w)'
        # A callable replacement keeps backslashes in `dst` literal.
        replaced, n_hits = re.subn(pattern, lambda _match: dst, sentence)
        return replaced if n_hits else None

    # A dict is used as an insertion-ordered set so the output is deterministic.
    paraphrases = {}
    for w1, w2 in zip(ppdb_file['word1'], ppdb_file['word2']):
        if len(paraphrases) >= max_paraphrases:
            break
        candidate = _swap(w1, w2)
        if candidate is None:
            candidate = _swap(w2, w1)
        if candidate is not None and candidate != sentence:
            paraphrases[candidate] = None
    return list(paraphrases)


In [None]:
# Smoke test: paraphrase one hand-written sentence with the default cap on results.
sentence = "I work on the project."
paraphrased_sentences = generate_paraphrases(sentence, ppdb_file)
# Print the resulting list of paraphrased sentences.
print(paraphrased_sentences)

In [None]:
# Locations of the processed CoNLL files (relative to the working directory).
TRAIN_PATH = "nlp-project/data/processed/train.conll"
DEV_PATH = "nlp-project/data/processed/dev.conll"
TEST_PATH = "nlp-project/data/processed/test.conll"

# Training set: each item yielded by the reader has four fields, of which only
# the first two (tokens and their labels) are kept.
train_samples = [
    (tokens, tags) for tokens, tags, _, _ in read_processed_data(TRAIN_PATH)
]
x_train = [tokens for tokens, _ in train_samples]
y_train = [tags for _, tags in train_samples]

# Development set, read the same way.
dev_samples = [
    (tokens, tags) for tokens, tags, _, _ in read_processed_data(DEV_PATH)
]
x_dev = [tokens for tokens, _ in dev_samples]
y_dev = [tags for _, tags in dev_samples]

In [None]:
# Preview the first rows of the PPDB table as it stands after the cleaning cells.
ppdb_file.head()

In [None]:
def paraphrase_word(word, ppdb=None):
    """Look up a single-word paraphrase for ``word`` in a PPDB table.

    Rows are scanned in order and the comparison ignores case (``str.casefold``).
    The first row that contains the word in either column decides the result:
    a match on ``word1`` returns ``word2`` and a match on ``word2`` returns
    ``word1``.

    Args:
        word: The token to paraphrase.
        ppdb: Optional DataFrame with ``'word1'`` and ``'word2'`` columns.
            When omitted, the notebook-level ``ppdb_file`` is used, which is
            the behaviour of the original one-argument call.

    Returns:
        The paraphrase exactly as stored in the table, or ``None`` when the
        word is not a string or has no entry.
    """
    if not isinstance(word, str):
        return None
    table = ppdb_file if ppdb is None else ppdb
    # Case-fold the query once instead of once per table row.
    target = word.casefold()
    for w1, w2 in zip(table['word1'], table['word2']):
        # Skip rows with non-string cells (e.g. NaN) rather than crashing.
        if not (isinstance(w1, str) and isinstance(w2, str)):
            continue
        if target == w1.casefold():
            return w2
        if target == w2.casefold():
            return w1
    return None

In [None]:
# def paraphrase_word_options(word):
#     options = []
#     scores = []
#     for w1, w2, score in zip(ppdb_file['word1'], ppdb_file['word2'], ppdb_file['score']):
#         if word.casefold() == w1.casefold():
#             options.append(w2)
#             scores.append(score)
#         elif word.casefold() == w2.casefold():
#             options.append(w1)
#             scores.append(score)

#     return options, scores
        

In [None]:
# Collect every token whose label is the string '1' (the tag this notebook
# treats as "named entity").
NEs = [
    word
    for sentence, labels in zip(x_train, y_train)
    for word, label in zip(sentence, labels)
    if label == '1'
]

# Look up a paraphrase for each collected token; None means "no PPDB entry".
paraphrases = [paraphrase_word(NE) for NE in NEs]

# Count the misses with an explicit identity test. Comparing a NumPy array with
# `== None` is only elementwise when the array happens to have object dtype.
n_missing = sum(p is None for p in paraphrases)
missing_share = n_missing / len(paraphrases) if paraphrases else float('nan')
print(f"Proportion of NEs with no paraphrase: {missing_share}")



Proportion of NEs with no paraphrase: 0.629

In [None]:
# List the named-entity tokens that do have a paraphrase. A missing paraphrase
# is represented by None (the value paraphrase_word returns), so that is the
# value to filter on.
for NE, p in zip(NEs, paraphrases):
    if p is not None:
        print(NE, p)

Building paraphrased sentences:

In [None]:
# Build a paraphrased copy of the training sentences, token by token. Tokens
# without a paraphrase are kept unchanged, so every sentence keeps its length
# and stays aligned with its labels.
# paraphrase_word scans the whole PPDB table on every call, so each distinct
# token is looked up once and the answer is reused.
paraphrase_cache = {}
p_train = []
for sentence in x_train:
    p_sentence = []
    for word in sentence:
        if word not in paraphrase_cache:
            paraphrase_cache[word] = paraphrase_word(word)
        p_word = paraphrase_cache[word]
        p_sentence.append(word if p_word is None else p_word)
    p_train.append(p_sentence)

In [None]:
# Show only the first ten paraphrased sentences; displaying the whole list
# floods the notebook output.
p_train[:10]

In [None]:
# Compare the shapes of the paraphrased sentences and the labels. Sentences
# differ in length, so the arrays are built with dtype=object: without it,
# NumPy >= 1.24 raises ValueError for ragged nested lists.
np.array(p_train, dtype=object).shape, np.array(y_train, dtype=object).shape

In [None]:
# Put paraphrased sentences and labels side by side: the two lists become two
# rows, and transposing gives one row per sentence with columns 0 and 1.
data = pd.DataFrame([p_train, y_train]).T
# write_baseline_pred(data, "/work/nlp-project/data/paraphrased/train_labeled.conll")

In [None]:
# TRAIN_PATH = "nlp-project/data/processed/train.conll"
TRAIN_PATH = "/work/nlp-project/data/processed/train_splits/labeled.conll"
DEV_PATH = "nlp-project/data/processed/dev.conll"


def load_data(path):
    """Read a processed CoNLL file into four parallel lists.

    `load_data` is called below but is neither defined nor imported anywhere in
    this notebook, so it is provided here on top of `read_processed_data`.
    NOTE(review): if the project already ships a `load_data` helper, import it
    in the import cell and drop this definition.

    Args:
        path: Location of the processed CoNLL file.

    Returns:
        A 4-tuple of lists, one list per field yielded by
        `read_processed_data` (words, labels, and the two remaining fields),
        each with one entry per sentence.
    """
    words_all, labels_all, third_all, fourth_all = [], [], [], []
    for words, labels, third, fourth in read_processed_data(path):
        words_all.append(words)
        labels_all.append(labels)
        third_all.append(third)
        fourth_all.append(fourth)
    return words_all, labels_all, third_all, fourth_all


# Loading data
x_train, y_train, _, _ = load_data(TRAIN_PATH)
x_dev, y_dev, _, _ = load_data(DEV_PATH)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>