In [None]:
#imports
#make sure to rerun this cell continually 
import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
import torch.nn as nn
import torch.optim as optim
import torch
import torch.nn.functional as F
#example text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Downloading Data


In [None]:
# Load the OneStopEnglish Advanced/Intermediate sentence pairs (Julian).
# ADV-INT.txt is consumed in groups of three lines: the first line of each
# group is stored as the Advanced sentence, the second as the Intermediate
# sentence, and the third line of the group is skipped.
with open('ADV-INT.txt', 'r') as file:
  adv_int_lines = file.readlines()

advanced_lines = adv_int_lines[0::3]
intermediate_lines = adv_int_lines[1::3]
# If the file ends on an Advanced line with no Intermediate partner, drop that
# unpaired line instead of failing with IndexError on the missing line.
onestop_data = {
    'Advanced': advanced_lines[:len(intermediate_lines)],
    'Intermediate': intermediate_lines,
}
onestop_df = pd.DataFrame(onestop_data)
onestop_df

Unnamed: 0,Advanced,Intermediate
0,Brazil and Peru have lodged objections to a bi...,Brazil and Peru have made objections to a bid ...
1,"Until now, the differences between commercial,...","Until now, the differences between commercial,..."
2,But these categories or generic top-level dom...,But these categories or generic top-level dom...
3,"Amazon has applied for dozens of new domains, ...","Amazon has applied for many new domains, inclu..."
4,Allowing private companies to register geograp...,Allowing private companies to register geograp...
...,...,...
2149,Workers on zero-hours contracts are often only...,Workers on zero-hours contracts are often only...
2150,We believe zero-hours contracts are essential ...,We believe zero-hours contracts are essential ...
2151,Our properties have told us its important to b...,Its important to be able to reorganize staff r...
2152,The institutes gures also suggest that 17% of...,Figures from the poll suggest that 17% of empl...


In [None]:
# Load the Wiki-Manual sentence pairs (Karl).
# The dev file (valid.tsv, ~4 MB) is read by default; to use the much larger
# training file (train.tsv, ~114 MB) switch to the commented-out path below.
wiki_path = "valid.tsv"
#wiki_path = "train.tsv"
wiki_df = pd.read_csv(wiki_path, sep="\t", header=0)
wiki_df


Unnamed: 0,Advanced,Intermediate
0,"Adjacent counties are Marin (to the south), Me...","countries next to it are Marin, Mendocino, Lak..."
1,"Adjacent counties are Marin (to the south), Me...","Nearby counties are Marin, Mendocino, Lake, Na..."
2,"Adjacent counties are Marin (to the south), Me...","Adjacent counties are Marin, Mendocino, Lake, ..."
3,"Adjacent counties are Marin (to the south), Me...","Neighboring counties are Marin, Mendocino, Lak..."
4,"Adjacent counties are Marin (to the south), Me...","Adjacent counties are Marin (south), Mendocino..."
...,...,...
19995,Modern African history has been rife with revo...,"Modern African history is full of revolutions,..."
19996,Modern African history has been rife with revo...,Modern African history is full of revolutions ...
19997,Modern African history has been rife with revo...,Common in modern African history are wars and ...
19998,Modern African history has been rife with revo...,"Revolutions and wars, and the growth of econom..."


In [None]:
# Load pretrained word embeddings (GloVe for now; BERT is planned) (Karl).
# Each line of the GloVe file is split on whitespace: the first field is used
# as the dictionary key and the remaining fields become a float32 vector.
embeddings_dict = {}
with open("glove.6B.100d.txt", 'r', encoding="utf-8") as glove_file:
  for entry in glove_file:
    fields = entry.split()
    embeddings_dict[fields[0]] = np.asarray(fields[1:], "float32")

FileNotFoundError: ignored

# Data Wrangling

In [None]:
# English stop words from NLTK, kept in a set for the membership test below.
stop_words = set(stopwords.words('english'))
def rm_stopwords(sentence):
  """Return the words of `sentence` that are not English stop words.

  `sentence` is iterated word by word; a word is kept when its lowercased
  form is absent from the module-level `stop_words` set. The kept words are
  returned in their original order and original casing as a new list.
  """
  return [word for word in sentence if word.lower() not in stop_words]

def split_data(df, split=0.1):
    """Split `df` into disjoint train and dev frames.

    A fraction `split` of the rows is sampled (fixed `random_state=7`, so the
    split is repeatable) to form the dev set; every remaining row goes to the
    train set. Both returned frames are re-indexed from 0.

    Args:
        df: DataFrame to split. Rows are removed from the train set by index
            label, so the index labels are expected to be unique.
        split: fraction of rows to place in the dev set.

    Returns:
        (train_set, dev_set) as two new DataFrames.
    """
    dev_set = df.sample(frac=split, random_state=7)
    # Remove the sampled rows using their ORIGINAL index labels. Resetting the
    # dev index first would make this drop rows 0..k-1 instead of the sampled
    # rows, leaving dev rows inside the training set.
    train_set = df.drop(dev_set.index)
    return train_set.reset_index(drop=True), dev_set.reset_index(drop=True)

# Tokenize the OneStop sentences and reshape them into one labelled sentence
# per row (Julian).
# NOTE: this overwrites the raw string columns of onestop_df with token lists,
# so the cell must run exactly once after onestop_df is loaded.
# Stop-word removal (rm_stopwords) is currently not applied.
for level in ('Advanced', 'Intermediate'):
    onestop_df[level] = onestop_df[level].apply(word_tokenize)

# Build every (tokens, tag) record first and create the frame once, rather
# than creating a two-row DataFrame per sentence pair and concatenating them.
# Row order is unchanged: each pair contributes its Advanced sentence followed
# by its Intermediate sentence.
labelled_sentences = []
for advanced_tokens, intermediate_tokens in zip(onestop_df['Advanced'], onestop_df['Intermediate']):
    labelled_sentences.append([advanced_tokens, 'Advanced'])
    labelled_sentences.append([intermediate_tokens, 'Intermediate'])
training_data = pd.DataFrame(labelled_sentences, columns=['Sentences', 'Tag'])

train, dev = split_data(training_data)
bla = training_data.copy()

In [None]:
# Quick look at the tokenized OneStop data: average sentence length and
# vocabulary size for each level (Julian).
# Assumes the 'Advanced' and 'Intermediate' columns already hold token lists.

# Flatten every sentence of a level into one long token list.
onestop_adv_vocab = [token for tokens in onestop_df['Advanced'] for token in tokens]
onestop_int_vocab = [token for tokens in onestop_df['Intermediate'] for token in tokens]

# Average length = total number of tokens / number of sentence pairs.
n_pairs = len(onestop_df)
onestop_int_avg_len = len(onestop_int_vocab)/n_pairs
print('The average length of an Intermediate sentence is:', onestop_int_avg_len)
onestop_adv_avg_len = len(onestop_adv_vocab)/n_pairs
print('The average length of an Advanced sentence is:', onestop_adv_avg_len)

# The number of distinct keys in the frequency distribution is the vocab size.
onestop_adv_fdist = FreqDist(onestop_adv_vocab)
print('Advanced Sentence Vocab Size:', len(onestop_adv_fdist))
onestop_int_fdist = FreqDist(onestop_int_vocab)
print('Intermediate Sentence Vocab Size:', len(onestop_int_fdist))

The average length of an Intermediate sentence is: 26.402042711234913
The average length of an Advanced sentence is: 28.41643454038997
Advanced Sentence Vocab Size: 10711
Intermediate Sentence Vocab Size: 9293


In [None]:
#glove embeddings config (Karl)
#try both 50 and 100 dimm
#embeddings 
#target_vocab needs to be defined dictionary of the vocab
#target_vocab = onestop_adv_fdist | onestop_int_fdist
def glove_embedding(target_vocab, embeddings_dict, embedding_dimm=None):
  """Build an embedding weight matrix for `target_vocab`.

  Row i of the result holds the vector for the i-th word yielded by
  `target_vocab`. Words present in `embeddings_dict` get their stored vector;
  all other words get a random vector drawn from a normal distribution with
  standard deviation 0.6.

  Args:
    target_vocab: sized iterable of words (e.g. a list or `dict.keys()`); its
      iteration order defines the row order.
    embeddings_dict: mapping from word to a 1-D vector.
    embedding_dimm: width of the vectors. When None (default) the width is
      taken from the first vector in `embeddings_dict`, so 50-d and 100-d
      files both work without editing this function; if `embeddings_dict` is
      empty the previous default of 100 is used.

  Returns:
    numpy array of shape (len(target_vocab), embedding_dimm). The shape is
    also printed.
  """
  if embedding_dimm is None:
    first_vector = next(iter(embeddings_dict.values()), None)
    embedding_dimm = 100 if first_vector is None else len(first_vector)

  weights_matrix = np.zeros((len(target_vocab), embedding_dimm))
  for i, word in enumerate(target_vocab):
    vector = embeddings_dict.get(word)
    if vector is None:
      # Out-of-vocabulary word: fall back to a random vector.
      vector = np.random.normal(scale=0.6, size=(embedding_dimm, ))
    weights_matrix[i] = vector
  print (weights_matrix.shape)
  return weights_matrix

In [None]:
#bert implementation (implement later)

In [None]:
# Data pre-processing for the baseline classifier.
# Stack the two sentence columns of the Wiki frame into one Series of raw
# sentences (all Advanced rows first, then all Intermediate rows) and build
# the matching label list: 1 for Advanced, 0 for Intermediate.
corpus = wiki_df
n_pairs = len(corpus)
x = pd.concat([corpus[level] for level in ('Advanced', 'Intermediate')])
y = [1] * n_pairs + [0] * n_pairs

In [None]:
# data preproccessing functions for lstm classifiers
def get_ix_converter(sentences, w_unk=False):
  """Map every distinct word in `sentences` to a consecutive integer index.

  Indices are assigned in order of first appearance, starting at 0.

  Args:
    sentences: iterable of sentences, each an iterable of hashable words.
    w_unk: when True, reserve index 0 for the '<UNK>' token so the mapping
      can be used with `prepare_sequence(..., w_unk=True)`, which looks up
      `to_ix['<UNK>']`. Defaults to False, which keeps the original mapping.

  Returns:
    dict from word to index.
  """
  word_to_ix = {}
  if w_unk:
    word_to_ix['<UNK>'] = 0
  for sentence in sentences:
    for word in sentence:
      # setdefault only inserts unseen words; len() is the next free index.
      word_to_ix.setdefault(word, len(word_to_ix))
  return word_to_ix

def prepare_sequence(sentence, to_ix, w_unk=False):
  """Convert a sequence of words into a 1-D `torch.long` tensor of indices.

  Args:
    sentence: iterable of words.
    to_ix: dict mapping word -> integer index.
    w_unk: when False (default) every word must be a key of `to_ix`
      (a missing word raises KeyError). When True, words missing from
      `to_ix` are mapped to `to_ix['<UNK>']`, which must therefore exist.

  Returns:
    torch.LongTensor with one index per word.
  """
  if not w_unk:
    return torch.tensor([to_ix[token] for token in sentence], dtype=torch.long)
  indices = [to_ix.get(token, to_ix['<UNK>']) for token in sentence]
  return torch.tensor(indices, dtype=torch.long)

# Classifier

In [None]:
# Baseline non-neural model: TF-IDF features fed to a linear classifier trained
# with SGD (hinge loss). Could later be modified to use word embeddings. (Karl)

# 70/30 split of the raw sentences; x_dt / y_dt are the held-out portion.
x_train, x_dt, y_train, y_dt = train_test_split(x, y, test_size = .3, random_state = 0)
print(type(x_train))

# Fit the vectorizer on the training sentences ONLY, so that no vocabulary or
# IDF statistics leak from the held-out sentences into the features.
# min_df=1 keeps every term (an integer min_df below 1 is rejected by recent
# scikit-learn releases).
tfidf = TfidfVectorizer(min_df = 1)
fitted = tfidf.fit(x_train)
x_train = fitted.transform(x_train)
x_dt = fitted.transform(x_dt)
print(x_train.shape)

# random_state makes the SGD shuffling, and therefore the score, repeatable.
classifier = OneVsRestClassifier(SGDClassifier(loss='hinge', alpha =.0001, random_state = 0))
classifier.fit(x_train, y_train)

y_pred = classifier.predict(x_dt)
print (y_pred)

# Accuracy counts: `right` correct predictions out of `count` held-out rows.
count = len(y_pred)
right = int((y_pred == np.asarray(y_dt)).sum())
print (right)
print (count)

<class 'pandas.core.series.Series'>
(28000, 12482)
[1 1 1 ... 1 0 0]
8708
12000




In [None]:
#tensor conversion functions(work together on this on sunday)

In [None]:
#model function(RNN)(Vishal)

In [None]:
#model function(LSTM)(Julian)
class LSTMBinaryClassifier(nn.Module):
  """Unidirectional LSTM sentence classifier.

  A sentence (1-D tensor of word indices) is embedded, fed through a
  single-direction LSTM one sentence at a time (batch size 1), and the final
  hidden state is projected to `tagset_size` raw scores (no softmax applied).

  The embedding layer here is randomly initialised and trainable:
  `weights_matrix` is only used to determine the embedding width.
  """

  def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, weights_matrix):
    """Create the embedding, LSTM and output layers.

    Args:
      embedding_dim: ignored; the width is taken from `weights_matrix`.
      hidden_dim: size of the LSTM hidden state.
      vocab_size: number of rows in the embedding table.
      tagset_size: number of output classes.
      weights_matrix: 2-D numpy array whose second dimension gives the
        embedding width.
    """
    super().__init__()
    self.hidden_dim = hidden_dim

    # Only the shape of the matrix is read; its values are not loaded.
    _, embedding_dim = torch.from_numpy(weights_matrix).shape

    self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=False)
    self.hidden_to_tag = nn.Linear(hidden_dim, tagset_size)

  def forward(self, sentence):
    """Return raw class scores of shape (1, tagset_size) for one sentence."""
    embedded = self.word_embeddings(sentence)
    # Reshape to (sequence length, batch of 1, embedding width) for the LSTM.
    lstm_input = embedded.view(len(sentence), 1, -1)
    _, (final_hidden, _) = self.lstm(lstm_input)
    # final_hidden[-1] is the hidden state after the last time step.
    return self.hidden_to_tag(final_hidden[-1])

In [None]:
#model function(biLSTM)(Julian)
class BiLSTMBinaryClassifier(nn.Module):
  """Bidirectional LSTM sentence classifier with frozen pretrained embeddings.

  A sentence (1-D tensor of word indices) is embedded with the supplied
  weight matrix, fed through a bidirectional LSTM one sentence at a time
  (batch size 1), and the final hidden states of both directions are
  concatenated and projected to `tagset_size` raw scores (no softmax).
  """

  def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, weights_matrix):
    """Create the embedding, LSTM and output layers.

    Args:
      embedding_dim: ignored; the width is taken from `weights_matrix`.
      hidden_dim: size of the hidden state of each LSTM direction.
      vocab_size: number of rows in the embedding table; the weights are
        loaded into that table, so it has to match `weights_matrix`.
      tagset_size: number of output classes.
      weights_matrix: 2-D numpy array of pretrained embedding vectors.
    """
    super().__init__()
    self.hidden_dim = hidden_dim

    pretrained = torch.from_numpy(weights_matrix)
    _, embedding_dim = pretrained.shape

    # Load the pretrained vectors and freeze them so training leaves them fixed.
    self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
    self.word_embeddings.load_state_dict({'weight': pretrained})
    self.word_embeddings.weight.requires_grad_(False)

    self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
    # Two directions -> the classifier input is twice the hidden size.
    self.hidden_to_tag = nn.Linear(2*hidden_dim, tagset_size)

  def forward(self, sentence):
    """Return raw class scores of shape (1, tagset_size) for one sentence."""
    embedded = self.word_embeddings(sentence)
    # Reshape to (sequence length, batch of 1, embedding width) for the LSTM.
    lstm_input = embedded.view(len(sentence), 1, -1)
    _, (final_hidden, _) = self.lstm(lstm_input)
    # For this single-layer bidirectional LSTM, final_hidden[0] is the final
    # state of the forward direction and final_hidden[-1] that of the reverse
    # direction. They are concatenated reverse-first, then forward.
    reverse_state = final_hidden[-1]
    forward_state = final_hidden[0]
    both_directions = torch.cat((reverse_state, forward_state), 1)
    return self.hidden_to_tag(both_directions)

In [None]:
# Training of models: shared vocabulary, tag mapping and embedding weights.

# Seed the global NumPy and PyTorch RNGs so the random vectors generated for
# out-of-vocabulary words and the model weight initialisation below are the
# same on every run.
np.random.seed(7)
torch.manual_seed(7)

# Word -> index mapping built from every sentence in training_data.
vocab = get_ix_converter(list(training_data['Sentences']))
data = training_data
tags = {'Intermediate': 0, 'Advanced': 1}
embedding_weights = glove_embedding(vocab.keys(), embeddings_dict)

# Take the embedding width from the matrix that was actually built, so it
# cannot drift from the loaded vectors.
EMBEDDING_DIM = embedding_weights.shape[1]
HIDDEN_DIM = 100

def train_lstm(model, loss_function, optimizer, training_data, epochs=10):
  """Train `model` in place, one sentence per optimisation step.

  Args:
    model: callable module; called with each entry of the 'Sentences' column.
    loss_function: called as `loss_function(model_output, target)`.
    optimizer: optimizer whose `step()` is called after every sentence.
    training_data: DataFrame with a 'Sentences' column (model inputs) and a
      'Tag' column (targets), both already converted to tensors.
    epochs: number of full passes over `training_data`.

  Prints the epoch number at the start of every pass. Returns None.
  """
  for epoch in range(epochs):
    print("epoch", epoch)
    for sentence_in, targets in zip(training_data['Sentences'], training_data['Tag']):
      # Clear gradients left over from the previous sentence.
      model.zero_grad()
      loss = loss_function(model(sentence_in), targets)
      loss.backward()
      optimizer.step()

def test_model(model, test_data_ix, test_data, word_to_ix, tag_to_ix):
  total = len(test_data)
  wrong = 0
  tags = list(tag_to_ix.keys())
  with torch.no_grad():
    for index, row in test_data_ix.iterrows():
      sentence_in = row['Sentences']
      #targets = row['IOB Slot tags']

      tag_scores = model(sentence_in)
      #for scores in tag_scores:
      #print(tag_scores)
      scores = list(tag_scores[0])
      max_score = max(scores)
      #print('The maximum score is', max_score)
      tag = scores.index(max_score)
      predicted_tag = tags[tag]
      #print('Predicted tag: ', predicted_tag)
      #print('Actual Tag:', test_data.loc[index]['Tag'])
      #print(test_data.loc[index])
      if predicted_tag != test_data.loc[index]['Tag']:
        wrong += 1
        #print('Predicted Tags:', predicted_tags)
        #print('Actual Tags:', test_data.loc[index]['IOB Slot tags'])
    print('Accuracy: ', (total-wrong)/total)

def run_model(model, input_data_ix, tag_to_ix):
  """Predict a tag for every sentence in `input_data_ix`.

  Args:
    model: callable returning a 2-D score tensor (one row of class scores
      per prediction) for a sentence.
    input_data_ix: DataFrame whose 'Sentences' column holds sentences
      already converted to tensors of word indices.
    tag_to_ix: dict with tag names as keys and class indices as values.

  Returns:
    A list with one entry per sentence; each entry is the list of predicted
    tag names, one per row of the model's score tensor.
  """
  # Invert the mapping explicitly. Relying on the order of tag_to_ix's keys
  # gives wrong tags whenever that order differs from the index values.
  ix_to_tag = {ix: tag for tag, ix in tag_to_ix.items()}
  output = []
  with torch.no_grad():
    for sentence_in in input_data_ix['Sentences']:
      tag_scores = model(sentence_in)
      best_indices = tag_scores.argmax(dim=1).tolist()
      output.append([ix_to_tag[ix] for ix in best_indices])
  return output

# Build both models, the shared loss and one Adam optimizer per model.
lstm = LSTMBinaryClassifier(EMBEDDING_DIM, HIDDEN_DIM, len(vocab), len(tags), embedding_weights)
bilstm = BiLSTMBinaryClassifier(EMBEDDING_DIM, HIDDEN_DIM, len(vocab), len(tags), embedding_weights)
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm.parameters(), lr=0.001)
bi_optimizer = optim.Adam(bilstm.parameters(), lr=0.001)

def _index_columns(frame):
  """Replace tokens and tag names in `frame` with index tensors, in place.

  'Sentences' becomes a tensor of word indices (via `vocab`) and 'Tag' a
  one-element tensor holding the class index (via `tags`). Returns `frame`.
  """
  frame['Sentences'] = frame['Sentences'].apply(lambda tokens: prepare_sequence(tokens, vocab))
  frame['Tag'] = frame['Tag'].apply(lambda tag: prepare_sequence([tag], tags))
  return frame

# NOTE(review): training_data itself is converted in place, so after this
# cell its columns hold tensors instead of tokens and tag names.
_index_columns(training_data)

# train / dev keep their readable form; the *_ix copies hold the tensors.
train_ix = _index_columns(train.copy())
dev_ix = _index_columns(dev.copy())

(10940, 100)


In [None]:
# Train the bidirectional model for 20 epochs. The unidirectional `lstm` can
# be trained the same way by passing `lstm` together with `optimizer`.
train_lstm(model=bilstm, loss_function=loss_function, optimizer=bi_optimizer,
           training_data=train_ix, epochs=20)
print ("training done")

epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9
epoch 10
epoch 11
epoch 12
epoch 13
epoch 14
epoch 15
epoch 16
epoch 17
epoch 18
epoch 19
training done


In [None]:
# Report the accuracy of the trained bidirectional model on the dev split.
# The unidirectional `lstm` can be scored with the same call by swapping the
# model argument.
test_model(model=bilstm, test_data_ix=dev_ix, test_data=dev, word_to_ix=vocab, tag_to_ix=tags)

Accuracy:  0.8654292343387471


# Evaluation

In [None]:
#run model on test data(Vishal)

In [None]:
#accuracy f1 precision recall metrics(Vishal)




# Testing

Add each experiment we want to test in its own code cell below.

In [None]:
#50 d embedding vs 100 d embedding GLOVe vs bert pretrained 

In [None]:
#rnn vs lstm vs bilstm

In [None]:
#different hyperparameter optimization