In [1]:
from __future__ import print_function
import numpy as np
import gzip
import os
import sys
if (sys.version_info > (3, 0)):
    import pickle as pkl
else: #Python 2.7 imports
    import cPickle as pkl

In [2]:
#We download English word embeddings from here https://www.cs.york.ac.uk/nlp/extvec/
embeddingsPath = '/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/embeddings/wiki_extvec.gz'

#Train, Dev, and Test files
folder = '/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/data/'
files = [folder+'train.txt',  folder+'dev.txt', folder+'test.txt']

In [3]:
def createMatrices(sentences, word2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
    
    
    xMatrix = []
    unknownWordCount = 0
    wordCount = 0
    
    for sentence in sentences:
        targetWordIdx = 0
        
        sentenceWordIdx = []
        
        for word in sentence:
            wordCount += 1
            
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
                
            sentenceWordIdx.append(wordIdx)
            
        xMatrix.append(sentenceWordIdx)
       
    
    print("Unknown tokens: %.2f%%" % (unknownWordCount/(float(wordCount))*100))
    return xMatrix

In [4]:
def readFile(filepath):
    sentences = []    
    labels = []
    
    for line in open(filepath):   
        splits = line.split()
        label = int(splits[0])
        words = splits[1:]
        
        labels.append(label)
        sentences.append(words)
        
    print(filepath, len(sentences), "sentences")
    
    return sentences, labels

In [5]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: #
#      Start of the preprocessing
# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: #

outputFilePath = '/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/pkl/data`.pkl.gz'


trainDataset = readFile(files[0])
devDataset = readFile(files[1])
testDataset = readFile(files[2])

/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/data/train.txt 5330 sentences
/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/data/dev.txt 2664 sentences
/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/data/test.txt 2668 sentences


In [6]:
# trainDataset

In [7]:
# :: Compute which words are needed for the train/dev/test set ::
words = {}
for sentences, labels in [trainDataset, devDataset, testDataset]:       
    for sentence in sentences:
        for token in sentence:
            words[token.lower()] = True

In [8]:
# words

In [9]:
# :: Read in word embeddings ::
word2Idx = {}
wordEmbeddings = []

In [10]:
# :: Downloads the embeddings from the York webserver ::
if not os.path.isfile(embeddingsPath):
    basename = os.path.basename(embeddingsPath)
    if basename == 'wiki_extvec.gz':
	       print("Start downloading word embeddings for English using wget ...")
	       #os.system("wget https://www.cs.york.ac.uk/nlp/extvec/"+basename+" -P embeddings/") #Original path from York University
	       os.system("wget https://public.ukp.informatik.tu-darmstadt.de/reimers/2017_english_embeddings/"+basename+" -P embeddings/")
    else:
        print(embeddingsPath, "does not exist. Please provide pre-trained embeddings")
        exit()

In [11]:
# :: Load the pre-trained embeddings file ::
fEmbeddings = gzip.open(embeddingsPath, "r") if embeddingsPath.endswith('.gz') else open(embeddingsPath, encoding="utf8")

In [12]:
# fEmbeddings

In [13]:
print("Load pre-trained embeddings file")
for line in fEmbeddings:
    split = line.decode("utf-8").strip().split(" ")
    word = split[0]
    
    if len(word2Idx) == 0: #Add padding+unknown
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        vector = np.zeros(len(split)-1) #Zero vector vor 'PADDING' word
        wordEmbeddings.append(vector)
        
        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split)-1)
        wordEmbeddings.append(vector)

    if word.lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[word] = len(word2Idx)
       
        
wordEmbeddings = np.array(wordEmbeddings)

print("Embeddings shape: ", wordEmbeddings.shape)
print("Len words: ", len(words))

Load pre-trained embeddings file
Embeddings shape:  (16554, 300)
Len words:  21347


In [14]:
# word2Idx
# trainDataset[1]
# word2Idx['UNKNOWN_TOKEN']

In [15]:

# :: Create matrices ::
train_matrix = createMatrices(trainDataset[0], word2Idx)
dev_matrix = createMatrices(devDataset[0], word2Idx)
test_matrix = createMatrices(testDataset[0], word2Idx)


data = {
    'wordEmbeddings': wordEmbeddings, 'word2Idx': word2Idx,
    'train': {'sentences': train_matrix, 'labels': trainDataset[1]},
    'dev':   {'sentences': dev_matrix, 'labels': devDataset[1]},
    'test':  {'sentences': test_matrix, 'labels': testDataset[1]}
    }

Unknown tokens: 4.86%
Unknown tokens: 4.86%
Unknown tokens: 4.77%


In [25]:
# print(test_matrix)
# testDataset[1]
trainDataset[0]

[['i',
  'like',
  'my',
  'christmas',
  'movies',
  'with',
  'more',
  'elves',
  'and',
  'snow',
  'and',
  'less',
  'pimps',
  'and',
  "ho's",
  '.'],
 ['.',
  '.',
  '.',
  'liotta',
  'is',
  'put',
  'in',
  'an',
  'impossible',
  'spot',
  'because',
  'his',
  "character's",
  'deceptions',
  'ultimately',
  'undo',
  'him',
  'and',
  'the',
  'believability',
  'of',
  'the',
  'entire',
  'scenario',
  '.',
  'too',
  'bad',
  '.'],
 ['what',
  'can',
  'one',
  'say',
  'about',
  'a',
  'balding',
  '50-year-old',
  'actor',
  'playing',
  'an',
  'innocent',
  'boy',
  'carved',
  'from',
  'a',
  'log',
  '?'],
 ['normally',
  ',',
  "rohmer's",
  'talky',
  'films',
  'fascinate',
  'me',
  ',',
  'but',
  'when',
  'he',
  'moves',
  'his',
  'setting',
  'to',
  'the',
  'past',
  ',',
  'and',
  'relies',
  'on',
  'a',
  'historical',
  'text',
  ',',
  'he',
  'loses',
  'the',
  'richness',
  'of',
  'characterization',
  'that',
  'makes',
  'his',
  'films

In [73]:
text = 'I love my job'

In [74]:
splits = text.split()
testwords = splits
testwords
# sentences.append(words)

['I', 'love', 'my', 'job']

In [75]:
def createtestMatrix(sentence, word2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
    
    
    testMatrix = []
    unknownWordCount = 0
    wordCount = 0
    
#     for sentence in sentences:
    targetWordIdx = 0

#     sentenceWordIdx = []

    for word in sentence:
        wordCount += 1

        if word in word2Idx:
            wordIdx = word2Idx[word]
        elif word.lower() in word2Idx:
            wordIdx = word2Idx[word.lower()]
        else:
            wordIdx = unknownIdx
            unknownWordCount += 1

#         sentenceWordIdx.append(wordIdx)

        testMatrix.append(wordIdx)

    
    print("Unknown tokens in test_Text: %.2f%%" % (unknownWordCount/(float(wordCount))*100))

    return testMatrix


In [76]:
finalTest_matrix = createtestMatrix(testwords, word2Idx)

Unknown tokens in test_Text: 0.00%


In [77]:
resultFilePath = '/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/pkl/resultdata`.pkl.gz'

In [78]:
testarray = np.array(finalTest_matrix)
testarray = testarray.reshape(1,len())

ValueError: cannot reshape array of size 4 into shape (1,12)

In [79]:
testarray.tolist()
testarray.shape

(4,)

In [80]:
f = gzip.open(resultFilePath, 'wb')
pkl.dump(testarray, f)
f.close()