In [1]:
from __future__ import print_function
import numpy as np
import gzip
import os
import sys
if (sys.version_info > (3, 0)):
    import pickle as pkl
else: #Python 2.7 imports
    import cPickle as pkl

In [2]:
!pip install algorithmia



In [3]:
# Pre-trained English word embeddings, obtained from https://www.cs.york.ac.uk/nlp/extvec/
embeddingsPath = '/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/embeddings/wiki_extvec.gz'

# Directory holding the train / dev / test splits, listed in that order
folder = '/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/data/'
files = [
    os.path.join(folder, 'train.txt'),
    os.path.join(folder, 'dev.txt'),
    os.path.join(folder, 'test.txt'),
]

In [4]:
def createMatrices(sentences, word2Idx):
    """Convert tokenised sentences into lists of embedding indices.

    Each token is looked up in ``word2Idx`` as-is first and then lower-cased;
    tokens found under neither spelling get the index of ``'UNKNOWN_TOKEN'``.
    The share of unknown tokens is printed as a percentage.

    Args:
        sentences: iterable of sentences, each an iterable of token strings.
        word2Idx: mapping from token to integer index; must contain the key
            ``'UNKNOWN_TOKEN'``.

    Returns:
        A list with one list of indices per input sentence.

    Raises:
        KeyError: if ``'UNKNOWN_TOKEN'`` is missing from ``word2Idx``.
    """
    unknownIdx = word2Idx['UNKNOWN_TOKEN']

    xMatrix = []
    unknownWordCount = 0
    wordCount = 0

    for sentence in sentences:
        sentenceWordIdx = []

        for word in sentence:
            wordCount += 1

            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1

            sentenceWordIdx.append(wordIdx)

        xMatrix.append(sentenceWordIdx)

    # With no tokens at all there is nothing to divide by; report 0% instead
    # of raising ZeroDivisionError.
    if wordCount:
        unknownPercent = unknownWordCount/(float(wordCount))*100
    else:
        unknownPercent = 0.0
    print("Unknown tokens: %.2f%%" % unknownPercent)
    return xMatrix

In [5]:
def readFile(filepath):
    """Read a labelled sentence file.

    Every non-blank line is split on whitespace; the first field is parsed as
    an integer label and the remaining fields are the sentence tokens. Blank
    lines are skipped. The path and the number of sentences read are printed.

    Args:
        filepath: path of the text file to read.

    Returns:
        A tuple ``(sentences, labels)`` where ``sentences`` is a list of token
        lists and ``labels`` is the parallel list of integer labels.

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
    """
    sentences = []
    labels = []

    # The context manager closes the handle even if a line fails to parse.
    with open(filepath) as fIn:
        for line in fIn:
            splits = line.split()
            if not splits:
                # Blank line (e.g. a trailing newline at end of file).
                continue

            labels.append(int(splits[0]))
            sentences.append(splits[1:])

    print(filepath, len(sentences), "sentences")

    return sentences, labels

In [6]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: #
#      Start of the preprocessing
# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: #

# NOTE(review): the file name contains a backtick ("data`.pkl.gz") - presumably
# a typo; confirm before changing it. The variable is not used by any cell
# visible in this notebook.
outputFilePath = '/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/pkl/data`.pkl.gz'


# Each dataset is the (sentences, labels) tuple returned by readFile;
# `files` is ordered train, dev, test.
trainDataset = readFile(files[0])
devDataset = readFile(files[1])
testDataset = readFile(files[2])

/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/data/train.txt 5330 sentences
/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/data/dev.txt 2664 sentences
/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/data/test.txt 2668 sentences


In [7]:
# trainDataset

In [8]:
# :: Collect the lower-cased vocabulary used by the train/dev/test sets ::
# `words` serves as a set: keys are the tokens, every value is True.
words = {}
for dataset in (trainDataset, devDataset, testDataset):
    for sentence in dataset[0]:  # index 0 holds the sentences, index 1 the labels
        words.update((token.lower(), True) for token in sentence)

In [9]:
# words

In [10]:
# :: Containers filled while reading the word embeddings ::
# word2Idx maps a token to its row; wordEmbeddings collects the row vectors.
word2Idx, wordEmbeddings = {}, []

In [11]:
# # :: Downloads the embeddings from the York webserver ::
# if not os.path.isfile(embeddingsPath):
#     basename = os.path.basename(embeddingsPath)
#     if basename == 'wiki_extvec.gz':
# 	       print("Start downloading word embeddings for English using wget ...")
# 	       #os.system("wget https://www.cs.york.ac.uk/nlp/extvec/"+basename+" -P embeddings/") #Original path from York University
# 	       os.system("wget https://public.ukp.informatik.tu-darmstadt.de/reimers/2017_english_embeddings/"+basename+" -P embeddings/")
#     else:
#         print(embeddingsPath, "does not exist. Please provide pre-trained embeddings")
#         exit()

In [12]:
# :: Load the pre-trained embeddings file ::
# Both branches open the file in binary mode so that every line is `bytes`,
# which is what the loading loop needs for its line.decode("utf-8") call.
# (A text-mode handle would yield `str`, which has no .decode on Python 3.)
fEmbeddings = gzip.open(embeddingsPath, "rb") if embeddingsPath.endswith('.gz') else open(embeddingsPath, "rb")

In [13]:
# fEmbeddings
# word2Idx["PADDING_TOKEN"] = len(word2Idx)
# word2Idx

# vector = np.zeros(3)
# vector
# wordEmbeddings.append(vector)
# wordEmbeddings

# for line in fEmbeddings:
    
#     split = line.decode("utf-8").strip().split(" ")
# #     break
# print(split)

In [14]:
print("Load pre-trained embeddings file")
for line in fEmbeddings:
    # A gzip/binary handle yields bytes; tolerate a text-mode handle as well.
    if isinstance(line, bytes):
        line = line.decode("utf-8")
    split = line.strip().split(" ")
    word = split[0]

    if len(word2Idx) == 0: #Add padding+unknown
        # The embedding dimension is only known once the first line is read.
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        vector = np.zeros(len(split)-1) #Zero vector for 'PADDING' word
        wordEmbeddings.append(vector)

        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split)-1)
        wordEmbeddings.append(vector)

    # Keep only words that occur in the datasets, and only the first vector
    # seen for each word. Appending a second vector for a repeated word would
    # grow wordEmbeddings without growing word2Idx, shifting every later
    # index onto the wrong row.
    if word.lower() in words and word not in word2Idx:
        vector = np.array([float(num) for num in split[1:]])
        word2Idx[word] = len(wordEmbeddings)  # row this vector is about to occupy
        wordEmbeddings.append(vector)


wordEmbeddings = np.array(wordEmbeddings)

print("Embeddings shape: ", wordEmbeddings.shape)
print("Len words: ", len(words))

Load pre-trained embeddings file
Embeddings shape:  (16554, 300)
Len words:  21347


In [15]:
# wordEmbeddings
# word2Idx
# trainDataset[1]
# word2Idx['UNKNOWN_TOKEN']

In [16]:

# :: Turn the sentences of every split into lists of embedding indices ::
train_matrix, dev_matrix, test_matrix = (
    createMatrices(dataset[0], word2Idx)
    for dataset in (trainDataset, devDataset, testDataset)
)

# Bundle the embeddings, the vocabulary and the three encoded splits.
# Index 1 of each dataset tuple holds the labels.
data = dict(
    wordEmbeddings=wordEmbeddings,
    word2Idx=word2Idx,
    train=dict(sentences=train_matrix, labels=trainDataset[1]),
    dev=dict(sentences=dev_matrix, labels=devDataset[1]),
    test=dict(sentences=test_matrix, labels=testDataset[1]),
)

Unknown tokens: 4.86%
Unknown tokens: 4.86%
Unknown tokens: 4.77%


In [49]:
# print(test_matrix)
# testDataset[1]
# trainDataset[0]

In [17]:
# Example sentence; the following cells tokenise it, map it to embedding
# indices and also pass it to the LDA service.
text = "The greatest pleasure in life is doing what people say you cannot do."

In [18]:
# Whitespace-tokenise the example sentence; both names refer to the same list.
testwords = splits = text.split()
# testwords
# sentences.append(words)

In [19]:
def createtestMatrix(sentence, word2Idx):
    """Convert a single tokenised sentence into a flat list of embedding indices.

    Each token is looked up in ``word2Idx`` as-is first and then lower-cased;
    tokens found under neither spelling get the index of ``'UNKNOWN_TOKEN'``.
    The share of unknown tokens is printed as a percentage.

    Args:
        sentence: iterable of token strings.
        word2Idx: mapping from token to integer index; must contain the key
            ``'UNKNOWN_TOKEN'``.

    Returns:
        A list with one index per token of ``sentence``.

    Raises:
        KeyError: if ``'UNKNOWN_TOKEN'`` is missing from ``word2Idx``.
    """
    unknownIdx = word2Idx['UNKNOWN_TOKEN']

    testMatrix = []
    unknownWordCount = 0
    wordCount = 0

    for word in sentence:
        wordCount += 1

        if word in word2Idx:
            wordIdx = word2Idx[word]
        elif word.lower() in word2Idx:
            wordIdx = word2Idx[word.lower()]
        else:
            wordIdx = unknownIdx
            unknownWordCount += 1

        testMatrix.append(wordIdx)

    # An empty sentence has no tokens to divide by; report 0% instead of
    # raising ZeroDivisionError.
    if wordCount:
        unknownPercent = unknownWordCount/(float(wordCount))*100
    else:
        unknownPercent = 0.0
    print("Unknown tokens in test_Text: %.2f%%" % unknownPercent)

    return testMatrix


In [20]:
# Encode the tokens of the example sentence as embedding indices.
finalTest_matrix = createtestMatrix(testwords, word2Idx)

Unknown tokens in test_Text: 18.10%


In [21]:
# Number of indices in the encoded sentence (one per token), shown as the cell output.
len(finalTest_matrix)

105

In [22]:
# Destination of the gzip-compressed pickle written a few cells below.
# NOTE(review): the file name contains a backtick ("resultdata`.pkl.gz") -
# presumably a typo; confirm before changing it.
resultFilePath = '/home/dl1/Arav/Neuralnets/Session 2 - Sentence CNN/code/pkl/resultdata`.pkl.gz'

In [23]:
# Shape the index list as a single-row 2-D array: (1, number of tokens).
testarray = np.array(finalTest_matrix).reshape(1, len(finalTest_matrix))

In [24]:
# tolist() returns a new list and leaves testarray unchanged; the result is
# discarded here.
testarray.tolist()
# Displayed as the cell output: one row, one column per token.
testarray.shape

(1, 105)

In [25]:
# Write the encoded sentence as a gzip-compressed pickle. The context manager
# closes the file even if pickling fails part-way through.
with gzip.open(resultFilePath, 'wb') as f:
    pkl.dump(testarray, f)

In [26]:
# The LDA request takes a list of documents; here it is just the one sentence.
textLDA = [text]

In [35]:
# Topic extraction with the hosted Algorithmia LDA algorithm (nlp/LDA/1.0.0).

import Algorithmia

# Read the API key from the environment so it is never committed with the
# notebook. SECURITY: a key was previously hard-coded in this cell; treat it
# as leaked and revoke/rotate it.
algorithmiaApiKey = os.environ.get('ALGORITHMIA_API_KEY')
if not algorithmiaApiKey:
    raise RuntimeError(
        "Set the ALGORITHMIA_API_KEY environment variable before running this cell"
    )

# Named ldaInput rather than `input` to avoid shadowing the builtin.
ldaInput = {
  "docsList": textLDA,
  "mode": "quality"
}
client = Algorithmia.client(algorithmiaApiKey)
algo = client.algo('nlp/LDA/1.0.0')
LDAresult = algo.pipe(ldaInput).result
print(LDAresult)

[{'banks': 1, 'capability': 1, 'device': 1, 'educational': 1, 'functioning': 1, 'institutions': 1, 'large': 1, 'works': 1}, {'carry': 1, 'devices': 1, 'handle': 2, 'make': 1, 'printer': 1, 'reliable': 1, 'stored': 1, 'technology': 2}, {'child': 1, 'great': 1, 'invention': 1, 'memory': 1, 'offices': 1}, {'anytime': 1, 'common': 1, 'computer': 2, 'data': 3, 'input': 1, 'keyboard': 1, 'simple': 1, 'store': 2}]


In [31]:
import json

In [36]:
# The captured output of the LDA cell shows LDAresult printed as a list of
# dicts, so the JSON parsing attempt below was left disabled.
# jsontopython = json.load(LDAresult)
# print(jsontopython)
# Print one entry of the result per line.
for item in LDAresult:
    print(item)

{'banks': 1, 'capability': 1, 'device': 1, 'educational': 1, 'functioning': 1, 'institutions': 1, 'large': 1, 'works': 1}
{'carry': 1, 'devices': 1, 'handle': 2, 'make': 1, 'printer': 1, 'reliable': 1, 'stored': 1, 'technology': 2}
{'child': 1, 'great': 1, 'invention': 1, 'memory': 1, 'offices': 1}
{'anytime': 1, 'common': 1, 'computer': 2, 'data': 3, 'input': 1, 'keyboard': 1, 'simple': 1, 'store': 2}
