In [1]:
from __future__ import print_function
import numpy as np
import gzip
import os
import sys
if (sys.version_info > (3, 0)):
    import pickle as pkl
else: #Python 2.7 imports
    import cPickle as pkl

In [2]:
#We download English word embeddings from here https://www.cs.york.ac.uk/nlp/extvec/
embeddingsPath = '/home/dl1/Arav/Sentence_classic/code/embeddings/wiki_extvec.gz'

#Train, Dev, and Test files
folder = '/home/dl1/Arav/Sentence_classic/code/data/'
files = [folder+'train.txt',  folder+'dev.txt', folder+'test.txt']

In [3]:
def createMatrices(sentences, word2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
    
    
    xMatrix = []
    unknownWordCount = 0
    wordCount = 0
    
    for sentence in sentences:
        targetWordIdx = 0
        
        sentenceWordIdx = []
        
        for word in sentence:
            wordCount += 1
            
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
                
            sentenceWordIdx.append(wordIdx)
            
        xMatrix.append(sentenceWordIdx)
       
    
    print("Unknown tokens: %.2f%%" % (unknownWordCount/(float(wordCount))*100))
    return xMatrix

In [4]:
def readFile(filepath):
    sentences = []    
    labels = []
    
    for line in open(filepath):   
        splits = line.split()
        label = int(splits[0])
        words = splits[1:]
        
        labels.append(label)
        sentences.append(words)
        
    print(filepath, len(sentences), "sentences")
    
    return sentences, labels

In [5]:
# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: #
#      Start of the preprocessing
# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: #

outputFilePath = '/home/dl1/Arav/Sentence_classic/code/pkl/data.pkl.gz'


trainDataset = readFile(files[0])
devDataset = readFile(files[1])
testDataset = readFile(files[2])

/home/dl1/Arav/Sentence_classic/code/data/train.txt 5330 sentences
/home/dl1/Arav/Sentence_classic/code/data/dev.txt 2664 sentences
/home/dl1/Arav/Sentence_classic/code/data/test.txt 2668 sentences


In [11]:
# trainDataset

In [6]:
# :: Compute which words are needed for the train/dev/test set ::
words = {}
for sentences, labels in [trainDataset, devDataset, testDataset]:       
    for sentence in sentences:
        for token in sentence:
            words[token.lower()] = True

In [7]:
# words

In [8]:
# :: Read in word embeddings ::
word2Idx = {}
wordEmbeddings = []

In [9]:
# :: Downloads the embeddings from the York webserver ::
if not os.path.isfile(embeddingsPath):
    basename = os.path.basename(embeddingsPath)
    if basename == 'wiki_extvec.gz':
	       print("Start downloading word embeddings for English using wget ...")
	       #os.system("wget https://www.cs.york.ac.uk/nlp/extvec/"+basename+" -P embeddings/") #Original path from York University
	       os.system("wget https://public.ukp.informatik.tu-darmstadt.de/reimers/2017_english_embeddings/"+basename+" -P embeddings/")
    else:
        print(embeddingsPath, "does not exist. Please provide pre-trained embeddings")
        exit()

In [10]:
# :: Load the pre-trained embeddings file ::
fEmbeddings = gzip.open(embeddingsPath, "r") if embeddingsPath.endswith('.gz') else open(embeddingsPath, encoding="utf8")

In [11]:
print("Load pre-trained embeddings file")
for line in fEmbeddings:
    split = line.decode("utf-8").strip().split(" ")
    word = split[0]
    
    if len(word2Idx) == 0: #Add padding+unknown
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        vector = np.zeros(len(split)-1) #Zero vector for 'PADDING' word
        wordEmbeddings.append(vector)
        
        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split)-1)
        wordEmbeddings.append(vector)

    if word.lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[word] = len(word2Idx)
       
        
wordEmbeddings = np.array(wordEmbeddings)

print("Embeddings shape: ", wordEmbeddings.shape)
print("Len words: ", len(words))

Load pre-trained embeddings file
Embeddings shape:  (16554, 300)
Len words:  21347


In [12]:
word2Idx_path = open("/home/dl1/Arav/Sentence_classic/code/pkl/word2Idx.pickle","wb")
pkl.dump(word2Idx,word2Idx_path)
word2Idx_path.close()

In [79]:
# wordEmbeddings
# word2Idx
# trainDataset[1]
# word2Idx['UNKNOWN_TOKEN']

In [80]:

# # :: Create matrices ::
# train_matrix = createMatrices(trainDataset[0], word2Idx)
# dev_matrix = createMatrices(devDataset[0], word2Idx)
# test_matrix = createMatrices(testDataset[0], word2Idx)


# data = {
#     'wordEmbeddings': wordEmbeddings, 'word2Idx': word2Idx,
#     'train': {'sentences': train_matrix, 'labels': trainDataset[1]},
#     'dev':   {'sentences': dev_matrix, 'labels': devDataset[1]},
#     'test':  {'sentences': test_matrix, 'labels': testDataset[1]}
#     }

In [81]:
###check
# print(test_matrix)
# testDataset[1]
# trainDataset[0]

In [37]:
# text = "The greatest pleasure in life is doing what people say you cannot do."
# text = "She was dying to say something sarcastic to him, but bit her tongue and stayed silent"
# text = "I work 40 hours a week to be this poor"
# text = "I’m busy now. Can I ignore you some other time?"
# text = "I’m glad to see you’re not letting your education get in the way of your ignorance."
# text = "I just won the lottery. The worst part is that I can’t tell my family and friends because if I did, they’d all want some of the money."
# text = "I just won the lottery and I am happy about it"

In [62]:
# text = "I’d rather not own a highly successful restaurant. Sure, you could be famous and make a ton of money, but just think of those long hours and dealing with the public all the time."
# text = "You may say you want a cool sporty car, but I can’t imagine paying that car insurance and getting pulled over by cops all the time."
# text = "Dogs could be great companions and really brighten things up if they didn’t make such a mess and all that noise."
# text = "My job pays really well and fast tracks its employees into higher positions since they’re growing so quickly, but sitting in a grey, boring cubicle listening to sad FM radio all afternoon from the cubicle next door is just too much. "
# text = "After getting fired, Johnny and his wife celebrated for the job he would soon have."
# text = "Work was cancelled due to inclement weather. An optimistic person enjoyed the time off."
# text = "I love my job"

In [2]:
# text = "Whose car do you think is faster, yours or mine? I have no idea. Well, okay, what kind of car do you drive? A 1988 Honda Accord. That car is a big piece of shit! That is what I can afford. So, it is still a big piece of shit. Well, it works. Works like a big piece of shit works. What does that even mean?"

Speech Recognition -> Text -> Speech

In [119]:
# !pip install gTTS
# !pip install scipy

[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [123]:
#for getting a audio source file for further testing the speech to text using pyttsx: 
from gtts import gTTS
import wave
# import scipy.io.wavfile
blabla = ("well it's about time you got here")
tts = gTTS(text=blabla, lang='en')
# data2 = np.asarray(tts, dtype=np.int16)
tts.save("/home/dl1/Arav/Sentence_classic/code/newtest.wav")
# wave.open("/home/dl1/Arav/Sentence_classic/code/nwtest.wav",mode='wb')
# scipy.io.wavfile.write("/home/dl1/Arav/Sentence_classic/code/newtest.wav",48000,data2)

In [13]:
#speech to text -> input text:

import speech_recognition
import pyttsx3

speech_engine = pyttsx3.init('espeak') # see http://pyttsx.readthedocs.org/en/latest/engine.html#pyttsx.init
print(speech_engine.getProperty('rate'))
speech_engine.setProperty('rate', 150)

200


In [14]:
def speak(text):
    speech_engine.say(text)
    speech_engine.runAndWait()

recognizer = speech_recognition.Recognizer()

def listen():
#     with speech_recognition.AudioFile("/home/dl1/Arav/Sentence_classic/code/newtest.wav") as source:
    with speech_recognition.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source, duration=1)
        print(recognizer.energy_threshold)
        print("Chunking rate:", source.CHUNK)
        print("format rate:",source.format)
#         recognizer.energy_threshold +=80
#         print(recognizer.energy_threshold)

        audio = recognizer.listen(source)
        
#         audio = recognizer.record(source)

    try:
        text = recognizer.recognize_google(audio, language='en-GB')
        print(text)
        return text
    
#         or:return recognizer.recognize_sphinx(audio)
        
    except speech_recognition.UnknownValueError:
        print("Could not understand audio")
#         reply = "sorry, I could not understand that"
#         return reply
    except speech_recognition.RequestError as e:
        print("Recog Error; {0}".format(e))

    return ""

In [83]:
# speech_recognition.AudioFile
# recognizer.

In [15]:
speak("Say something!")

In [16]:
resultAudio = listen()
speak("I heard you say " + resultAudio)

86.59210593546467
Chunking rate: 1024
format rate: 8
hello hello hello hello hello


In [20]:
testwords = resultAudio.split()
testwords
# sentences.append(words)

['hello', 'hello', 'hello', 'hello', 'hello']

In [18]:
def createtestMatrix(sentence, word2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
    
    
    testMatrix = []
    unknownWordCount = 0
    wordCount = 0
    
#     for sentence in sentences:
    targetWordIdx = 0

#     sentenceWordIdx = []

    for word in sentence:
        wordCount += 1

        if word in word2Idx:
            wordIdx = word2Idx[word]
        elif word.lower() in word2Idx:
            wordIdx = word2Idx[word.lower()]
        else:
            wordIdx = unknownIdx
            unknownWordCount += 1

#         sentenceWordIdx.append(wordIdx)

        testMatrix.append(wordIdx)

    
    print("Unknown tokens in test_Text: %.2f%%" % (unknownWordCount/(float(wordCount))*100))

    return testMatrix


In [21]:
word2Idx_path = open("/home/dl1/Arav/Sentence_classic/code/pkl/word2Idx.pickle", "rb")
word2Idx = pkl.load(word2Idx_path)

In [22]:
finalTest_matrix = createtestMatrix(testwords, word2Idx)

Unknown tokens in test_Text: 100.00%


In [23]:
len(finalTest_matrix)

5

In [24]:
resultFilePath = '/home/dl1/Arav/Sentence_classic/code/pkl/resultdata.pkl.gz'

In [25]:
testarray = np.array(finalTest_matrix)
testarray = testarray.reshape(1,len(finalTest_matrix))

In [26]:
testarray.tolist()
testarray.shape

(1, 5)

In [27]:
f = gzip.open(resultFilePath, 'wb')
pkl.dump(testarray, f)
f.close()

Deep Learning model prediction on sentiment:

In [28]:
from keras.models import load_model
from keras.preprocessing import sequence

loadedModel = load_model("/home/dl1/Arav/Sentence_classic/models/model1.h5")

result_data = pkl.load(gzip.open("/home/dl1/Arav/Sentence_classic/code/pkl/resultdata.pkl.gz","rb"))
print("data loaded!")

result_data.tolist()


data = pkl.load(gzip.open("/home/dl1/Arav/Sentence_classic/code/pkl/data.pkl.gz","rb"))
print("data loaded!")

train_labels = data['train']['labels']
train_sentences = data['train']['sentences']

dev_labels = data['dev']['labels']
dev_sentences = data['dev']['sentences']

test_labels = data['test']['labels']
test_sentences = data['test']['sentences']

word_embeddings = data['wordEmbeddings']


# :: Find the longest sentence in our dataset ::
max_sentence_len = 0
for sentence in train_sentences + dev_sentences + test_sentences:
    max_sentence_len = max(len(sentence), max_sentence_len)

print("Longest sentence: %d" % max_sentence_len)

batch_size = 50
result_X = sequence.pad_sequences(result_data, maxlen=max_sentence_len)

result_y_pred = loadedModel.predict(result_X, batch_size=batch_size)

senti = result_y_pred.ravel()

senti

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


data loaded!
data loaded!
Longest sentence: 59


array([0.2765223], dtype=float32)

Deep learning model's prediction (speech):

In [29]:
if(senti[0]>=0.65):
    speak("The User is positive")
    print("The User is positive")
elif(senti[0]>0.35 and senti[0]<0.65):
    speak("The user is Neutral")
    print("The user is Neutral")
else:
    speak("The user is negative")
    print("The user is negative")

The user is negative


Pre-existing LDA Service (Algorithmia) Usage  starts from here:

In [1]:

textLDA=[]
textLDA.append(text)
textLDA

NameError: name 'text' is not defined

In [43]:
#usage of the LDA - Algorithmia

# !sudo pip install algorithmia
import Algorithmia
from nltk.corpus import stopwords

In [44]:
stop_words = set(stopwords.words("english"))
stopwords = [w for w in stop_words]
stopwords

['doing',
 'too',
 'more',
 "doesn't",
 'aren',
 "mightn't",
 'can',
 't',
 'didn',
 "she's",
 'an',
 "that'll",
 'hers',
 'than',
 'my',
 'further',
 'his',
 'she',
 'only',
 'yourselves',
 'does',
 'our',
 'it',
 "couldn't",
 'down',
 "you'd",
 'for',
 'a',
 'and',
 'after',
 'm',
 'your',
 "shouldn't",
 'there',
 "you've",
 'until',
 'which',
 'these',
 "it's",
 'were',
 'the',
 'own',
 'while',
 'wouldn',
 'do',
 'most',
 'very',
 'ma',
 'mustn',
 'at',
 'isn',
 'we',
 'you',
 'what',
 'should',
 're',
 'ours',
 'no',
 'under',
 'from',
 'wasn',
 'in',
 "aren't",
 'over',
 'all',
 "shan't",
 'nor',
 'itself',
 'with',
 'hadn',
 'yours',
 'just',
 'them',
 'will',
 'both',
 'their',
 'y',
 'doesn',
 'o',
 'themselves',
 'during',
 'ain',
 "didn't",
 'not',
 'by',
 'when',
 's',
 'he',
 'out',
 "wouldn't",
 'few',
 'had',
 'don',
 'be',
 'into',
 'shan',
 'each',
 'herself',
 'has',
 'theirs',
 'i',
 'about',
 'to',
 'some',
 'that',
 'whom',
 'of',
 'below',
 'other',
 "should've",


In [45]:
input = {
    "docsList": textLDA,
#     "mode": "quality",
    "customSettings":{
        "numTopics":2,
        "numIterations":50,
        "numWords":4
    },
    "stopWordsList":stopwords
}
client = Algorithmia.client('sim+KZtb16R1rtOXC0dk9Y4sqEb1')
algo = client.algo('nlp/LDA/1.0.0')
LDAresult = algo.pipe(input).result
print(LDAresult)

[{'afford': 1, 'honda': 1, 'piece': 3, 'works': 3}, {'accord': 1, 'big': 3, 'car': 3, 'shit': 3}]


In [46]:
import json

In [47]:
# jsontopython = json.load(LDAresult)
# print(jsontopython)
for item in LDAresult:
    print(item)

{'afford': 1, 'honda': 1, 'piece': 3, 'works': 3}
{'accord': 1, 'big': 3, 'car': 3, 'shit': 3}


Usage of the pre-existing sentiment analysis services from Algorithmia:

In [48]:
#Algorithmia sentiment module: (positive,negative,neutral/compound)
# import Algorithmia

input = {
  "sentence": "The greatest pleasure in life is doing what people say you cannot do."
}
client = Algorithmia.client('sim+KZtb16R1rtOXC0dk9Y4sqEb1')
algo = client.algo('nlp/SocialSentimentAnalysis/0.1.4')
print(algo.pipe(input).result)

[{'compound': 0.836, 'negative': 0, 'neutral': 0.582, 'positive': 0.418, 'sentence': 'The greatest pleasure in life is doing what people say you cannot do.'}]


In [49]:
#Algorithmia sentiment module: (positive,negative) - Sentiment value between -1 and 1 (very negative to very positive)
# import Algorithmia

input = {
  "document": "The greatest pleasure in life is doing what people say you cannot do."
}
client = Algorithmia.client('sim+KZtb16R1rtOXC0dk9Y4sqEb1')
algo = client.algo('nlp/SentimentAnalysis/1.0.4')
print(algo.pipe(input).result)

[{'document': 'The greatest pleasure in life is doing what people say you cannot do.', 'sentiment': 0.836}]


In [82]:
import speech_recognition as sr
for index, name in enumerate(sr.Microphone.list_microphone_names()):
    print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name))

Microphone with name "HDA Intel PCH: ALC1220 Analog (hw:0,0)" found for `Microphone(device_index=0)`
Microphone with name "HDA Intel PCH: ALC1220 Alt Analog (hw:0,2)" found for `Microphone(device_index=1)`
Microphone with name "HDA NVidia: HDMI 0 (hw:1,3)" found for `Microphone(device_index=2)`
Microphone with name "HDA NVidia: HDMI 1 (hw:1,7)" found for `Microphone(device_index=3)`
Microphone with name "HDA NVidia: HDMI 2 (hw:1,8)" found for `Microphone(device_index=4)`
Microphone with name "HDA NVidia: HDMI 3 (hw:1,9)" found for `Microphone(device_index=5)`
Microphone with name "HDA NVidia: HDMI 0 (hw:2,3)" found for `Microphone(device_index=6)`
Microphone with name "HDA NVidia: HDMI 1 (hw:2,7)" found for `Microphone(device_index=7)`
Microphone with name "HDA NVidia: HDMI 2 (hw:2,8)" found for `Microphone(device_index=8)`
Microphone with name "HDA NVidia: HDMI 3 (hw:2,9)" found for `Microphone(device_index=9)`
Microphone with name "HDA NVidia: HDMI 0 (hw:3,3)" found for `Microphone(d