In [1]:
from __future__ import print_function
import numpy as np
import gzip
import os
import sys
if (sys.version_info > (3, 0)):
    import pickle as pkl
else: #Python 2.7 imports
    import cPickle as pkl
    

#We download English word embeddings from here https://www.cs.york.ac.uk/nlp/extvec/
embeddingsPath = 'C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/embeddings/wiki_extvec.gz'

#Train, Dev, and Test files
folder = 'C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/'
files = [folder+'mailtrain.txt',  folder+'maildev.txt', folder+'mailtest.txt']


def createMatrices(sentences, word2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    
    
    
    xMatrix = []
    unknownWordCount = 0
    wordCount = 0
    
    for sentence in sentences:
        targetWordIdx = 0
        
        sentenceWordIdx = []
        
        for word in sentence:
            wordCount += 1
            
            if word in word2Idx:
                wordIdx = word2Idx[word]
            elif word.lower() in word2Idx:
                wordIdx = word2Idx[word.lower()]
            else:
                wordIdx = unknownIdx
                unknownWordCount += 1
                
            sentenceWordIdx.append(wordIdx)
            
        xMatrix.append(sentenceWordIdx)
       
    
    print("Unknown tokens: %.2f%%" % (unknownWordCount/(float(wordCount))*100))
    return xMatrix

def readFile(filepath):
    sentences = []    
    labels = []
    
    for line in open(filepath, encoding = "utf-8"):   
        splits = line.split()
        label = int(splits[0])
        words = splits[1:]
        
        labels.append(label)
        sentences.append(words)
        
    print(filepath, len(sentences), "sentences")
    
    return sentences, labels


# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: #
#      Start of the preprocessing
# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: #

outputFilePath = 'C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/pkl/maildata.pkl.gz'


trainDataset = readFile(files[0])
devDataset = readFile(files[1])
testDataset = readFile(files[2])


# :: Compute which words are needed for the train/dev/test set ::
words = {}
for sentences, labels in [trainDataset, devDataset, testDataset]:       
    for sentence in sentences:
        for token in sentence:
            words[token.lower()] = True

# :: Read in word embeddings ::
word2Idx = {}
wordEmbeddings = []

# :: Downloads the embeddings from the York webserver ::
if not os.path.isfile(embeddingsPath):
    basename = os.path.basename(embeddingsPath)
    if basename == 'wiki_extvec.gz':
	       print("Start downloading word embeddings for English using wget ...")
	       #os.system("wget https://www.cs.york.ac.uk/nlp/extvec/"+basename+" -P embeddings/") #Original path from York University
	       os.system("wget https://public.ukp.informatik.tu-darmstadt.de/reimers/2017_english_embeddings/"+basename+" -P embeddings/")
    else:
        print(embeddingsPath, "does not exist. Please provide pre-trained embeddings")
        exit()
        

# :: Load the pre-trained embeddings file ::
fEmbeddings = gzip.open(embeddingsPath, "r") if embeddingsPath.endswith('.gz') else open(embeddingsPath, encoding="utf8")


print("Load pre-trained embeddings file")
for line in fEmbeddings:
    split = line.decode("utf-8").strip().split(" ")
    word = split[0]
    
    if len(word2Idx) == 0: #Add padding+unknown
        word2Idx["PADDING_TOKEN"] = len(word2Idx)
        vector = np.zeros(len(split)-1) #Zero vector for 'PADDING' word
        wordEmbeddings.append(vector)
        
        word2Idx["UNKNOWN_TOKEN"] = len(word2Idx)
        vector = np.random.uniform(-0.25, 0.25, len(split)-1)
        wordEmbeddings.append(vector)

    if word.lower() in words:
        vector = np.array([float(num) for num in split[1:]])
        wordEmbeddings.append(vector)
        word2Idx[word] = len(word2Idx)
       
        
wordEmbeddings = np.array(wordEmbeddings)

print("Embeddings shape: ", wordEmbeddings.shape)
print("Len words: ", len(words))


word2Idx_path = open("C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/pkl/word2Idx.pickle","wb")
pkl.dump(word2Idx,word2Idx_path)
word2Idx_path.close()


C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/mailtrain.txt 8094 sentences
C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/maildev.txt 4047 sentences
C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/mailtest.txt 4048 sentences
Load pre-trained embeddings file
Embeddings shape:  (7290, 300)
Len words:  11720


In [22]:
#---------------------------------speech to text -> input text:

# import speech_recognition
# import pyttsx3

# speech_engine = pyttsx3.init('espeak') # see http://pyttsx.readthedocs.org/en/latest/engine.html#pyttsx.init
# print("Actual speech rate is: ",speech_engine.getProperty('rate'))
# speech_engine.setProperty('rate', 125)

# def speak(text):
#     speech_engine.say(text)
#     speech_engine.runAndWait()

# recognizer = speech_recognition.Recognizer()

# print("Energy threshold - before: ", recognizer.energy_threshold)



# def listen():
# #     with speech_recognition.AudioFile("/home/dl1/Arav/Sentence_classic/code/newtest.wav") as source:
#     with speech_recognition.Microphone() as source:
#         recognizer.adjust_for_ambient_noise(source, duration=1)
#         recognizer.dynamic_energy_threshold = False
# #         print("Chunking rate:", source.CHUNK)
# #         print("format rate:", source.format)
#         recognizer.energy_threshold =400
#         print("Energy threshold - after: ", recognizer.energy_threshold)

#         audio = recognizer.listen(source)
        
# #         audio = recognizer.record(source)

#     try:
#         text = recognizer.recognize_google(audio, language='en-GB')
#         print(text)
#         return text
    
# #         or:return recognizer.recognize_sphinx(audio)
        
#     except speech_recognition.UnknownValueError:
#         print("Could not understand audio")
# #         reply = "sorry, I could not understand that"
# #         return reply
#     except speech_recognition.RequestError as e:
#         print("Recog Error; {0}".format(e))

#     return ""

Actual speech rate is:  125
Energy threshold - before:  300


In [None]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"]="2,3"

# from tensorflow.python.client import device_lib
# print (device_lib.list_local_devices())


# import nltk
# import gensim
# from nltk.tokenize import RegexpTokenizer
# from stop_words import get_stop_words
# from nltk.stem.porter import PorterStemmer
# from gensim import corpora, models

# tokenizer = RegexpTokenizer(r'\w+')

# # create English stop words list
# en_stop = get_stop_words('en')

# # Create p_stemmer of class PorterStemmer
# p_stemmer = PorterStemmer()


# #Loop should start-----------------------------------------------------------------------------

# while True:
#     speak("Say something!")
#     resultAudio = listen()
    
#     if resultAudio!="":
#         speak("I heard you say " + resultAudio)

#     ####################### Preprocessing for sentiment analysis starts here---------------------------------------    

#         testwords = resultAudio.split()


#         def createtestMatrix(sentence, word2Idx):
#             unknownIdx = word2Idx['UNKNOWN_TOKEN']
#             paddingIdx = word2Idx['PADDING_TOKEN']    


#             testMatrix = []
#             unknownWordCount = 0
#             wordCount = 0

#             targetWordIdx = 0

#             for word in sentence:
#                 wordCount += 1

#                 if word in word2Idx:
#                     wordIdx = word2Idx[word]
#                 elif word.lower() in word2Idx:
#                     wordIdx = word2Idx[word.lower()]
#                 else:
#                     wordIdx = unknownIdx
#                     unknownWordCount += 1
#                 testMatrix.append(wordIdx)
#             print("Unknown tokens in test_Text: %.2f%%" % (unknownWordCount/(float(wordCount))*100))

#             return testMatrix


#         word2Idx_path = open("C:/Users/Aravindhan.Poopathy/OneDrive/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/pkl/word2Idx.pickle", "rb")
#         word2Idx = pkl.load(word2Idx_path)

#         if len(testwords)!=0:
#             finalTest_matrix = createtestMatrix(testwords, word2Idx)
#         else:
#             continue

#         resultFilePath = 'C:/Users/Aravindhan.Poopathy/OneDrive/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/pkl/resultdata.pkl.gz'

#         testarray = np.array(finalTest_matrix)
#         testarray = testarray.reshape(1,len(finalTest_matrix))

#         testarray.tolist()
#         testarray.shape

#         f = gzip.open(resultFilePath, 'wb')
#         pkl.dump(testarray, f)
#         f.close()

#     ##########################pre-processing for the sentiment analysis ends here


#     ##########################Preprocessing for topic modelling starts here

#         # compile sample documents into a list
#         doc_set = [resultAudio]

#         # list for tokenized documents in loop
#         texts = []

#         # loop through document list
#         for i in doc_set:

#             # clean and tokenize document string
#             raw = i.lower()
#             tokens = tokenizer.tokenize(raw)

#             # remove stop words from tokens
#             stopped_tokens = [i for i in tokens if not i in en_stop]

#             # stem tokens
#     #         stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

#             # add tokens to list
#             texts.append(stopped_tokens)


#         # turn our tokenized documents into a id <-> term dictionary
#         dictionary = corpora.Dictionary(texts)

#         # convert tokenized documents into a document-term matrix
#         corpus = [dictionary.doc2bow(text) for text in texts]

#         # generate LDA model
#         ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)
#         topicresult = ldamodel.print_topics(num_topics=1, num_words=5)
#         topicList = []

#         for j in range(len(topicresult)):
#             topic = topicresult[j][1]
#         # topic = topic.strip('\"')
#             x = topic.split("+")
#             a = []
#             words=[]

#             for i in range(len(x)):
#                 b = x[i][6:].strip(' ')
#                 b = b.strip('*')
#                 b = b.strip('"')
#                 words.append(b)
#                 pos = nltk.pos_tag(words)
#     #         print(b)
#                 if pos[0][1][0]=='N' or pos[0][1][0]=='V':
#                     a.append(b)
#                 words = []
#                 pos=[]
#         #topicList will have all the topics if the num_topics>1        
#         topicList.append(a)

#         str1 =  " ".join(stri for stri in a)

#     ##########################preprocessing for topic modelling ends here


#     ############################Deeplearning model:

#         from keras.models import load_model
#         from keras.preprocessing import sequence

#         loadedModel = load_model("C:/Users/Aravindhan.Poopathy/OneDrive/Arav/Sentence_classic_new-master/Sentence_classic_new-master/models/model1.h5")

#         result_data = pkl.load(gzip.open("C:/Users/Aravindhan.Poopathy/OneDrive/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/pkl/resultdata.pkl.gz","rb"))
#         print("input data loaded!")

#         result_data.tolist()


#         data = pkl.load(gzip.open("C:/Users/Aravindhan.Poopathy/OneDrive/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/pkl/data.pkl.gz","rb"))
#         print("data loaded!")

#         train_labels = data['train']['labels']
#         train_sentences = data['train']['sentences']

#         dev_labels = data['dev']['labels']
#         dev_sentences = data['dev']['sentences']

#         test_labels = data['test']['labels']
#         test_sentences = data['test']['sentences']

#         word_embeddings = data['wordEmbeddings']


#         # :: Find the longest sentence in our dataset ::
#         max_sentence_len = 0
#         for sentence in train_sentences + dev_sentences + test_sentences:
#             max_sentence_len = max(len(sentence), max_sentence_len)

#         print("Longest sentence: %d" % max_sentence_len)

#         batch_size = 50
#         result_X = sequence.pad_sequences(result_data, maxlen=max_sentence_len)

#         result_y_pred = loadedModel.predict(result_X, batch_size=batch_size)

#         senti = result_y_pred.ravel()
#         print (senti)

#         if(senti[0]>=0.65):
#             speak("The User is positive")
#             print("The User is positive about: ", topicList)
#         elif(senti[0]>=0.35 and senti[0]<0.65):
#             speak("The user is Neutral")
#             print("The user is Neutral about: ", topicList)
#         else:
#             speak("The user is negative")
#             print("The user is negative about: ", topicList)
        
#     else:
#         speak("Sorry I could not understand that!")

# #Loop should end here----------------------------------------------------------------------------------

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12311574173532944334
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 170459136
locality {
  bus_id: 1
}
incarnation: 10317876230059822873
physical_device_desc: "device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0, compute capability: 6.1"
, name: "/device:GPU:1"
device_type: "GPU"
memory_limit: 338231296
locality {
  bus_id: 1
}
incarnation: 16976386436290909991
physical_device_desc: "device: 1, name: GeForce GTX 1080 Ti, pci bus id: 0000:b3:00.0, compute capability: 6.1"
]
Energy threshold - after:  400
tackle
Unknown tokens in test_Text: 100.00%
input data loaded!
data loaded!
Longest sentence: 59
[0.25253412]
The user is negative about:  [['tackle']]
Energy threshold - after:  400
ok right yeah
Unknown tokens in test_Text: 0.00%
input data loaded!
data loaded!
Longest sentence: 59
[0.3003311]
The user is negative about:  [['right', 'ok', 'yeah']]
Energy threshold - after: 

In [1]:
# nltk.download()

In [34]:
#--------------------excluding the speech recognition and text to speech concept from the previous implementation:


####################### Preprocessing for sentiment analysis starts here---------------------------------------    

resultAudio = 'Dear So Energy Team On my latest bill there is a mention that I could save money by changingfrom the Gorilla to the Kangaroo tariff but I cant see how I go about doingthis.   Can you advise please. Many thanks  Jane Inman'

#1. However when my contract finishes on the 2/07/2018 I am moving to a contract which is substantially more expensive than my current deal therefore I prefer to switch to rates that are lower

testwords = resultAudio.split()


def createtestMatrix(sentence, word2Idx):
    unknownIdx = word2Idx['UNKNOWN_TOKEN']
    paddingIdx = word2Idx['PADDING_TOKEN']    


    testMatrix = []
    unknownWordCount = 0
    wordCount = 0

    targetWordIdx = 0

    for word in sentence:
        wordCount += 1

        if word in word2Idx:
            wordIdx = word2Idx[word]
        elif word.lower() in word2Idx:
            wordIdx = word2Idx[word.lower()]
        else:
            wordIdx = unknownIdx
            unknownWordCount += 1
        testMatrix.append(wordIdx)
    print("Unknown tokens in test_Text: %.2f%%" % (unknownWordCount/(float(wordCount))*100))

    return testMatrix


word2Idx_path = open("C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/pkl/word2Idx.pickle", "rb")
word2Idx = pkl.load(word2Idx_path)

# if len(testwords)!=0:
#     finalTest_matrix = createtestMatrix(testwords, word2Idx)
# else:
#     continue

finalTest_matrix = createtestMatrix(testwords, word2Idx)

resultFilePath = 'C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/pkl/resultdata.pkl.gz'

testarray = np.array(finalTest_matrix)
testarray = testarray.reshape(1,len(finalTest_matrix))

testarray.tolist()
testarray.shape

f = gzip.open(resultFilePath, 'wb')
pkl.dump(testarray, f)
f.close()

##########################pre-processing for the sentiment analysis ends here


Unknown tokens in test_Text: 7.14%


In [35]:
##########################Preprocessing for topic modelling starts here

import nltk
import gensim
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models

tokenizer = RegexpTokenizer(r'\w+')

# create English stop words list
en_stop = get_stop_words('en')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()


# compile sample documents into a list
doc_set = [resultAudio]

# list for tokenized documents in loop
texts = []

# loop through document list
for i in doc_set:

    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
#         stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    texts.append(stopped_tokens)


# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=10, id2word = dictionary, passes=20)
topicresult = ldamodel.print_topics(num_topics=10, num_words=5)
topicList = []

for j in range(len(topicresult)):
    topic = topicresult[j][1]
# topic = topic.strip('\"')
    x = topic.split("+")
    a = []
    words=[]

    for i in range(len(x)):
        b = x[i][6:].strip(' ')
        b = b.strip('*')
        b = b.strip('"')
        words.append(b)
        pos = nltk.pos_tag(words)
#         print(b)
        if pos[0][1][0]=='N' or pos[0][1][0]=='V':
            a.append(b)
        words = []
        pos=[]
#topicList will have all the topics if the num_topics>1        
topicList.append(a)

str1 =  " ".join(stri for stri in a)

##########################preprocessing for topic modelling ends here

In [22]:
# !pip install msgpack

In [23]:
# !pip install keras

In [24]:
# !pip install tensorflow

In [36]:
############################Deeplearning model:

from keras.models import load_model
from keras.preprocessing import sequence

loadedModel = load_model("C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/soemailModels/classMailModel1.h5")

result_data = pkl.load(gzip.open("C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/pkl/resultdata.pkl.gz","rb"))
print("input data loaded!")

result_data.tolist()


data = pkl.load(gzip.open("C:/Users/Aravindhan.Poopathy/OneDrive - So Energy/Arav/Sentence_classic_new-master/Sentence_classic_new-master/code/soeMailData/pkl/maildata.pkl.gz","rb"))
print("data loaded!")

train_labels = data['train']['labels']
train_sentences = data['train']['sentences']

dev_labels = data['dev']['labels']
dev_sentences = data['dev']['sentences']

test_labels = data['test']['labels']
test_sentences = data['test']['sentences']

word_embeddings = data['wordEmbeddings']


# :: Find the longest sentence in our dataset ::
max_sentence_len = 0
for sentence in train_sentences + dev_sentences + test_sentences:
    max_sentence_len = max(len(sentence), max_sentence_len)

print("Longest sentence: %d" % max_sentence_len)

batch_size = 50
result_X = sequence.pad_sequences(result_data, maxlen=max_sentence_len)

result_y_pred = loadedModel.predict(result_X, batch_size=batch_size)

# senti = result_y_pred.ravel()
# print (senti)

print(result_y_pred)




input data loaded!
data loaded!
Longest sentence: 704
[[0.08725893 0.05795438 0.00773239 0.00435072 0.8427035 ]]


In [37]:
topicList

[['money', 'mention', 'tariff', 'dear']]

In [38]:
result_y_pred = result_y_pred.tolist()

In [39]:
result = result_y_pred[0]
result

[0.08725893497467041,
 0.05795437842607498,
 0.00773239228874445,
 0.004350717179477215,
 0.8427035212516785]

In [40]:
import operator
index, value = max(enumerate(result), key=operator.itemgetter(1))
print(index,value)
type(index)

4 0.8427035212516785


int

In [41]:
if(index==0):
    print("The User Rating is Excellent! ", topicList)
elif(index==1):
    print("The user Rating is Good! ", topicList)
elif(index==2):
    print("The user Rating is Average! ", topicList)
elif(index==3):
    print("The user Rating is Bad! ", topicList)
elif(index==4):
    print("The user Rating is Terrible ", topicList)

The user Rating is Terrible  [['money', 'mention', 'tariff', 'dear']]
