In [9]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from nltk.tag import pos_tag
import nltk
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
nltk.download('stopwords')
from matplotlib import pyplot as plt
from keras.preprocessing.text import text_to_word_sequence
import heapdict
from gensim.models.phrases import Phrases, Phraser
import sys

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aditisaini/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aditisaini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
def performStemming(tokens):
    stemmed_words = []
    ps = PorterStemmer() 
    for token in tokens:
        stemmed = ps.stem(token)
        stemmed_words.append(stemmed)
    return stemmed_words

In [11]:
def removeStopWords(tokens):
    all_stopwords = stopwords.words('english')
    tokens_without_sw = [word for word in tokens if not word in all_stopwords]
    return tokens_without_sw

In [12]:
def performSentenceSegmentation(file_content):
    #Training the model using given text: unsupervised learning
    tokenizer = PunktSentenceTokenizer()
    tokenizer.train(file_content)
    sentence_segmentation = tokenizer.tokenize(file_content)
    return sentence_segmentation

In [13]:
def performPOSTagging(sentences):
    pos_tagged = {}
    for sentence in sentences:
        pos_tagged[sentence] = pos_tag(word_tokenize(sentence))
    return pos_tagged

In [14]:
#Get sentence length
def averageSentenceLength(segmented_sentence):
    total = 0
    size = len(segmented_sentence)
    for s in segmented_sentence:
        words = s.split()
        total+=len(words)
    return total/size

In [15]:
#Plotting graph
def plotgraph(freqdict, graphname, xlabel, ylabel):
    x = list(freqdict.keys())
    y = list(freqdict.values())
    plt.figure()
    plt.bar(x, y, width=1.0)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    #plt.show()
    plt.savefig(graphname + '.png')
    plt.clf()

In [16]:
#the x-axis is the length of a token in number of characters, and the y-axis is the number of tokens of each length
def visualTokenAnalysis(tokens):
    token_analysis = {}
    for token in tokens:
        if len(token) in token_analysis.keys():
            token_analysis[len(token)]+=1
        else:
            token_analysis[len(token)]=1
    return token_analysis

In [17]:
#the x-axis is the length of a sentence in number of tokens/words, and the y-axis is the number of sentences of each length
def visualSentenceAnalysis(sentence_segmentation):
    sentence_analysis = {}
    for sentence in sentence_segmentation:
        words = word_tokenize(sentence)
        words_size = len(words)
        if words_size in sentence_analysis.keys():
            sentence_analysis[words_size]+=1
        else:
            sentence_analysis[words_size]=1
    return sentence_analysis

In [18]:
def top20Words(tokens):
    token_size = heapdict.heapdict() 
    for token in tokens:
        if token in token_size.keys():
            token_size[token]-=1
        else:
            token_size[token]=-1
    top20 = []
    for poptokens in range(20):
        top20.append(token_size.popitem()[0])
    return top20

In [19]:
def performVisualAnalysis(token_analysis, stemmed_token_analysis, sentence_analysis):
    plotgraph(token_analysis, "Token length analysis", "length of a token in number of characters", "number of tokens of each length")
    plotgraph(sentence_analysis, "Sentence length analysis", "length of a sentence in number of words/tokens", "number of sentences each length")
    plotgraph(stemmed_token_analysis, "Stemmed token length analysis", "length of a stemmed token in number of characters", "number of stemmed tokens of each length")

In [20]:
def extract_phrases(my_tree, phrase):
    my_phrases = []
    if my_tree.label() == phrase:
        my_phrases.append(my_tree.copy(True))
    for child in my_tree:
        if type(child) is nltk.Tree:
            list_of_phrases = extract_phrases(child, phrase)
            if len(list_of_phrases) > 0:
                my_phrases.extend(list_of_phrases)

    return my_phrases

In [21]:
def improveTokeniser(sentence_segmentation):
    phrases = []
    grammar = "NP: {<JJ>*<NN>|<NNP>*}"
    cp = nltk.RegexpParser(grammar)
    for x in sentence_segmentation:
        sentence = pos_tag(word_tokenize(x))
        tree = cp.parse(sentence)
        list_of_noun_phrases = extract_phrases(tree, 'NP')
        for phrase in list_of_noun_phrases:
            phrases.append("_".join([x[0] for x in phrase.leaves()]))
    return phrases

In [23]:
def printPOSTagged(pos_tagged):
    for tag in pos_tag:
        print(tag)
        print("\n")
        print(pos_tag[tag])
        print('\n\n')

In [22]:
def main():
    #1: Load databases
    file_content = open('datasets/dataset1.txt').read()
    #2: Split the content for each review
    each_reviews = file_content.split("---xxx---")
    #3: Tokenisation
    #tokens = word_tokenize(file_content)
    tokens = text_to_word_sequence(file_content)
    #4: Remove stop words from text
    tokens_without_sw = removeStopWords(tokens)
    #5: Stemming
    stemmed_words = performStemming(tokens_without_sw)
    #6: Top 20 words
    top20 = top20Words(tokens_without_sw)
    top20StemmedWords = top20Words(stemmed_words)
    #7: Sentence segmentation
    sentence_segmentation = performSentenceSegmentation(file_content)
    #8: Improving tokeniser by extracting noun phrases
    phrases = improveTokeniser(sentence_segmentation)
    #9: POS Tagging
    ##9.1 Sentences from each datasets 1, 2, 3
    sentences = ["All restaurants have children’s menus.", "", ""]
    ##9.2 POS Tagged sentences
    pos_tagged = performPOSTagging(sentences)
    #10: Average length of each sentence
    avg_length = averageSentenceLength(sentence_segmentation)
    #11: Graphical analysis
    token_analysis = visualTokenAnalysis(tokens_without_sw)
    stemmed_token_analysis = visualTokenAnalysis(stemmed_words)
    sentence_analysis = visualSentenceAnalysis(sentence_segmentation)
    performVisualAnalysis(token_analysis, stemmed_token_analysis, sentence_analysis)

In [221]:
main()

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

In [24]:
file_content = open('datasets/dataset1.txt').read()
sentences = ["All restaurants have children’s menus.", "Complimentary amenities include a welcome pack and daily ice-cream passes", "You won’t have to jostle with other hotel guests even if there’s a crowd."]
pos_tagged = performPOSTagging(sentences)
printPOSTagged(pos_tagged)


TypeError: 'function' object is not iterable