# Instructions:
1. Compute the most important keywords (a keyword can be 1-3 words long).
2. Choose the top n keywords from the previously generated list. Compare these keywords with all the words occurring in all of the transcripts.
3. Generate a score (rank) for these top n keywords based on the analysed transcripts.

-----------------------------------------------------------------------------

In [51]:
from sklearn.feature_extraction import stop_words
from nltk.corpus import stopwords

In [52]:
import math
from textblob import TextBlob as tb

In [2]:
# Load the whole reference script into memory as one string.
with open("scripts/script.txt", "r") as script_file:
    data = script_file.read()

#with open("scripts/script.txt", "r") as f:
#    data2 = f.readlines()

#for line in data:
#    words = data.split()

In [3]:
# Read each of the three transcripts into its own string (t1, t2, t3).
with open("scripts/transcript_1.txt", "r") as transcript_file:
    t1 = transcript_file.read()

with open("scripts/transcript_2.txt", "r") as transcript_file:
    t2 = transcript_file.read()

with open("scripts/transcript_3.txt", "r") as transcript_file:
    t3 = transcript_file.read()


# PRE-PROCESSING!

In [4]:
from spacy.en import English
import nltk
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS


In [7]:
# Instantiate the spaCy English pipeline and run it over the script text.
# `parsedData` is the parsed result for `data`.
parser = English()
parsedData = parser(data)

Food is any substance consumed to provide nutritional support for

In [18]:
# Walk over the first two tokens of parsedData and dump their attributes.
# Each token is an object with lots of different properties. Most come in
# two flavours: the plain name returns an index (int) into spaCy's
# vocabulary, while the same name with a trailing underscore returns the
# string representation.
# The probability estimate is based on counts from a 3 billion word
# corpus, smoothed using the Simple Good-Turing method.
paired_attributes = (
    ("original:", "orth"),
    ("lowercased:", "lower"),
    ("lemma:", "lemma"),
    ("shape:", "shape"),
    ("prefix:", "prefix"),
    ("suffix:", "suffix"),
)
for i, token in enumerate(parsedData[0:2]):
    # Print the integer id followed by its string form for each paired attribute.
    for label, attr_name in paired_attributes:
        print(label, getattr(token, attr_name), getattr(token, attr_name + "_"))
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")

original: 6832 Food
lowercased: 960 food
lemma: 960 food
shape: 354724 Xxxx
prefix: 3792 F
suffix: 155057 ood
log probability: -11.701742172241211
Brown cluster id: 838
----------------------------------------
original: 474 is
lowercased: 474 is
lemma: 488 be
shape: 21581 xx
prefix: 570 i
suffix: 474 is
log probability: -4.457748889923096
Brown cluster id: 762
----------------------------------------


# TF-IDF

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def tf(word, blob):
    """Return the term frequency of ``word`` within ``blob``.

    Args:
        word: The term to look up.
        blob: A document exposing a ``words`` sequence that supports
            ``count`` and ``len``.

    Returns:
        The number of occurrences of ``word`` divided by the total number
        of words, as a float. An empty document yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    words = blob.words
    total = len(words)
    if not total:
        return 0.0
    # float() keeps this a true division even under Python 2 semantics.
    return words.count(word) / float(total)

def n_containing(word, bloblist):
    """Count the documents in ``bloblist`` whose ``words`` contain ``word``.

    Args:
        word: The term to look for.
        bloblist: An iterable of documents, each exposing a ``words``
            container.

    Returns:
        The number of documents containing ``word`` as an int.
    """
    matches = 0
    for document in bloblist:
        if word in document.words:
            matches += 1
    return matches

def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as ``log(N / (1 + n))`` where ``N`` is the number of documents
    and ``n`` is the number of documents containing ``word``. The ``1 +``
    keeps the denominator non-zero for unseen words; as a consequence the
    result is negative when ``word`` occurs in every document.

    Args:
        word: The term to score.
        bloblist: A sized collection of documents.

    Returns:
        The natural-log IDF value as a float.
    """
    total_documents = len(bloblist)
    smoothed_matches = 1 + n_containing(word, bloblist)
    return math.log(total_documents / smoothed_matches)

def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    Args:
        word: The term to score.
        blob: The document the term frequency is measured in.
        bloblist: The collection of documents used for the IDF part.

    Returns:
        The product of ``tf(word, blob)`` and ``idf(word, bloblist)``.
    """
    term_frequency = tf(word, blob)
    inverse_document_frequency = idf(word, bloblist)
    return term_frequency * inverse_document_frequency


In [None]:
# Wrap the script and the three transcripts as TextBlob documents.
# (Built directly instead of using a comprehension only for its append side effect.)
bloblist = [tb(doc) for doc in [data, t1, t2, t3]]

# Print the three highest-scoring TF-IDF words of every document.
for i, blob in enumerate(bloblist):
    print("Top words in document {}".format(i + 1))
    # Score each distinct word only once; dict.fromkeys keeps first-occurrence
    # order, so ties are ranked exactly as before.
    distinct_words = dict.fromkeys(blob.words)
    scores = {word: tfidf(word, blob, bloblist) for word in distinct_words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("Word: {}, TF-IDF: {}".format(word, round(score, 5)))

In [None]:
# CountVectorizer's first constructor argument is `input` (the kind of source:
# 'content', 'filename' or 'file'), not the text itself. Build the vectorizer
# with its defaults and fit it on a one-document corpus to get the term counts.
CountVectorizer().fit_transform([data])

In [None]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,3), min_df = 0, stop_words = 'english')


In [None]:
tfidf_matrix = tf.fit_transform(data2)

In [None]:
feature_names = tf.get_feature_names() 

In [None]:
# Show the TF-IDF matrix dimensions next to the number of feature names.
(tfidf_matrix.shape, len(feature_names))

In [None]:
# Densify the sparse TF-IDF matrix, then take its first row as a flat
# Python list of scores.
dense = tfidf_matrix.todense()
first_row_nested = dense[0].tolist()
episode = first_row_nested[0]

In [None]:
# Keep an (index, score) pair for every entry of `episode` with a positive score.
phrase_scores = [(feature_id, weight) for feature_id, weight in enumerate(episode) if weight > 0]

In [None]:
# Rank the pairs by descending score and print the 20 best phrases,
# left-aligned in a 20-character column followed by the score.
sorted_phrase_scores = sorted(phrase_scores, key=lambda pair: pair[1], reverse=True)
for word_id, score in sorted_phrase_scores[:20]:
    phrase = feature_names[word_id]
    print('{0: <20} {1}'.format(phrase, score))

In [None]:
def freq(word, tokens):
    """Return how many times ``word`` occurs in ``tokens``.

    Delegates to ``tokens.count(word)``, so ``tokens`` must provide a
    ``count`` method.
    """
    return tokens.count(word)


# Compute the frequency of every term (unigrams, bigrams and trigrams) per tip.
# NOTE(review): `venue`, `tokenizer`, `bigrams` and `trigrams` are not defined
# or imported in the visible part of this notebook - confirm where they come
# from before running this cell.
from collections import Counter

# `stopwords` is the NLTK corpus reader, not a collection of words, so
# membership tests need the actual English word list. A set keeps each
# lookup cheap.
english_stopwords = set(stopwords.words('english'))

vocabulary = []
docs = {}
all_tips = []
for tip in venue.tips():
    tokens = tokenizer.tokenize(tip.text)

    # Build the n-grams from the raw tokens, before short words and
    # stopwords are dropped from the unigram list.
    bi_tokens = bigrams(tokens)
    tri_tokens = trigrams(tokens)
    tokens = [token.lower() for token in tokens if len(token) > 2]
    tokens = [token for token in tokens if token not in english_stopwords]

    bi_tokens = [' '.join(token).lower() for token in bi_tokens]
    bi_tokens = [token for token in bi_tokens if token not in english_stopwords]

    tri_tokens = [' '.join(token).lower() for token in tri_tokens]
    tri_tokens = [token for token in tri_tokens if token not in english_stopwords]

    final_tokens = []
    final_tokens.extend(tokens)
    final_tokens.extend(bi_tokens)
    final_tokens.extend(tri_tokens)

    # One counting pass instead of a list.count() scan per token.
    docs[tip.text] = {'freq': dict(Counter(final_tokens))}

print(docs)

## RAKE

In [20]:
from rake_nltk import Rake

r = Rake() # Uses NLTK's English stopwords and all punctuation characters.

In [24]:
# If you want to provide your own set of stop words and punctuation, use:
# r = Rake(<list of stopwords>, <string of punctuation to ignore>)

r.extract_keywords_from_text(data)

In [23]:
r.get_ranked_phrases_with_scores() # Returns (score, phrase) pairs, ranked highest to lowest.

[(35.5, 'supermarket giants wield great purchasing power'),
 (35.416666666666664,
  'support programs allowed major grain exporting countries'),
 (33.923809523809524,
  'western kitchens use variable temperature convection ovens'),
 (33.09701492537313, 'wikimedia commons food timeline wikibooks cookbook'),
 (31.798245614035086,
  'common agricultural policy ), national government policy'),
 (30.097014925373134, 'food market following george w bush'),
 (29.583333333333332,
  '2013 overseas development institute researchers showed'),
 (29.554347826086957, 'may help remove potentially harmful chemicals'),
 (28.9, 'digestive complaints usually develop within half'),
 (27.668443496801704, 'taste another universal phenomenon regarding food'),
 (25.69316877152698, 'raw food preparation certain cultures highlight animal'),
 (24.378040566398774, 'dried foods also promote food safety'),
 (24.25, 'growing trend toward sustainable agricultural practices'),
 (24.234696084793423, 'evidence suggests 

### WORD2VEC + DOC2VEC

### TEXT RANK?