In [54]:
import nltk

# Fetch every NLTK corpus this notebook relies on so it runs on a fresh
# environment: 'stopwords' is read by stopwords.words('english') further down,
# 'wordnet' is presumably what the lemmatization step needs (TODO confirm).
# nltk.download is a no-op when the corpus is already up to date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/alexander/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [27]:
def trivia_fromDict(dict_obj):
    """Build a ``trivia`` object from one parsed JSON record.

    Args:
        dict_obj: Mapping with the keys ``question_text``, ``choices``,
            ``answer`` and ``meta``; their values are passed positionally
            to ``trivia`` in that order.

    Returns:
        Whatever ``trivia(...)`` returns for those four values.

    Raises:
        Exception: Any error raised while reading the keys or constructing
            the object is re-raised unchanged, after the offending record
            has been printed.
    """
    try:
        return trivia(
            dict_obj["question_text"],
            dict_obj["choices"],
            dict_obj["answer"],
            dict_obj["meta"])
    except Exception:
        # Show the record that failed, then let the original exception
        # propagate; a bare ``raise`` keeps its traceback untouched.
        print(dict_obj)
        raise


In [41]:
import json
from trivia_scraper.trivia import trivia

# Load the scraped questions. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding (JSON is UTF-8).
with open("data/all.json", 'r', encoding="utf-8") as f:
    all_trivia = json.load(f)

# Sanity check: render two records and report how many were loaded.
print(trivia_fromDict(all_trivia[10]))
print(trivia_fromDict(all_trivia[0]))
print("questions:",len(all_trivia))

A European style of the late eighteenth and early nineteenth centuries. Its elegant, balanced works revived the order and harmony of ancient Greek and Roman art. : 
	 Neoclassicism (Neoclassicism)
"Now is the winter of our discontent" is a line from which Shakespearian play? : 
	 Richard III (Richard III,Romeo and Juliet,Macbeth)
questions: 1950


In [29]:
import pandas as pd

# One-column frame holding the unmodified question text of every record.
question_text_list = list(map(lambda record: record["question_text"], all_trivia))

question_data_frame = pd.DataFrame({"question_text_raw": question_text_list})
question_data_frame.head()

Unnamed: 0,question_text_raw
0,"""Now is the winter of our discontent"" is a lin..."
1,"""Our Town"" is a play by whom?"
2,"""The Diary of Anne Frank"" was first published ..."
3,"A band of painted or sculpted decoration, ofte..."
4,A composition made of cut and pasted pieces of...


In [30]:

#insert cleaning methods

def reading_clean(data):
    """Return ``data`` with leading and trailing whitespace removed."""
    trimmed = data.strip()
    return trimmed


# Readable variant of each question: the raw text passed through reading_clean.
readable_questions = question_data_frame["question_text_raw"].apply(reading_clean)
question_data_frame["question_readable"] = readable_questions
question_data_frame.head()

Unnamed: 0,question_text_raw,question_readable
0,"""Now is the winter of our discontent"" is a lin...","""Now is the winter of our discontent"" is a lin..."
1,"""Our Town"" is a play by whom?","""Our Town"" is a play by whom?"
2,"""The Diary of Anne Frank"" was first published ...","""The Diary of Anne Frank"" was first published ..."
3,"A band of painted or sculpted decoration, ofte...","A band of painted or sculpted decoration, ofte..."
4,A composition made of cut and pasted pieces of...,A composition made of cut and pasted pieces of...


In [55]:

from nltk.corpus import stopwords
from textblob import Word
# English stop-word list from NLTK; drop_stop_words filters tokens against it.
stop = stopwords.words('english')

# Characters deleted by remove_punc. Written as a raw string so the backslash
# is taken literally: in a normal literal "\," is an invalid escape sequence,
# which newer Python versions warn about. The resulting value is unchanged.
punctuation = r'''!()-[]{};:'"\,<>./?@#$%^&*_~“”’'''

def lower(data):
    """Lazily yield each element of ``data`` lower-cased.

    For a string this iterates character by character; the caller is
    expected to join the pieces back together.
    """
    return (piece.lower() for piece in data)

def remove_punc(data):
    """Return ``data`` with every character listed in ``punctuation`` deleted."""
    return "".join(ch for ch in data if ch not in punctuation)
def drop_stop_words(data):
    """Return ``data`` without the space-separated tokens found in ``stop``.

    Tokens are obtained by splitting on a single space and the survivors
    are re-joined with a single space.
    """
    kept = []
    for token in data.split(" "):
        if token not in stop:
            kept.append(token)
    return " ".join(kept)

def lemmatization(data):
    """Lemmatize each space-separated token of ``data`` via ``Word.lemmatize``.

    Returns the lemmatized tokens joined by single spaces.
    """
    lemmas = [Word(token).lemmatize() for token in data.split(" ")]
    return " ".join(lemmas)

def clean(data):
    """Normalise one question string for text analysis.

    Steps, in order: lower-case, delete punctuation, drop stop words,
    lemmatize the remaining tokens.
    """
    lowered = "".join(lower(data))
    without_punc = remove_punc(lowered)
    content_words = drop_stop_words(without_punc)
    return lemmatization(content_words)

# Normalised text column: every readable question passed through clean().
cleaned_questions = question_data_frame["question_readable"].apply(clean)
question_data_frame["question_clean"] = cleaned_questions
question_data_frame.head()

Unnamed: 0,question_text_raw,question_readable,question_clean
0,"""Now is the winter of our discontent"" is a lin...","""Now is the winter of our discontent"" is a lin...",winter discontent line shakespearian play
1,"""Our Town"" is a play by whom?","""Our Town"" is a play by whom?",town play
2,"""The Diary of Anne Frank"" was first published ...","""The Diary of Anne Frank"" was first published ...",diary anne frank first published english title
3,"A band of painted or sculpted decoration, ofte...","A band of painted or sculpted decoration, ofte...",band painted sculpted decoration often top wall
4,A composition made of cut and pasted pieces of...,A composition made of cut and pasted pieces of...,composition made cut pasted piece material som...


In [51]:
# Token counts over the whole cleaned corpus; show both ends of the ranking.
all_tokens = ' '.join(question_data_frame["question_clean"]).split()
freq = pd.Series(all_tokens).value_counts()
print(freq.iloc[:10])
print(freq.iloc[-10:])

what      183
name      157
wrote      98
us         95
the        91
first      89
known      67
famous     67
book       56
used       53
dtype: int64
front           1
manufacturer    1
governors       1
campaign        1
flushing        1
portrayed       1
zuckerberg      1
sneakers        1
fries           1
homers          1
dtype: int64


In [61]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

quesiton_text = question_data_frame['question_clean'].tolist()

stop = stopwords.words('english')

# max_df=0.85 drops terms that occur in more than 85% of the questions.
cv = CountVectorizer(max_df=0.85, stop_words=stop)

# fit_transform learns the vocabulary and builds the count matrix in a single
# pass; the separate cv.fit() call that used to precede it repeated the fit.
word_count_vector = cv.fit_transform(quesiton_text)



In [62]:
# Peek at the first ten terms of the fitted vocabulary.
print(list(cv.vocabulary_)[:10])

['winter', 'discontent', 'line', 'shakespearian', 'play', 'town', 'diary', 'anne', 'frank', 'first']


In [67]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF transformer on the term-count matrix built above.
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)



TfidfTransformer()

Getting keywords from a document

In [74]:
def sort_coo(coo_matrix):
    """Return the ``(col, data)`` pairs of ``coo_matrix`` in descending order.

    Pairs are ordered by value first and by column index second, both
    descending, so ties on the value put the larger column index first.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names,sorted_items,topn = 5):
    """Map the first ``topn`` entries of ``sorted_items`` to named scores.

    Args:
        feature_names: Sequence indexed by the first element of each pair.
        sorted_items: Sequence of ``(index, score)`` pairs, already ordered.
        topn: Number of leading pairs to keep.

    Returns:
        Dict from feature name to the score rounded to three decimals, in
        the order of ``sorted_items``.
    """
    return {
        feature_names[index]: round(value, 3)
        for index, value in sorted_items[:topn]
    }

quesiton_text = question_data_frame['question_clean'].tolist()

target_document = quesiton_text[0]

# Term counts -> TF-IDF weights for the single target document.
tf_idf_vector = tfidf_transformer.transform(cv.transform([target_document]))

sorted_items = sort_coo(tf_idf_vector.tocoo())

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

keywords = extract_topn_from_vector(feature_names, sorted_items)

print(target_document)
for k,v in keywords.items():
    print(k,v)

winter discontent line shakespearian play
shakespearian 0.535
discontent 0.508
winter 0.473
line 0.345
play 0.337
