In [38]:
import itertools
import random
import json
import pprint
import pickle
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#nltk.download('punkt')

In [39]:
# Load the question-answering dataset (the file name suggests SQuAD v2.0 train -- confirm).
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying on the
# platform default, which is not UTF-8 everywhere.
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
with open('/Users/christopher/Desktop/projects/Metis_Projects/Project_4/data/train-v2.0.json', encoding='utf-8') as f:
    data = json.load(f)

In [40]:
# dictionary of wiki articles and their respective content presented as one long string
# Module-level stores filled in by extract_content(); both are keyed by the article's
# position in the dataset, as a string ("0", "1", ...).
#   page[key] -> {"title": ..., "content": ...}   content is a list of paragraphs that
#                                                 is later joined into one long string
#   qa[key]   -> {"questions": [...], "answers": [...]}
page, qa = {}, {}

In [49]:
def extract_content(data):
    """Fill the global ``page`` and ``qa`` stores from ``data`` and persist them.

    Parameters
    ----------
    data : dict
        Parsed dataset. ``data["data"]`` must be a sequence of articles, each with a
        ``"title"`` and a ``"paragraphs"`` list whose items carry a ``"context"``
        string and a ``"qas"`` list of dicts with a ``"question"`` key.

    Side effects
    ------------
    * ``page[str(i)]`` / ``qa[str(i)]`` are (re)created for every article index ``i``.
    * ``unconcat_paragraphs.pickle`` receives ``page`` while ``"content"`` is still a
      list of paragraphs.
    * ``wiki_pages.json`` receives ``page`` after each ``"content"`` has been joined
      into a single space-separated string.
    * ``wiki_questions.json`` receives ``qa``.

    The ``"answers"`` list of every ``qa`` entry is created but left empty.
    """
    articles = data["data"]

    for i, article in enumerate(articles):
        key = str(i)
        paragraphs = article["paragraphs"]
        page[key] = {
            "title": article["title"],
            "content": [paragraph["context"] for paragraph in paragraphs],
        }
        qa[key] = {
            "questions": [
                item["question"]
                for paragraph in paragraphs
                for item in paragraph["qas"]
            ],
            "answers": [],
        }

    # save unconcatenated paragraphs for each wiki page
    # (context managers guarantee the files are flushed and closed, even on error)
    with open('unconcat_paragraphs.pickle', 'wb') as pickle_out:
        pickle.dump(page, pickle_out)

    # concatenate all paragraphs for a wiki page into 1 long string
    for i in range(len(articles)):
        page[str(i)]["content"] = ' '.join(page[str(i)]["content"])

    # save content & questions
    with open('wiki_pages.json', 'w') as pages_out:
        json.dump(page, pages_out)
    with open('wiki_questions.json', 'w') as questions_out:
        json.dump(qa, questions_out)

In [50]:
# Populate `page` / `qa` from the loaded dataset and write the pickle/JSON artifacts to disk.
extract_content(data)

In [51]:
# create corpus of all wiki pages content
# Build two parallel lists over all wiki pages, ordered by article index:
# corpus_titles[i] is the title of the page whose content is corpus[i].
corpus_titles = [page[str(doc_idx)]["title"] for doc_idx in range(len(page))]
corpus = [page[str(doc_idx)]["content"] for doc_idx in range(len(page))]

In [52]:
# sanity check
# sanity check: the three counts below are expected to be equal
number_of_docs, corpus_length, corpus_titles_length = len(page), len(corpus), len(corpus_titles)
print(
    'number of docs:', number_of_docs, '\n',
    'corpus_length: ', corpus_length, '\n',
    'corpus_title_length:', corpus_titles_length,
)

number of docs: 442 
 corpus_length:  442 
 corpus_title_length: 442


In [53]:
def top_wiki_indices(arr, n):
    """Return the indices of the ``n`` documents most similar to the question.

    Parameters
    ----------
    arr : 2-D array-like
        Cosine-similarity matrix whose LAST row belongs to the question and whose
        LAST column is the question compared with itself.
    n : int
        Number of indices wanted. Values larger than the number of documents are
        clamped; ``n <= 0`` yields an empty list.

    Returns
    -------
    list of int
        Document indices ordered from highest to lowest similarity. Ties are broken
        by the higher index first.
    """
    # select the question's similarity row and drop its last entry
    # (the question compared with itself)
    cs_q = np.delete(arr[-1], -1)

    # Clamp n: np.argpartition raises when n exceeds the array length, and n == 0
    # would turn the slice below into [-0:], i.e. "everything".
    n = min(int(n), cs_q.size)
    if n <= 0:
        return []

    # argpartition finds the top-n candidates without sorting the whole row ...
    candidates = np.argpartition(cs_q, -n)[-n:]

    # ... and only those n candidates are then ordered, highest similarity first
    ranked = sorted(candidates, key=lambda idx: (cs_q[idx], idx), reverse=True)
    return [int(idx) for idx in ranked]


In [74]:
def best_wiki(own_question=False, n_wikis=5, n_paragraphs=3):
    """Rank wiki articles against a question using TF-IDF cosine similarity.

    Parameters
    ----------
    own_question : bool, default False
        If true, prompt the user for a question via ``input``. Otherwise pick a
        random article from the global ``qa`` and a random question from it.
    n_wikis : int, default 5
        Number of top-ranked article titles to return.
    n_paragraphs : int, default 3
        Maximum number of leading paragraphs of the best article to return.

    Returns
    -------
    (list of str, list of str)
        Titles of the top ``n_wikis`` articles (best first) and up to
        ``n_paragraphs`` opening paragraphs of the best one, read from
        ``unconcat_paragraphs.pickle``.

    Raises
    ------
    ValueError
        If a random question is requested but ``qa`` holds no questions.

    Notes
    -----
    Reads the globals ``corpus``, ``corpus_titles`` and ``qa`` without modifying them.
    """
    print(len(corpus_titles), len(corpus))

    # choose a question
    if own_question:
        question = input('Ask a quesiton: ')
    else:
        # Sample article first, then question. Articles without questions are skipped
        # so random.choice never sees an empty list, and every key used exists
        # (randint(0, len(qa)) is inclusive and could produce a missing key).
        question_pools = [entry["questions"] for entry in qa.values() if entry["questions"]]
        if not question_pools:
            raise ValueError('qa contains no questions to sample from')
        question = random.choice(random.choice(question_pools))
        print(question)

    # Vectorize a NEW list (corpus + question) so the shared global corpus is never
    # touched and cannot be left polluted if anything below raises.
    documents = corpus + [question]
    tfidf = TfidfVectorizer(strip_accents='ascii', stop_words='english')
    X_tfidf = tfidf.fit_transform(documents)

    # Compute cosine similarity of the question (last row) against every document.
    # The matrix stays sparse and only the one needed row is computed; the result is
    # a 1 x len(documents) array whose last column is the question vs. itself.
    cs_array = cosine_similarity(X_tfidf[-1:], X_tfidf)

    # Index in the similarity row maps 1 to 1 onto the index of corpus_titles
    best_title = corpus_titles[np.argmax(np.delete(cs_array[-1], -1))]
    print('Most Relevant Wiki Article: ', best_title)

    # get indices of top n wiki articles and translate them to titles
    top_wikis = top_wiki_indices(cs_array, n_wikis)
    top_wikis_list = [corpus_titles[index] for index in top_wikis]
    if not top_wikis:
        return top_wikis_list, []

    # load unconcatenated paragraphs dictionary
    # NOTE: pickle.load executes arbitrary code from the file; only load a pickle
    # that this notebook wrote itself.
    with open('unconcat_paragraphs.pickle', 'rb') as pickle_in:
        paragraphs = pickle.load(pickle_in)

    # first few paragraphs of the top article; slicing tolerates short articles
    top_wiki_index = top_wikis[0]
    few_paragraphs = paragraphs[str(top_wiki_index)]["content"][:max(0, n_paragraphs)]

    return top_wikis_list, few_paragraphs


In [76]:
# Demo: rank articles for a question sampled at random from `qa`.
best_wiki()

442 442
Who does the senior leadership roles in the USAF include? 


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Most Relevant Wiki Article:  United_States_Air_Force


(['United_States_Air_Force',
  'Party_leaders_of_the_United_States_House_of_Representatives',
  'Labour_Party_(UK)',
  'Order_of_the_British_Empire',
  'Korean_War'],
 ["The United States Air Force (USAF) is the aerial warfare service branch of the United States Armed Forces and one of the seven American uniformed services. Initially part of the United States Army, the USAF was formed as a separate branch of the military on 18 September 1947 under the National Security Act of 1947. It is the most recent branch of the U.S. military to be formed, and is the largest and one of the world's most technologically advanced air forces. The USAF articulates its core functions as Nuclear Deterrence Operations, Special Operations, Air Superiority, Global Integrated ISR, Space Superiority, Command and Control, Cyberspace Superiority, Personnel Recovery, Global Precision Attack, Building Partnerships, Rapid Global Mobility and Agile Combat Support.",
  'The U.S. Air Force is a military service organ

In [73]:
# Demo: rank articles for a question typed in at the prompt.
best_wiki(own_question=True)

442 442
Ask a quesiton: who is yeezy?


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Most Relevant Wiki Article:  Kanye_West


(['Kanye_West',
  'Matter',
  'Franco-Prussian_War',
  'General_Electric',
  'Valencia'],
 ' ',
 ['Kanye Omari West (/ˈkɑːnjeɪ/; born June 8, 1977) is an American hip hop recording artist, record producer, rapper, fashion designer, and entrepreneur. He is among the most acclaimed musicians of the 21st century, attracting both praise and controversy for his work and his outspoken public persona.',
  "Raised in Chicago, West briefly attended art school before becoming known as a producer for Roc-A-Fella Records in the early 2000s, producing hit singles for artists such as Jay-Z and Alicia Keys. Intent on pursuing a solo career as a rapper, West released his debut album The College Dropout in 2004 to widespread commercial and critical success, and founded record label GOOD Music. He went on to explore a variety of different musical styles on subsequent albums that included the baroque-inflected Late Registration (2005), the arena-inspired Graduation (2007), and the starkly polarizing 808s