#### Import Libraries

In [1]:
import gensim
import nltk
import json
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_md

#### Import Train Data

In [3]:
with open ('train-v2.0.json') as f:
    train = json.load(f)

In [4]:
text = train['data'][5]['paragraphs'][0]['context']
text

"Spectre (2015) is the twenty-fourth James Bond film produced by Eon Productions. It features Daniel Craig in his fourth performance as James Bond, and Christoph Waltz as Ernst Stavro Blofeld, with the film marking the character's re-introduction into the series. It was directed by Sam Mendes as his second James Bond film following Skyfall, and was written by John Logan, Neal Purvis, Robert Wade and Jez Butterworth. It is distributed by Metro-Goldwyn-Mayer and Columbia Pictures. With a budget around $245 million, it is the most expensive Bond film and one of the most expensive films ever made."

In [4]:
text = text.lower()
text = text.replace('-', ' ')
text

"spectre (2015) is the twenty fourth james bond film produced by eon productions. it features daniel craig in his fourth performance as james bond, and christoph waltz as ernst stavro blofeld, with the film marking the character's re introduction into the series. it was directed by sam mendes as his second james bond film following skyfall, and was written by john logan, neal purvis, robert wade and jez butterworth. it is distributed by metro goldwyn mayer and columbia pictures. with a budget around $245 million, it is the most expensive bond film and one of the most expensive films ever made."

#### Convert text into Sentences

In [5]:
sentences = text.split('.')
sentences

['spectre (2015) is the twenty fourth james bond film produced by eon productions',
 " it features daniel craig in his fourth performance as james bond, and christoph waltz as ernst stavro blofeld, with the film marking the character's re introduction into the series",
 ' it was directed by sam mendes as his second james bond film following skyfall, and was written by john logan, neal purvis, robert wade and jez butterworth',
 ' it is distributed by metro goldwyn mayer and columbia pictures',
 ' with a budget around $245 million, it is the most expensive bond film and one of the most expensive films ever made',
 '']

In [6]:
del(sentences[-1])
sentences

['spectre (2015) is the twenty fourth james bond film produced by eon productions',
 " it features daniel craig in his fourth performance as james bond, and christoph waltz as ernst stavro blofeld, with the film marking the character's re introduction into the series",
 ' it was directed by sam mendes as his second james bond film following skyfall, and was written by john logan, neal purvis, robert wade and jez butterworth',
 ' it is distributed by metro goldwyn mayer and columbia pictures',
 ' with a budget around $245 million, it is the most expensive bond film and one of the most expensive films ever made']

#### Vectorization

In [9]:
vect = TfidfVectorizer(min_df = 1).fit(sentences)

#### Test

In [10]:
train_array = vect.transform(sentences).toarray()

#### TF-IDF

In [11]:
dict_vect = list(vect.vocabulary_.keys())

In [12]:
idf = vect.idf_

In [13]:
tf_list = [text.count(word) for word in dict_vect]
tf = np.array(tf_list)

In [14]:
tfidf = tf * idf

In [15]:
df = pd.DataFrame(index = dict_vect, data = tfidf)
df.head()

Unnamed: 0,0
spectre,2.098612
2015,2.098612
is,9.458572
the,12.591674
twenty,1.693147


#### Test

In [16]:
query = 'How many James Bond films has Eon Productions produced?'

In [17]:
qn = [query.lower()]

In [18]:
def qn_ans(train_array, qn_list):
    
    ans = []
    
    for qn in qn_list:
        qn_array = vect.transform([qn]).toarray()
        result = cosine_similarity(train_array, qn_array)
        
        if result.max() > 0.1:
            ans.append((sentences[result.argmax()]))
        else:
            ans.append('')
            
    return ans

In [19]:
qn_ans(train_array, qn)

['spectre (2015) is the twenty fourth james bond film produced by eon productions']

#### Questions

In [20]:
qn_list = []
for i in range(len(train['data'][5]['paragraphs'][0]['qas'])):
    qn_list.append(train['data'][5]['paragraphs'][0]['qas'][i]['question'].lower())

In [21]:
qn_list

['which company made spectre?',
 'who is the star of spectre?',
 'what role does daniel craig play in spectre?',
 'what 007 movie did sam mendes previously direct?',
 'how much money did it take to make spectre?',
 'how many james bond films has eon productions produced?',
 'which bond film was the most expensive ever made?',
 'how many films has daniel craig appeared in as james bond?',
 'which two movie studios distributed the james bond film spectre?',
 'who directed spectre?',
 'what is the name of the thirty-fourth james bond film?',
 'in what year was the thirty-fourth james bond film produced?',
 'daniel craig stars as ernst stavro blofeld in what 2015 film?',
 'sam mendes wrote what 2015 film?',
 'john logan directed what 2015 film?']

In [22]:
pd.DataFrame(index = qn_list, data = qn_ans(train_array, qn_list))

Unnamed: 0,0
which company made spectre?,spectre (2015) is the twenty fourth james bond...
who is the star of spectre?,spectre (2015) is the twenty fourth james bond...
what role does daniel craig play in spectre?,it features daniel craig in his fourth perfor...
what 007 movie did sam mendes previously direct?,it was directed by sam mendes as his second j...
how much money did it take to make spectre?,spectre (2015) is the twenty fourth james bond...
how many james bond films has eon productions produced?,spectre (2015) is the twenty fourth james bond...
which bond film was the most expensive ever made?,"with a budget around $245 million, it is the ..."
how many films has daniel craig appeared in as james bond?,it features daniel craig in his fourth perfor...
which two movie studios distributed the james bond film spectre?,spectre (2015) is the twenty fourth james bond...
who directed spectre?,spectre (2015) is the twenty fourth james bond...


#### NER

In [26]:
nlp = en_core_web_md.load()

In [33]:
cosine_similarity(vect.transform(['James Bond']).toarray(), vect.transform(['Daniel Craig']).toarray())

array([[ 0.]])

#### Create QA Dataset

In [33]:
qa_dataset = pd.DataFrame(columns = ['Text', 'Questions', 'Answers'])

In [88]:
for i in range(len(train['data'])):
    qn,ans = [],[]
    
    text = [train['data'][i]['paragraphs'][0]['context']]
    
    for j in range(len(train['data'][i]['paragraphs'][0]['qas'])):
        qn.append(train['data'][i]['paragraphs'][0]['qas'][j]['question'])
        ans.append(train['data'][i]['paragraphs'][0]['qas'][j]['answers'])
    
    df = pd.DataFrame({'Text':text, 'Questions':[qn], 'Answers':[ans]})
    
    qa_dataset = pd.concat([qa_dataset, df])

In [93]:
qa_df = qa_dataset.reset_index().drop(['index'], axis = 1)
qa_df.head()

Unnamed: 0,Text,Questions,Answers
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"[When did Beyonce start becoming popular?, Wha...","[in the late 1990s, singing and dancing, 2003,..."
1,Frédéric François Chopin (/ˈʃoʊpæn/; French pr...,"[What was Frédéric's nationalities?, In what e...","[Polish and French, Romantic era, solo piano, ..."
2,The exact nature of relations between Tibet an...,[Who were Wang Jiawei and Nyima Gyaincain?],[Mainland Chinese scholars]
3,The iPod is a line of portable media players a...,"[Which company produces the iPod?, When was th...","[Apple, October 23, 2001, three, portable medi..."
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"[When did Beyonce start becoming popular?, Wha...","[in the late 1990s, singing and dancing, 2003,..."


In [97]:
qa_df.to_csv('QA Dataset.csv')