# Sentence Selection

Holy Lovenia / 13515113

In [4]:
%load_ext autoreload
%autoreload 2

## Data Preparation

In [1]:
from nltk import RegexpTokenizer
from pandas.io.json import json_normalize


import json
import pandas as pd
import re

### Read original data

In [2]:
# Collect the raw `context` string of the first paragraph of every article in
# the SQuAD v2.0 training file. Only paragraph 0 of each article is used.
#
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# decoding does not depend on the platform's locale default.
with open('dataset/SQuAD/train-v2.0.json', encoding='utf-8') as f:
    json_data = json.load(f)['data']

data = [article['paragraphs'][0]['context'] for article in json_data]

### Preprocess data

In [3]:
# Build `preprocessed_data`: one entry per article (first paragraph only),
# each entry being a list of sentences, each sentence a list of word tokens.

# A slash-delimited span that contains the IPA primary-stress mark, i.e. a
# dictionary-style pronunciation guide. `[^/]*` keeps the match inside a single
# pair of slashes; a greedy `.*` would also swallow any ordinary text lying
# between the first and the last slash of the line.
_PHONETIC_RE = re.compile(r'/[^/]*ˈ[^/]*/')

# Japanese/CJK punctuation, kana, full-width forms and ideographs.
_CJK_RE = re.compile('[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\uff00-\uff9f\u4e00-\u9faf\u3400-\u4dbf]+')

# Anything outside 7-bit ASCII. Note this also drops accented Latin letters.
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')

# Word tokens: runs of word characters plus '/', '&', '-' and ':'.
# Built once here instead of once per sentence.
tokenizer = RegexpTokenizer(r'[\w/&\-:]+', flags=re.UNICODE)


def _drop_inner_mark(word, mark):
    """Remove every `mark` from `word` unless its first occurrence is the last character.

    A word without `mark`, or whose first `mark` is its final character
    (e.g. a sentence-ending period), is returned unchanged.
    """
    first = word.find(mark)
    if first != -1 and first != len(word) - 1:
        return word.replace(mark, '')
    return word


def _preprocess_paragraph(paragraph):
    """Convert one raw paragraph string into a list of token lists.

    Steps, in order:
      1. delete pronunciation guides, CJK characters, then all remaining
         non-ASCII characters (the phonetic pass must run first because the
         stress mark it looks for is itself non-ASCII);
      2. for each space-separated word, strip periods and then commas that
         are not simply trailing punctuation;
      3. split on '.' to get sentences and tokenize each one, keeping tokens
         longer than one character plus the single-letter word 'a'/'A'.

    Sentences that end up with no tokens are dropped.
    """
    paragraph = _PHONETIC_RE.sub('', paragraph)
    paragraph = _CJK_RE.sub('', paragraph)
    paragraph = _NON_ASCII_RE.sub('', paragraph)

    # Periods are handled before commas, matching the original order.
    words = [
        _drop_inner_mark(_drop_inner_mark(word, '.'), ',')
        for word in paragraph.split(' ')
    ]
    paragraph = ' '.join(words)

    sentences = []
    for raw_sentence in paragraph.split('.'):
        tokens = [
            token.strip()
            for token in tokenizer.tokenize(raw_sentence)
            if len(token.strip()) > 1 or token.lower() == 'a'
        ]
        if tokens:
            sentences.append(tokens)
    return sentences


# The dataset is UTF-8 JSON; pin the encoding so the non-ASCII clean-up above
# sees correctly decoded text regardless of the platform's locale default.
with open('dataset/SQuAD/train-v2.0.json', encoding='utf-8') as f:
    json_data = json.load(f)['data']

# Only the first paragraph of each article is preprocessed.
preprocessed_data = [
    _preprocess_paragraph(article['paragraphs'][0]['context'])
    for article in json_data
]

## Text Summarization

In [5]:
from sentence_selection.lsa import SteinbergerJezekLSA
from sentence_selection.text_rank import TextRank
from sentence_selection.multi_word_phrase_extraction import MultiWordPhraseExtractor

In [17]:
from sentence_selection.base import jaccard_similarity

In [24]:
# Both arguments are sets, so the repeated 'cat' and 'hehe' entries collapse
# before the call: the comparison is {'dog', 'cat', 'rat'} against {'hehe'}.
# NOTE(review): these two sets share no element; confirm the recorded output
# against jaccard_similarity's definition, it may come from an earlier run.
jaccard_similarity({'dog', 'cat', 'cat', 'rat'}, {'hehe', 'hehe', 'hehe'})

0.5

### TextRank

In [14]:
# Two TextRank summarizers that differ only in the `similarity` option passed
# to the constructor; both are run side by side in the usage loop further down.
tr_jaccard = TextRank(similarity='jaccard')
tr_cosine = TextRank(similarity='cosine')

### MultiWordPhraseExtraction

In [7]:
# Multi-word phrase extractor created with its default constructor arguments.
mwpe = MultiWordPhraseExtractor()

### LSA

In [9]:
# LSA summarizer configured with matrix_technique='tfidf'.
# NOTE(review): presumably selects TF-IDF weighting for the input matrix —
# confirm in sentence_selection.lsa.
lsa_tfidf = SteinbergerJezekLSA(matrix_technique='tfidf')

In [10]:
# LSA summarizer configured with matrix_technique='binary'.
# NOTE(review): presumably selects presence/absence weighting for the input
# matrix — confirm in sentence_selection.lsa.
lsa_binary = SteinbergerJezekLSA(matrix_technique='binary')

### Usage example

In [15]:
# Run all five summarizers on every preprocessed paragraph and print, per
# paragraph, its index followed by each summarizer's return value.
# NOTE(review): the printed lists look like selected sentence indices —
# confirm against the summarizers' `summarize` implementations.
count_same_results = 0  # initialised here; nothing in this cell updates it

for i, sentence_tokens in enumerate(preprocessed_data):
    lsa_tfidf_result = lsa_tfidf.summarize(sentence_tokens)
    lsa_binary_result = lsa_binary.summarize(sentence_tokens)
    multiword_result = mwpe.summarize(sentence_tokens)
    tr_jaccard_result = tr_jaccard.summarize(sentence_tokens)
    tr_cosine_result = tr_cosine.summarize(sentence_tokens)
    print(
        i,
        lsa_tfidf_result,
        lsa_binary_result,
        multiword_result,
        tr_jaccard_result,
        tr_cosine_result,
    )

0 [0, 2] [3, 1] [0, 1] [1, 0] [1, 2]
1 [2, 0] [1, 0] [0, 1] [2, 0] [1, 0]


  """


2 [5, 4] [2, 6] [2, 3] [4, 3] [2, 0]
3 [2, 1] [0, 3] [0, 1] [0, 2] [3, 1]
4 [4, 1] [2, 0] [0, 2] [4, 2] [4, 1]
5 [3, 0] [1, 2] [2, 0] [0, 4] [3, 1]
6 [0] [0] [0] [0] [0]
7 [1, 2] [0, 1] [1, 0] [2, 1] [2, 1]
8 [0, 1] [2, 1] [1, 2] [0, 1] [1, 0]
9 [0] [0] [0] [0] [0]
10 [0, 1] [1, 0] [0, 1] [1, 0] [1, 0]
11 [2, 0] [1, 0] [0, 1] [0, 1] [2, 0]
12 [2, 1] [3, 1] [3, 0] [1, 3] [2, 3]
13 [0] [0] [0] [0] [0]
14 [1, 0] [2, 0] [0, 2] [0, 1] [2, 1]
15 [1, 2] [0, 2] [0, 2] [1, 2] [0, 1]
16 [3, 0] [2, 0] [0, 1] [1, 3] [3, 1]
17 [1, 0] [1, 0] [0, 1] [1, 0] [0, 1]
18 [1, 0] [3, 1] [0, 3] [0, 2] [3, 2]
19 [1, 2] [0, 2] [1, 2] [0, 1] [1, 2]
20 [1, 2] [0, 2] [0, 2] [1, 2] [1, 2]
21 [1, 2] [0, 2] [0, 1] [1, 2] [0, 2]
22 [0, 2] [1, 2] [2, 0] [0, 1] [2, 0]
23 [0] [0] [0] [0] [0]
24 [0, 1] [1, 0] [0, 1] [0, 1] [0, 1]
25 [1, 2] [2, 3] [0, 1] [3, 1] [2, 3]
26 [4, 3] [1, 4] [4, 0] [0, 3] [0, 2]
27 [2, 1] [3, 0] [3, 0] [0, 2] [0, 2]
28 [2, 1] [0, 1] [0, 1] [1, 2] [0, 2]
29 [1, 2] [2, 0] [0, 1] [1, 0] [0, 1]
30 [

222 [8, 7] [1, 2] [2, 9] [3, 8] [8, 6]
223 [1, 3] [4, 2] [4, 0] [2, 3] [0, 1]
224 [4, 2] [0, 3] [0, 1] [1, 3] [4, 2]
225 [1, 0] [2, 3] [1, 0] [1, 3] [2, 0]
226 [2, 4] [0, 1] [0, 3] [2, 1] [3, 0]
227 [4, 2] [0, 1] [0, 2] [3, 1] [5, 2]
228 [1, 2] [0, 3] [0, 1] [4, 2] [1, 3]
229 [0, 2] [1, 3] [0, 1] [2, 3] [3, 0]
230 [3, 5] [4, 1] [6, 0] [3, 7] [0, 4]
231 [3, 4] [2, 6] [2, 0] [2, 1] [0, 3]
232 [4, 1] [2, 0] [1, 0] [7, 5] [5, 7]
233 [2, 4] [4, 0] [2, 3] [2, 0] [0, 2]
234 [3, 1] [4, 2] [4, 0] [1, 2] [0, 4]
235 [2, 0] [3, 1] [3, 0] [2, 1] [0, 3]
236 [3, 1] [1, 0] [1, 0] [2, 1] [3, 1]
237 [1, 3] [2, 6] [6, 0] [5, 2] [2, 4]
238 [1, 3] [5, 4] [0, 5] [1, 5] [5, 1]
239 [2, 0] [4, 1] [0, 1] [3, 2] [4, 1]
240 [6, 1] [4, 2] [4, 0] [6, 5] [6, 0]
241 [2, 6] [1, 0] [1, 5] [4, 1] [3, 1]
242 [6, 3] [1, 0] [4, 0] [3, 6] [0, 2]
243 [2, 0] [1, 2] [1, 0] [0, 2] [1, 2]
244 [1, 0] [0, 2] [0, 1] [1, 0] [1, 2]
245 [2, 1] [0, 3] [0, 1] [1, 0] [1, 0]
246 [3, 2] [0, 1] [0, 1] [3, 1] [0, 3]
247 [2, 5] [4, 3] [4, 0] 

433 [0, 2] [2, 1] [2, 0] [1, 2] [1, 2]
434 [5, 1] [2, 3] [2, 0] [5, 6] [1, 4]
435 [2, 1] [1, 0] [0, 3] [3, 2] [3, 2]
436 [1, 0] [0, 1] [1, 0] [1, 0] [1, 0]
437 [0, 4] [2, 4] [1, 0] [1, 3] [1, 3]
438 [3, 1] [1, 2] [1, 0] [2, 3] [2, 1]
439 [4, 1] [3, 5] [0, 3] [1, 0] [0, 5]
440 [3, 7] [0, 2] [4, 0] [3, 2] [1, 3]
441 [2, 3] [1, 0] [0, 1] [2, 1] [3, 1]


In [151]:
# Print the tokenized sentences of one preprocessed paragraph, one numbered
# line per sentence, with the tokens re-joined by single spaces.
idx = 211

for i, tokens in enumerate(preprocessed_data[idx]):
    print(i, ' '.join(tokens))

0 jews originated as a national and religious group in the middle east during the second millennium bce in the part of the levant known as the land of israel
1 the merneptah stele appears to confirm the existence of a people of israel associated with the god el somewhere in canaan as far back as the 13th century bce
2 the israelites as an outgrowth of the canaanite population consolidated their hold with the emergence of the kingdom of israel and the kingdom of judah
3 some consider that these canaanite sedentary israelites melded with incoming nomadic groups known as hebrews
4 though few sources in the bible mention the exilic periods in detail the experience of diaspora life from the ancient egyptian rule over the levant to assyrian captivity and exile to babylonian captivity and exile to seleucid imperial rule to the roman occupation and the historical relations between israelites and the homeland became a major feature of jewish history identity and memory
