In [1]:
import re
import glob
import numpy as np
from collections import Counter 
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import sent_tokenize
from nltk.cluster.util import cosine_distance as cosDist
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from gensim.models import KeyedVectors
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer; get_document_data_for_signature uses it to normalise words.
lemmatizer = WordNetLemmatizer()
# Word vectors loaded from a local binary word2vec-format file; get_sig_feature queries
# `model` for the nearest neighbours of a document's most frequent words.
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)



In [36]:
def getDocByPara(fname):
    """Read a text file and split it into paragraphs.

    A paragraph boundary is a blank line, i.e. two consecutive newline
    characters.

    Args:
        fname: Path of the text file to read.

    Returns:
        list of str: The chunks of the file in order. Callers in this
        notebook treat element 0 as the heading.
    """
    # The context manager guarantees the handle is closed, even if read() fails.
    with open(fname, "r") as handle:
        text = handle.read()
    return text.split("\n\n")

In [37]:
def get_no_of_quoted_words(sentence):
    """Count the whitespace-separated words that lie inside double quotes.

    A quotation opens at a word that starts with '"' and closes at a word
    that ends with '"', optionally followed by trailing punctuation
    (for example `won",` or `it."` or `"hello"`). Both the opening and the
    closing word are counted.

    Args:
        sentence: The sentence text.

    Returns:
        int: Number of words inside double-quoted spans.
    """
    trailing_punctuation = ".,;:!?"
    counter = 0
    in_quote = False
    for eachWord in sentence.split():
        was_open = in_quote
        if eachWord.startswith("\""):
            in_quote = True
        if in_quote:
            counter += 1
            # Ignore punctuation that follows the closing quote, e.g. `word",`.
            core = eachWord.rstrip(trailing_punctuation)
            # A lone '"' token that has just opened the quotation must not
            # close it again with the very same character.
            if core.endswith("\"") and (len(core) > 1 or was_open):
                in_quote = False
    return counter

In [38]:
def get_count_of_content_words(sentence):
    """Count the words of a sentence that are not English stop words.

    The sentence is split on whitespace and each token is compared as-is
    (no lower-casing, no punctuation stripping) against NLTK's English
    stop-word list.

    Args:
        sentence: The sentence text.

    Returns:
        int: Number of tokens that are not in the stop-word list.
    """
    english_stop_words = set(stopwords.words('english'))
    return sum(1 for token in sentence.split() if token not in english_stop_words)

In [39]:
def get_surf_features_from_para(paragraph):
    """Compute four surface features for every sentence of a paragraph.

    Sentences that are exactly one character long are dropped and do not
    take part in the position numbering.

    Each kept sentence yields a list of four values:
      1. para_start: 1 for the first kept sentence of the paragraph, else 0.
      2. position: reciprocal of the sentence's 1-based position.
      3. length: number of content words, or 0 when that count is <= 5.
      4. quoted_words: share of the sentence's words that are NOT counted
         as quoted, i.e. (total - quoted) / total.

    Args:
        paragraph: Text of one paragraph.

    Returns:
        list of list: One four-element feature list per kept sentence.
    """
    kept_sentences = [s for s in sent_tokenize(paragraph) if len(s) != 1]
    rows = []
    for position, sentence in enumerate(kept_sentences, start=1):
        para_start = 1 if position == 1 else 0

        content_words = get_count_of_content_words(sentence)
        # Short sentences (five content words or fewer) are zeroed out.
        length = content_words if content_words > 5 else 0

        quoted = get_no_of_quoted_words(sentence)
        word_total = len(sentence.split())
        unquoted_share = (word_total - quoted) / word_total

        rows.append([para_start, 1 / position, length, unquoted_share])
    return rows

In [40]:
def get_surf_features_from_doc(file_path):
    """Build the surface-feature matrix for a whole document.

    The first paragraph returned by getDocByPara is treated as the heading
    and skipped. The per-paragraph features are concatenated, then a fifth
    column 'document_first' is added: 1 for the first row, 0 for all others.

    Args:
        file_path: Path of the document.

    Returns:
        numpy.ndarray: One row of five values per sentence.
    """
    rows = []
    for paragraph in getDocByPara(file_path)[1:]:
        rows.extend(get_surf_features_from_para(paragraph))

    # Fifth feature: flag the very first sentence of the document.
    rows[0].append(1)
    for row in rows[1:]:
        row.append(0)
    return np.array(rows)

In [41]:
def get_document_without_heading(fname, is_summ = False):
    """Read a file and return its sentences, optionally stripping the heading.

    The whole file is sentence-tokenised and every one-character sentence is
    discarded. Unless `is_summ` is True, the first sentence is assumed to
    start with the heading: it is split on blank lines and only the chunk
    right after the first blank line is kept as the first sentence. If the
    first sentence contains no blank line it is dropped entirely.

    Args:
        fname: Path of the text file.
        is_summ: Pass True for summary files, which have no heading.

    Returns:
        list of str: The sentences of the document. Empty if the file holds
        no usable sentence.
    """
    # Close the handle deterministically instead of leaking it.
    with open(fname, "r") as handle:
        text = handle.read()

    # Filtering into a new list avoids removing items from a list while
    # iterating over it, which skipped the element after each removal.
    sent_text = [sent for sent in nltk.sent_tokenize(text) if len(sent) != 1]

    # `is False` is kept deliberately so other falsy values behave as before.
    if is_summ is False and sent_text:
        heading = sent_text.pop(0).split('\n\n')
        if len(heading) > 1:
            sent_text.insert(0, heading[1])
    return sent_text

In [42]:
def get_document_as_statements_list(file_location):
    """Return the document's sentences with English stop words removed.

    The sentences come from get_document_without_heading. Each one is split
    on single spaces, tokens found in NLTK's English stop-word list are
    dropped (exact, case-sensitive match), and the rest are re-joined with
    single spaces.

    Args:
        file_location: Path of the document.

    Returns:
        list of str: One filtered sentence per input sentence.
    """
    english_stop_words = set(stopwords.words('english'))
    filtered_sentences = []
    for sentence in get_document_without_heading(file_location):
        kept_tokens = [token for token in sentence.split(" ") if token not in english_stop_words]
        filtered_sentences.append(" ".join(kept_tokens))
    return filtered_sentences

In [43]:
def get_bigram_tfidf_dictionary_and_feature_names(document):
    """Fit a unigram+bigram TF-IDF model on the document's sentences.

    Args:
        document: Iterable of sentence strings; each is one TF-IDF "document".

    Returns:
        tuple: (feature_names, dictionary, tokenizer) where
          - feature_names is a list of the vocabulary terms, indexed by column,
          - dictionary maps (row, column) to the stored TF-IDF value,
          - tokenizer is the vectorizer's own tokenizer callable.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1,2))
    X = vectorizer.fit_transform(document)
    # get_feature_names() no longer exists in current scikit-learn releases;
    # fall back to it only when the replacement is unavailable. The result is
    # converted to a list because callers rely on list semantics.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    X = X.tocoo()
    dictionary = {(row, col): value for row, col, value in zip(X.row, X.col, X.data)}
    return feature_names, dictionary, vectorizer.build_tokenizer()

In [44]:
def get_unigram_tfidf_dictionary_and_feature_names(document):
    """Fit a unigram TF-IDF model on the document's sentences.

    Args:
        document: Iterable of sentence strings; each is one TF-IDF "document".

    Returns:
        tuple: (feature_names, dictionary, tokenizer) where
          - feature_names is a list of the vocabulary terms, indexed by column,
          - dictionary maps (row, column) to the stored TF-IDF value,
          - tokenizer is the vectorizer's own tokenizer callable.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(document)
    # get_feature_names() no longer exists in current scikit-learn releases;
    # fall back to it only when the replacement is unavailable. The result is
    # converted to a list because callers rely on list semantics.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    X = X.tocoo()
    dictionary = {(row, col): value for row, col, value in zip(X.row, X.col, X.data)}
    return feature_names, dictionary, vectorizer.build_tokenizer()

In [45]:
def get_unigram_average_tfidf_list(document, tokenizer, feature_names, dictionary):
    """Average the unigram TF-IDF weights of each sentence.

    Args:
        document: Sequence of sentence strings, in the same order that was
            used to build `dictionary`.
        tokenizer: Callable turning a sentence into a list of tokens.
        feature_names: Sequence of vocabulary terms (list or array); the
            position of a term is its column index.
        dictionary: Mapping (sentence index, column index) -> TF-IDF value.

    Returns:
        list: One average per sentence; 0 for a sentence without tokens.

    Raises:
        ValueError: If a token is not present in `feature_names`.
        KeyError: If `dictionary` has no entry for a (sentence, term) pair.
    """
    # Build the term -> column map once instead of scanning feature_names for
    # every token. setdefault keeps the first position, like list.index did.
    column_of = {}
    for column, term in enumerate(feature_names):
        column_of.setdefault(term, column)

    average_tfidf_list = []
    for line_no, line in enumerate(document):
        words = tokenizer(line)
        total_tfidf = 0
        for word in words:
            term = word.lower()
            if term not in column_of:
                raise ValueError("%r is not in feature_names" % term)
            total_tfidf += dictionary[(line_no, column_of[term])]
        if len(words) != 0:
            average_tfidf_list.append(total_tfidf / len(words))
        else:
            average_tfidf_list.append(0)
    return average_tfidf_list

In [46]:
def get_bigram_average_tfidf_list(document, tokenizer, feature_names, dictionary):
    """Average the unigram and bigram TF-IDF weights of each sentence.

    For every sentence the weights of all its tokens and of all its adjacent
    token pairs ("w1 w2") are summed, and the sum is divided by the number of
    tokens.

    Args:
        document: Sequence of sentence strings, in the same order that was
            used to build `dictionary`.
        tokenizer: Callable turning a sentence into a list of tokens.
        feature_names: Sequence of vocabulary terms (list or array); the
            position of a term is its column index.
        dictionary: Mapping (sentence index, column index) -> TF-IDF value.

    Returns:
        list: One value per sentence; 0 for a sentence without tokens.

    Raises:
        ValueError: If a token or token pair is not in `feature_names`.
        KeyError: If `dictionary` has no entry for a (sentence, term) pair.
    """
    # Build the term -> column map once instead of scanning feature_names for
    # every lookup. setdefault keeps the first position, like list.index did.
    column_of = {}
    for column, term in enumerate(feature_names):
        column_of.setdefault(term, column)

    def weight(line_no, term):
        # TF-IDF value of `term` in sentence `line_no`.
        if term not in column_of:
            raise ValueError("%r is not in feature_names" % term)
        return dictionary[(line_no, column_of[term])]

    average_tfidf_list = []
    for line_no, line in enumerate(document):
        words = [word.lower() for word in tokenizer(line)]
        if len(words) != 0:
            total_tfidf = weight(line_no, words[0])
            for previous, current in zip(words, words[1:]):
                total_tfidf += weight(line_no, current)
                total_tfidf += weight(line_no, previous + " " + current)
            average_tfidf = total_tfidf / len(words)
        else:
            average_tfidf = 0
        average_tfidf_list.append(average_tfidf)
    return average_tfidf_list

In [47]:
def total_uni_bi(sent_text):
    """Collect every unigram and bigram of a document.

    Each sentence is split on single spaces; n-grams never span two
    sentences.

    Args:
        sent_text: List of sentence strings.

    Returns:
        tuple: (unigrams, bigrams), two lists of tuples in document order.
    """
    all_unigrams = []
    all_bigrams = []
    for sentence in sent_text:
        tokens = sentence.split(' ')
        all_unigrams.extend(nltk.ngrams(tokens, 1))
        all_bigrams.extend(nltk.ngrams(tokens, 2))
    return all_unigrams, all_bigrams

In [48]:
def get_uniFreq_bifreq(path):
    """Compute the uniFreq and biFreq features for each sentence of a document.

    For every n-gram occurrence in a sentence, the ratio
    (count in this sentence) / (count in the whole document) is taken; the
    feature is the mean of those ratios over the sentence's n-gram
    occurrences. A sentence without bigrams gets a biFreq of 0.

    Args:
        path: Path of the document.

    Returns:
        tuple: (uni_vector, bi_vector), two lists with one value per sentence.
    """
    sentences = get_document_as_statements_list(path)
    doc_unigrams, doc_bigrams = total_uni_bi(sentences)
    doc_unigram_counts = Counter(doc_unigrams)
    doc_bigram_counts = Counter(doc_bigrams)

    uni_vector = []
    bi_vector = []
    for sentence in sentences:
        tokens = sentence.split(' ')
        unigrams = list(nltk.ngrams(tokens, 1))
        bigrams = list(nltk.ngrams(tokens, 2))
        sentence_unigram_counts = Counter(unigrams)
        sentence_bigram_counts = Counter(bigrams)

        unigram_total = sum(
            sentence_unigram_counts.get(gram) / doc_unigram_counts.get(gram)
            for gram in unigrams
        )
        uni_vector.append(unigram_total / len(unigrams))

        if len(bigrams) == 0:
            bi_vector.append(0)
        else:
            bigram_total = sum(
                sentence_bigram_counts.get(gram) / doc_bigram_counts.get(gram)
                for gram in bigrams
            )
            bi_vector.append(bigram_total / len(bigrams))
    return uni_vector, bi_vector

In [49]:
def get_document_data_for_signature(file_location):
    """Prepare sentences and normalised words for the signature feature.

    Works on the stop-word-filtered sentences of the document. Within each
    sentence, whitespace-separated words of length <= 1 are dropped. A word
    whose first hyphen is not at position 0 has all hyphens replaced by
    spaces and only its first two parts are kept. Words without an
    apostrophe are lemmatised with the WordNet lemmatizer; words containing
    an apostrophe are stemmed with the English Snowball stemmer. Only the
    normalised form goes into the word list; the rebuilt sentence keeps the
    un-normalised words.

    Args:
        file_location: Path of the document.

    Returns:
        tuple: (sentences, data_words) where sentences is a list of rebuilt
        sentence strings and data_words is a flat list of normalised words
        for the whole document.
    """
    rebuilt_sentences = []
    data_words = []
    for sentence in get_document_as_statements_list(file_location):
        kept = []
        for token in sentence.split():
            if len(token) <= 1:
                continue
            # Apostrophe words are stemmed, everything else is lemmatised.
            if "'" in token:
                normalise = SnowballStemmer("english").stem
            else:
                normalise = lemmatizer.lemmatize
            if token.find("-") > 0:
                parts = token.replace('-', ' ').split(' ')
                first, second = parts[0], parts[1]
                data_words.append(normalise(first))
                data_words.append(normalise(second))
                kept.append(first)
                kept.append(second)
            else:
                data_words.append(normalise(token))
                kept.append(token)
        rebuilt_sentences.append(" ".join(kept))
    return rebuilt_sentences, data_words

In [50]:
def get_sig_feature(file):
    """Compute the signature-word weight of each sentence in a document.

    The (up to) five most frequent normalised words of the document are
    taken as topics. For each topic the embedding model's most similar
    words are collected together with their similarity scores. A sentence's
    feature value is the sum of the scores of its tokens that appear among
    those similar words.

    Args:
        file: Path of the document.

    Returns:
        list: One value per sentence; 0 when no token matches.
    """
    sent, total_words_for_sig_feature = get_document_data_for_signature(file)

    # most_common(5) may return fewer than five entries for short documents,
    # so iterate over what is there instead of indexing 0..4.
    topics = [word for word, _count in Counter(total_words_for_sig_feature).most_common(5)]

    signature_weight = {}
    for topic in topics:
        try:
            # Query the KeyedVectors object directly: its `.wv` alias is gone
            # in current gensim, and the former bare `except` hid that error,
            # leaving the feature silently at zero.
            neighbours = model.most_similar(topic)
        except KeyError:
            # The topic word is not in the embedding vocabulary; skip it.
            continue
        for word, similarity in neighbours:
            # Keep the first score seen for a word, as list.index() used to.
            signature_weight.setdefault(word, similarity)

    sig_vector = []
    for sentence in sent:
        total = 0
        for token in nltk.word_tokenize(sentence):
            if token in signature_weight:
                total += signature_weight[token]
        sig_vector.append(total)
    return sig_vector

In [1]:
def get_content_feature_vectors(document_path_as_string):
    """Build the content-feature matrix of a document.

    The columns are, in order:
      1. average unigram TF-IDF of the sentence,
      2. average unigram+bigram TF-IDF of the sentence,
      3. average unigram frequency,
      4. average bigram frequency,
      5. summed weight of unigram signature words.

    Args:
        document_path_as_string: Path of the document.

    Returns:
        numpy.ndarray: One row of five values per sentence.
    """
    sentences = get_document_as_statements_list(document_path_as_string)

    uni_names, uni_weights, uni_tokenizer = get_unigram_tfidf_dictionary_and_feature_names(sentences)
    uni_tfidf = get_unigram_average_tfidf_list(sentences, uni_tokenizer, uni_names, uni_weights)

    bi_names, bi_weights, bi_tokenizer = get_bigram_tfidf_dictionary_and_feature_names(sentences)
    bi_tfidf = get_bigram_average_tfidf_list(sentences, bi_tokenizer, bi_names, bi_weights)

    uni_freq, bi_freq = get_uniFreq_bifreq(document_path_as_string)
    signature = get_sig_feature(document_path_as_string)

    rows = [
        [uni_tfidf[i], bi_tfidf[i], uni_freq[i], bi_freq[i], signature[i]]
        for i in range(len(sentences))
    ]
    return np.array(rows)

In [52]:
def sentence_similarity(sentence_1, sentence_2):
    """Cosine similarity of two tokenised sentences using word counts.

    Tokens are lower-cased, both sentences are turned into count vectors
    over their joint vocabulary, and 1 minus the cosine distance of those
    vectors is returned.

    Args:
        sentence_1: List of word strings.
        sentence_2: List of word strings.

    Returns:
        float: The similarity. 0.0 if either sentence has no tokens.
    """
    sentence_1 = [eachWord.lower() for eachWord in sentence_1]
    sentence_2 = [eachWord.lower() for eachWord in sentence_2]
    # An empty sentence yields an all-zero vector, whose cosine is 0/0 (nan);
    # treat it as having nothing in common with anything.
    if not sentence_1 or not sentence_2:
        return 0.0
    allWords = list(set(sentence_1 + sentence_2))
    # Look word positions up in a dict rather than scanning the list each time.
    position = {word: index for index, word in enumerate(allWords)}
    vector_1 = [0] * len(allWords)
    vector_2 = [0] * len(allWords)
    for eachWord in sentence_1:
        vector_1[position[eachWord]] += 1
    for eachWord in sentence_2:
        vector_2[position[eachWord]] += 1
    return (1 - (cosDist(vector_1, vector_2)))

In [53]:
def get_y(document, fname):
    """Similarity of every sentence to the first sentence of its paragraph.

    The file is re-read paragraph by paragraph (the first chunk, the
    heading, is skipped). Inside each paragraph, one-character sentences are
    dropped and stop words are removed before the sentences are compared
    with the paragraph's first sentence.

    Args:
        document: Unused; kept so existing callers keep working.
        fname: Path of the document.

    Returns:
        list: One similarity value per kept sentence, in document order.
    """
    # The stop-word set does not change between paragraphs, so build it once.
    stop_words = set(stopwords.words('english'))
    y = []
    for raw_paragraph in getDocByPara(fname)[1:]:
        sentences = []
        for sentence in nltk.sent_tokenize(raw_paragraph):
            if len(sentence) == 1:
                continue
            kept = [word for word in sentence.split(" ") if word not in stop_words]
            sentences.append(" ".join(kept))
        if not sentences:
            continue
        first_sentence = sentences[0].split()
        for sentence in sentences:
            y.append(sentence_similarity(first_sentence, sentence.split()))
    return y

In [54]:
def build_similarity_matrix(sentences):
    """Build a row-normalised sentence-to-sentence similarity matrix.

    Entry (i, j) is the similarity of sentences i and j; the diagonal is 0.
    Every row whose sum is not 0 is then divided by that sum.

    Args:
        sentences: List of sentence strings.

    Returns:
        numpy.ndarray: Square matrix with one row and column per sentence.
    """
    count = len(sentences)
    tokenised = [sentence.split() for sentence in sentences]
    S = np.zeros((count, count))
    for i in range(count):
        for j in range(count):
            if i != j:
                S[i][j] = sentence_similarity(tokenised[i], tokenised[j])
    # Each `row` is a view into S, so the in-place division updates S itself.
    for row in S:
        row_total = row.sum()
        if row_total != 0:
            row /= row_total
    return S

In [3]:
def get_relevance_features(fname):
    """Build the relevance-feature matrix of a document.

    The columns are, in order:
      1. similarity to the first sentence of the document,
      2. similarity to the first sentence of the sentence's paragraph,
      3. column sum of the row-normalised similarity matrix, i.e. the
         combined relevance with respect to all sentences.

    Args:
        fname: Path of the document.

    Returns:
        numpy.ndarray: One row of three values per sentence.
    """
    document = get_document_as_statements_list(fname)

    to_document_first = [
        sentence_similarity(document[0].split(), sentence.split())
        for sentence in document
    ]
    to_paragraph_first = get_y(document, fname)
    combined = build_similarity_matrix(document).sum(axis=0)

    stacked = np.vstack((
        np.array(to_document_first),
        np.array(to_paragraph_first),
        np.array(combined),
    ))
    return stacked.T

In [2]:
def get_feature_vector(file):
    """Assemble the full 13-column feature matrix of a document.

    Columns, in order:
      Surface features
        1.  para_start: 1 if first sentence of its paragraph, else 0.
        2.  position: reciprocal of the sentence number in its paragraph.
        3.  length: number of content words, or 0 if that is <= 5.
        4.  quoted_words: share of words not counted as quoted.
        5.  document_first: 1 if first sentence of the document, else 0.
      Relevance features
        6.  relevance to the first sentence of the document.
        7.  relevance to the first sentence of the paragraph.
        8.  combined relevance with respect to all sentences.
      Content features
        9.  average unigram TF-IDF.
        10. average bigram TF-IDF.
        11. average unigram frequency.
        12. average bigram frequency.
        13. weight of unigram signature words.

    Args:
        file: Path of the document.

    Returns:
        numpy.ndarray: One row per sentence, the three groups side by side.
    """
    feature_groups = (
        get_surf_features_from_doc(file),
        get_relevance_features(file),
        get_content_feature_vectors(file),
    )
    return np.concatenate(feature_groups, axis=1)

In [57]:
def get_labels(doc_data, summ_data):
    """Label each document sentence by whether it occurs in the summary.

    Args:
        doc_data: Sentences of the document.
        summ_data: Sentences of the summary.

    Returns:
        list of int: 1 where the sentence is found in `summ_data`, else 0.
    """
    return [1 if sentence in summ_data else 0 for sentence in doc_data]

In [58]:
def get_feature_vector_with_labels():
    """Build the training matrix and labels from the training directories.

    Documents are read from updated_data/Training_Data/*.txt and summaries
    from updated_data/Training_Summ/*.txt. Both path lists are sorted and
    paired by position, so the two directories must sort into the same
    order. Each pair of paths is printed as it is processed.

    Returns:
        tuple: (feature_vectors, labels) where feature_vectors is an array
        with 13 columns and one row per sentence over all documents, and
        labels is a flat list of 0/1 values in the same order.
    """
    data_paths = sorted(glob.glob("updated_data/Training_Data/*.txt"))
    summ_paths = sorted(glob.glob("updated_data/Training_Summ/*.txt"))

    # Start with an empty 13-column block so the result keeps its shape even
    # when there are no files, and stack once at the end rather than
    # re-copying the growing matrix for every document.
    feature_blocks = [np.zeros((0, 13))]
    labels = []
    for i in range(len(data_paths)):
        print(data_paths[i])
        print(summ_paths[i])
        doc_data = get_document_without_heading(data_paths[i])
        summ_data = get_document_without_heading(summ_paths[i], is_summ = True)
        doc_labels = get_labels(doc_data, summ_data)
        feature_blocks.append(get_feature_vector(data_paths[i]))
        labels = labels + doc_labels
    feature_vectors = np.vstack(feature_blocks)
    return feature_vectors, labels

In [59]:
# Build the training feature matrix and labels; each document/summary path is printed as it is processed.
feature_vectors, labels = get_feature_vector_with_labels()

updated_data/Training_Data\1.txt
updated_data/Training_Summ\1.txt
(20,) (20,) (20,)


  # This is added back by InteractiveShellApp.init_path()
  if np.issubdtype(vec.dtype, np.int):


(20, 5) (20, 3) (20, 5)
updated_data/Training_Data\10.txt
updated_data/Training_Summ\10.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Training_Data\100.txt
updated_data/Training_Summ\100.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\1000.txt
updated_data/Training_Summ\1000.txt
(27,) (27,) (27,)
(27, 5) (27, 3) (27, 5)
updated_data/Training_Data\1001.txt
updated_data/Training_Summ\1001.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\1002.txt
updated_data/Training_Summ\1002.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Training_Data\1003.txt
updated_data/Training_Summ\1003.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\1004.txt
updated_data/Training_Summ\1004.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\1005.txt
updated_data/Training_Summ\1005.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\1006.txt
updated_data/Training_Summ\1006.tx

(7, 5) (7, 3) (7, 5)
updated_data/Training_Data\1065.txt
updated_data/Training_Summ\1065.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\1066.txt
updated_data/Training_Summ\1066.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\1067.txt
updated_data/Training_Summ\1067.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\1068.txt
updated_data/Training_Summ\1068.txt
(7,) (7,) (7,)
(7, 5) (7, 3) (7, 5)
updated_data/Training_Data\1069.txt
updated_data/Training_Summ\1069.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\107.txt
updated_data/Training_Summ\107.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\1070.txt
updated_data/Training_Summ\1070.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\1071.txt
updated_data/Training_Summ\1071.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\1072.txt
updated_data/Training_Summ\1072.txt
(21

(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\1131.txt
updated_data/Training_Summ\1131.txt
(21,) (21,) (21,)
(21, 5) (21, 3) (21, 5)
updated_data/Training_Data\1132.txt
updated_data/Training_Summ\1132.txt
(6,) (6,) (6,)
(6, 5) (6, 3) (6, 5)
updated_data/Training_Data\1133.txt
updated_data/Training_Summ\1133.txt
(7,) (7,) (7,)
(7, 5) (7, 3) (7, 5)
updated_data/Training_Data\1134.txt
updated_data/Training_Summ\1134.txt
(7,) (7,) (7,)
(7, 5) (7, 3) (7, 5)
updated_data/Training_Data\1135.txt
updated_data/Training_Summ\1135.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\1136.txt
updated_data/Training_Summ\1136.txt
(9,) (9,) (9,)
(9, 5) (9, 3) (9, 5)
updated_data/Training_Data\1137.txt
updated_data/Training_Summ\1137.txt
(23,) (23,) (23,)
(23, 5) (23, 3) (23, 5)
updated_data/Training_Data\1138.txt
updated_data/Training_Summ\1138.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\1139.txt
updated_data/Training_Summ\1139.txt
(20,) (20,) (20,

(7, 5) (7, 3) (7, 5)
updated_data/Training_Data\1198.txt
updated_data/Training_Summ\1198.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\1199.txt
updated_data/Training_Summ\1199.txt
(17,) (17,) (17,)
(17, 5) (17, 3) (17, 5)
updated_data/Training_Data\12.txt
updated_data/Training_Summ\12.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\120.txt
updated_data/Training_Summ\120.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\1200.txt
updated_data/Training_Summ\1200.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\1201.txt
updated_data/Training_Summ\1201.txt
(24,) (24,) (24,)
(24, 5) (24, 3) (24, 5)
updated_data/Training_Data\1202.txt
updated_data/Training_Summ\1202.txt
(9,) (9,) (9,)
(9, 5) (9, 3) (9, 5)
updated_data/Training_Data\1203.txt
updated_data/Training_Summ\1203.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\1204.txt
updated_data/Training_Summ\1204.txt
(17,) (

(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\1263.txt
updated_data/Training_Summ\1263.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Training_Data\1264.txt
updated_data/Training_Summ\1264.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\1265.txt
updated_data/Training_Summ\1265.txt
(28,) (28,) (28,)
(28, 5) (28, 3) (28, 5)
updated_data/Training_Data\1266.txt
updated_data/Training_Summ\1266.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Training_Data\1267.txt
updated_data/Training_Summ\1267.txt
(41,) (41,) (41,)
(41, 5) (41, 3) (41, 5)
updated_data/Training_Data\1268.txt
updated_data/Training_Summ\1268.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\1269.txt
updated_data/Training_Summ\1269.txt
(7,) (7,) (7,)
(7, 5) (7, 3) (7, 5)
updated_data/Training_Data\127.txt
updated_data/Training_Summ\127.txt
(21,) (21,) (21,)
(21, 5) (21, 3) (21, 5)
updated_data/Training_Data\1270.txt
updated_data/Training_Summ\1270.txt


(7, 5) (7, 3) (7, 5)
updated_data/Training_Data\1329.txt
updated_data/Training_Summ\1329.txt
(9,) (9,) (9,)
(9, 5) (9, 3) (9, 5)
updated_data/Training_Data\133.txt
updated_data/Training_Summ\133.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\1330.txt
updated_data/Training_Summ\1330.txt
(17,) (17,) (17,)
(17, 5) (17, 3) (17, 5)
updated_data/Training_Data\1331.txt
updated_data/Training_Summ\1331.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Training_Data\1332.txt
updated_data/Training_Summ\1332.txt
(26,) (26,) (26,)
(26, 5) (26, 3) (26, 5)
updated_data/Training_Data\1333.txt
updated_data/Training_Summ\1333.txt
(26,) (26,) (26,)
(26, 5) (26, 3) (26, 5)
updated_data/Training_Data\1334.txt
updated_data/Training_Summ\1334.txt
(17,) (17,) (17,)
(17, 5) (17, 3) (17, 5)
updated_data/Training_Data\1335.txt
updated_data/Training_Summ\1335.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\1336.txt
updated_data/Training_Summ\1336.txt
(16,) (16

(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\1395.txt
updated_data/Training_Summ\1395.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\1396.txt
updated_data/Training_Summ\1396.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\1397.txt
updated_data/Training_Summ\1397.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\1398.txt
updated_data/Training_Summ\1398.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\1399.txt
updated_data/Training_Summ\1399.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Training_Data\14.txt
updated_data/Training_Summ\14.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Training_Data\140.txt
updated_data/Training_Summ\140.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Training_Data\1400.txt
updated_data/Training_Summ\1400.txt
(9,) (9,) (9,)
(9, 5) (9, 3) (9, 5)
updated_data/Training_Data\1401.txt
updated_data/Training_Summ\1401.txt
(13,) (13,

(47, 5) (47, 3) (47, 5)
updated_data/Training_Data\1460.txt
updated_data/Training_Summ\1460.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Training_Data\1461.txt
updated_data/Training_Summ\1461.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\1462.txt
updated_data/Training_Summ\1462.txt
(37,) (37,) (37,)
(37, 5) (37, 3) (37, 5)
updated_data/Training_Data\1463.txt
updated_data/Training_Summ\1463.txt
(28,) (28,) (28,)
(28, 5) (28, 3) (28, 5)
updated_data/Training_Data\1464.txt
updated_data/Training_Summ\1464.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\1465.txt
updated_data/Training_Summ\1465.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\1466.txt
updated_data/Training_Summ\1466.txt
(94,) (94,) (94,)
(94, 5) (94, 3) (94, 5)
updated_data/Training_Data\1467.txt
updated_data/Training_Summ\1467.txt
(23,) (23,) (23,)
(23, 5) (23, 3) (23, 5)
updated_data/Training_Data\1468.txt
updated_data/Training_Summ\1

(19, 5) (19, 3) (19, 5)
updated_data/Training_Data\1526.txt
updated_data/Training_Summ\1526.txt
(28,) (28,) (28,)
(28, 5) (28, 3) (28, 5)
updated_data/Training_Data\1527.txt
updated_data/Training_Summ\1527.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\1528.txt
updated_data/Training_Summ\1528.txt
(30,) (30,) (30,)
(30, 5) (30, 3) (30, 5)
updated_data/Training_Data\1529.txt
updated_data/Training_Summ\1529.txt
(42,) (42,) (42,)
(42, 5) (42, 3) (42, 5)
updated_data/Training_Data\153.txt
updated_data/Training_Summ\153.txt
(21,) (21,) (21,)
(21, 5) (21, 3) (21, 5)
updated_data/Training_Data\1530.txt
updated_data/Training_Summ\1530.txt
(26,) (26,) (26,)
(26, 5) (26, 3) (26, 5)
updated_data/Training_Data\1531.txt
updated_data/Training_Summ\1531.txt
(25,) (25,) (25,)
(25, 5) (25, 3) (25, 5)
updated_data/Training_Data\1532.txt
updated_data/Training_Summ\1532.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\1533.txt
updated_data/Training_Summ\153

(25, 5) (25, 3) (25, 5)
updated_data/Training_Data\1592.txt
updated_data/Training_Summ\1592.txt
(41,) (41,) (41,)
(41, 5) (41, 3) (41, 5)
updated_data/Training_Data\1593.txt
updated_data/Training_Summ\1593.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\1594.txt
updated_data/Training_Summ\1594.txt
(46,) (46,) (46,)
(46, 5) (46, 3) (46, 5)
updated_data/Training_Data\1595.txt
updated_data/Training_Summ\1595.txt
(29,) (29,) (29,)
(29, 5) (29, 3) (29, 5)
updated_data/Training_Data\1596.txt
updated_data/Training_Summ\1596.txt
(32,) (32,) (32,)
(32, 5) (32, 3) (32, 5)
updated_data/Training_Data\1597.txt
updated_data/Training_Summ\1597.txt
(23,) (23,) (23,)
(23, 5) (23, 3) (23, 5)
updated_data/Training_Data\1598.txt
updated_data/Training_Summ\1598.txt
(20,) (20,) (20,)
(20, 5) (20, 3) (20, 5)
updated_data/Training_Data\1599.txt
updated_data/Training_Summ\1599.txt
(40,) (40,) (40,)
(40, 5) (40, 3) (40, 5)
updated_data/Training_Data\16.txt
updated_data/Training_Summ\16.

(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\1658.txt
updated_data/Training_Summ\1658.txt
(21,) (21,) (21,)
(21, 5) (21, 3) (21, 5)
updated_data/Training_Data\1659.txt
updated_data/Training_Summ\1659.txt
(23,) (23,) (23,)
(23, 5) (23, 3) (23, 5)
updated_data/Training_Data\166.txt
updated_data/Training_Summ\166.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\1660.txt
updated_data/Training_Summ\1660.txt
(23,) (23,) (23,)
(23, 5) (23, 3) (23, 5)
updated_data/Training_Data\1661.txt
updated_data/Training_Summ\1661.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\1662.txt
updated_data/Training_Summ\1662.txt
(20,) (20,) (20,)
(20, 5) (20, 3) (20, 5)
updated_data/Training_Data\1663.txt
updated_data/Training_Summ\1663.txt
(30,) (30,) (30,)
(30, 5) (30, 3) (30, 5)
updated_data/Training_Data\1664.txt
updated_data/Training_Summ\1664.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\1665.txt
updated_data/Training_Summ\166

(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\1723.txt
updated_data/Training_Summ\1723.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\1724.txt
updated_data/Training_Summ\1724.txt
(36,) (36,) (36,)
(36, 5) (36, 3) (36, 5)
updated_data/Training_Data\1725.txt
updated_data/Training_Summ\1725.txt
(31,) (31,) (31,)
(31, 5) (31, 3) (31, 5)
updated_data/Training_Data\1726.txt
updated_data/Training_Summ\1726.txt
(21,) (21,) (21,)
(21, 5) (21, 3) (21, 5)
updated_data/Training_Data\1727.txt
updated_data/Training_Summ\1727.txt
(35,) (35,) (35,)
(35, 5) (35, 3) (35, 5)
updated_data/Training_Data\1728.txt
updated_data/Training_Summ\1728.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\1729.txt
updated_data/Training_Summ\1729.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\173.txt
updated_data/Training_Summ\173.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\1730.txt
updated_data/Training_Summ\173

(20, 5) (20, 3) (20, 5)
updated_data/Training_Data\19.txt
updated_data/Training_Summ\19.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\190.txt
updated_data/Training_Summ\190.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Training_Data\191.txt
updated_data/Training_Summ\191.txt
(20,) (20,) (20,)
(20, 5) (20, 3) (20, 5)
updated_data/Training_Data\192.txt
updated_data/Training_Summ\192.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\193.txt
updated_data/Training_Summ\193.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\194.txt
updated_data/Training_Summ\194.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Training_Data\195.txt
updated_data/Training_Summ\195.txt
(21,) (21,) (21,)
(21, 5) (21, 3) (21, 5)
updated_data/Training_Data\196.txt
updated_data/Training_Summ\196.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\197.txt
updated_data/Training_Summ\197.txt
(13,) (13,) 

(9, 5) (9, 3) (9, 5)
updated_data/Training_Data\256.txt
updated_data/Training_Summ\256.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\257.txt
updated_data/Training_Summ\257.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Training_Data\258.txt
updated_data/Training_Summ\258.txt
(20,) (20,) (20,)
(20, 5) (20, 3) (20, 5)
updated_data/Training_Data\259.txt
updated_data/Training_Summ\259.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\26.txt
updated_data/Training_Summ\26.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\260.txt
updated_data/Training_Summ\260.txt
(26,) (26,) (26,)
(26, 5) (26, 3) (26, 5)
updated_data/Training_Data\261.txt
updated_data/Training_Summ\261.txt
(33,) (33,) (33,)
(33, 5) (33, 3) (33, 5)
updated_data/Training_Data\262.txt
updated_data/Training_Summ\262.txt
(33,) (33,) (33,)
(33, 5) (33, 3) (33, 5)
updated_data/Training_Data\263.txt
updated_data/Training_Summ\263.txt
(16,) (16,) (16

(8, 5) (8, 3) (8, 5)
updated_data/Training_Data\322.txt
updated_data/Training_Summ\322.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Training_Data\323.txt
updated_data/Training_Summ\323.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Training_Data\324.txt
updated_data/Training_Summ\324.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\325.txt
updated_data/Training_Summ\325.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\326.txt
updated_data/Training_Summ\326.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Training_Data\327.txt
updated_data/Training_Summ\327.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\328.txt
updated_data/Training_Summ\328.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\329.txt
updated_data/Training_Summ\329.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\33.txt
updated_data/Training_Summ\33.txt
(10,) (10,) (10,)
(10

(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\39.txt
updated_data/Training_Summ\39.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\390.txt
updated_data/Training_Summ\390.txt
(9,) (9,) (9,)
(9, 5) (9, 3) (9, 5)
updated_data/Training_Data\391.txt
updated_data/Training_Summ\391.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Training_Data\392.txt
updated_data/Training_Summ\392.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\393.txt
updated_data/Training_Summ\393.txt
(17,) (17,) (17,)
(17, 5) (17, 3) (17, 5)
updated_data/Training_Data\394.txt
updated_data/Training_Summ\394.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Training_Data\395.txt
updated_data/Training_Summ\395.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\396.txt
updated_data/Training_Summ\396.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Training_Data\397.txt
updated_data/Training_Summ\397.txt
(17,) (17,) (17,)
(17, 5) (17,

(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\456.txt
updated_data/Training_Summ\456.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\457.txt
updated_data/Training_Summ\457.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\458.txt
updated_data/Training_Summ\458.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\459.txt
updated_data/Training_Summ\459.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Training_Data\46.txt
updated_data/Training_Summ\46.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\460.txt
updated_data/Training_Summ\460.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\461.txt
updated_data/Training_Summ\461.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\462.txt
updated_data/Training_Summ\462.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\463.txt
updated_data/Training_Summ\463.txt
(15,) (15,) 

(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\522.txt
updated_data/Training_Summ\522.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\523.txt
updated_data/Training_Summ\523.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\524.txt
updated_data/Training_Summ\524.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Training_Data\525.txt
updated_data/Training_Summ\525.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\526.txt
updated_data/Training_Summ\526.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\527.txt
updated_data/Training_Summ\527.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\528.txt
updated_data/Training_Summ\528.txt
(44,) (44,) (44,)
(44, 5) (44, 3) (44, 5)
updated_data/Training_Data\529.txt
updated_data/Training_Summ\529.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\53.txt
updated_data/Training_Summ\53.txt
(9,) (9,) (9

(35, 5) (35, 3) (35, 5)
updated_data/Training_Data\59.txt
updated_data/Training_Summ\59.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\590.txt
updated_data/Training_Summ\590.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\591.txt
updated_data/Training_Summ\591.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\592.txt
updated_data/Training_Summ\592.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Training_Data\593.txt
updated_data/Training_Summ\593.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\594.txt
updated_data/Training_Summ\594.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Training_Data\595.txt
updated_data/Training_Summ\595.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\596.txt
updated_data/Training_Summ\596.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\597.txt
updated_data/Training_Summ\597.txt
(9,) (9,) (9,)
(9,

(23, 5) (23, 3) (23, 5)
updated_data/Training_Data\657.txt
updated_data/Training_Summ\657.txt
(7,) (7,) (7,)
(7, 5) (7, 3) (7, 5)
updated_data/Training_Data\658.txt
updated_data/Training_Summ\658.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\659.txt
updated_data/Training_Summ\659.txt
(20,) (20,) (20,)
(20, 5) (20, 3) (20, 5)
updated_data/Training_Data\66.txt
updated_data/Training_Summ\66.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\660.txt
updated_data/Training_Summ\660.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\661.txt
updated_data/Training_Summ\661.txt
(238,) (238,) (238,)
(238, 5) (238, 3) (238, 5)
updated_data/Training_Data\662.txt
updated_data/Training_Summ\662.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\663.txt
updated_data/Training_Summ\663.txt
(27,) (27,) (27,)
(27, 5) (27, 3) (27, 5)
updated_data/Training_Data\665.txt
updated_data/Training_Summ\665.txt
(18,) (18,) 

(21, 5) (21, 3) (21, 5)
updated_data/Training_Data\724.txt
updated_data/Training_Summ\724.txt
(23,) (23,) (23,)
(23, 5) (23, 3) (23, 5)
updated_data/Training_Data\725.txt
updated_data/Training_Summ\725.txt
(31,) (31,) (31,)
(31, 5) (31, 3) (31, 5)
updated_data/Training_Data\726.txt
updated_data/Training_Summ\726.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Training_Data\727.txt
updated_data/Training_Summ\727.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\728.txt
updated_data/Training_Summ\728.txt
(24,) (24,) (24,)
(24, 5) (24, 3) (24, 5)
updated_data/Training_Data\729.txt
updated_data/Training_Summ\729.txt
(25,) (25,) (25,)
(25, 5) (25, 3) (25, 5)
updated_data/Training_Data\73.txt
updated_data/Training_Summ\73.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\730.txt
updated_data/Training_Summ\730.txt
(23,) (23,) (23,)
(23, 5) (23, 3) (23, 5)
updated_data/Training_Data\731.txt
updated_data/Training_Summ\731.txt
(8,) (8,) (8

(11, 5) (11, 3) (11, 5)
updated_data/Training_Data\791.txt
updated_data/Training_Summ\791.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\792.txt
updated_data/Training_Summ\792.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\793.txt
updated_data/Training_Summ\793.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Training_Data\794.txt
updated_data/Training_Summ\794.txt
(28,) (28,) (28,)
(28, 5) (28, 3) (28, 5)
updated_data/Training_Data\795.txt
updated_data/Training_Summ\795.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Training_Data\796.txt
updated_data/Training_Summ\796.txt
(27,) (27,) (27,)
(27, 5) (27, 3) (27, 5)
updated_data/Training_Data\797.txt
updated_data/Training_Summ\797.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\798.txt
updated_data/Training_Summ\798.txt
(26,) (26,) (26,)
(26, 5) (26, 3) (26, 5)
updated_data/Training_Data\799.txt
updated_data/Training_Summ\799.txt
(15,) (15,

(27, 5) (27, 3) (27, 5)
updated_data/Training_Data\858.txt
updated_data/Training_Summ\858.txt
(24,) (24,) (24,)
(24, 5) (24, 3) (24, 5)
updated_data/Training_Data\859.txt
updated_data/Training_Summ\859.txt
(17,) (17,) (17,)
(17, 5) (17, 3) (17, 5)
updated_data/Training_Data\86.txt
updated_data/Training_Summ\86.txt
(9,) (9,) (9,)
(9, 5) (9, 3) (9, 5)
updated_data/Training_Data\860.txt
updated_data/Training_Summ\860.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\861.txt
updated_data/Training_Summ\861.txt
(9,) (9,) (9,)
(9, 5) (9, 3) (9, 5)
updated_data/Training_Data\862.txt
updated_data/Training_Summ\862.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\863.txt
updated_data/Training_Summ\863.txt
(23,) (23,) (23,)
(23, 5) (23, 3) (23, 5)
updated_data/Training_Data\864.txt
updated_data/Training_Summ\864.txt
(24,) (24,) (24,)
(24, 5) (24, 3) (24, 5)
updated_data/Training_Data\865.txt
updated_data/Training_Summ\865.txt
(17,) (17,) (17,)
(17, 5

(13, 5) (13, 3) (13, 5)
updated_data/Training_Data\924.txt
updated_data/Training_Summ\924.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\925.txt
updated_data/Training_Summ\925.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Training_Data\926.txt
updated_data/Training_Summ\926.txt
(26,) (26,) (26,)
(26, 5) (26, 3) (26, 5)
updated_data/Training_Data\927.txt
updated_data/Training_Summ\927.txt
(24,) (24,) (24,)
(24, 5) (24, 3) (24, 5)
updated_data/Training_Data\928.txt
updated_data/Training_Summ\928.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Training_Data\929.txt
updated_data/Training_Summ\929.txt
(24,) (24,) (24,)
(24, 5) (24, 3) (24, 5)
updated_data/Training_Data\93.txt
updated_data/Training_Summ\93.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\930.txt
updated_data/Training_Summ\930.txt
(20,) (20,) (20,)
(20, 5) (20, 3) (20, 5)
updated_data/Training_Data\931.txt
updated_data/Training_Summ\931.txt
(29,) (29,) (29,)


(18, 5) (18, 3) (18, 5)
updated_data/Training_Data\991.txt
updated_data/Training_Summ\991.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Training_Data\992.txt
updated_data/Training_Summ\992.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Training_Data\993.txt
updated_data/Training_Summ\993.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Training_Data\994.txt
updated_data/Training_Summ\994.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Training_Data\995.txt
updated_data/Training_Summ\995.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Training_Data\996.txt
updated_data/Training_Summ\996.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Training_Data\997.txt
updated_data/Training_Summ\997.txt
(21,) (21,) (21,)
(21, 5) (21, 3) (21, 5)
updated_data/Training_Data\998.txt
updated_data/Training_Summ\998.txt
(25,) (25,) (25,)
(25, 5) (25, 3) (25, 5)
updated_data/Training_Data\999.txt
updated_data/Training_Summ\999.txt
(20,) (20,

In [26]:
# Show the number of training labels, then the shape of the training feature matrix.
print(len(labels), feature_vectors.shape, sep="\n")

32615
(32615, 13)


In [61]:
# Save the training dataset to Excel files: features -> X_train.xlsx, labels -> y_train.xlsx.
import pandas as pd

## Wrap the feature array and the label list in DataFrames so they can be exported.
df = pd.DataFrame (feature_vectors)
df2 = pd.DataFrame (labels)

## Save to .xlsx files (paths are relative to the notebook's working directory).

filepath = 'X_train.xlsx'
filepath2= 'y_train.xlsx'

# index=False leaves the DataFrame row index out of the sheets. No column names
# are supplied, so the frames keep the default column labels pandas assigned.
df.to_excel(filepath, index=False)
df2.to_excel(filepath2, index=False) 
# Alternative kept for reference: export with descriptive column names
# (13 feature names for the feature sheet, 'Label' for the label sheet).
# df.to_excel(filepath, header=['Para_First','Position','Length','Quote','Doc_First','FirstRel_Doc','FirstRel_Para','PageRankRel','CentroidVar_Uni','CentroidVar_Bi','FreqWord_Uni','FreqWord_Bi','SigTerm_Uni'])
# df2.to_excel(filepath2, header=['Label'])

In [62]:
def get_feature_vector_with_labels_for_testing(data_glob="updated_data/Test_Data/*.txt",
                                               summ_glob="updated_data/Test_Summ/*.txt"):
    """Build the stacked feature matrix and the label list for the test documents.

    Each document matched by ``data_glob`` is paired with the summary file of
    the same file name matched by ``summ_glob``. Pairing by name (instead of by
    position in two independently sorted lists) guarantees that a missing or
    extra file can never shift the pairing and silently mislabel every
    following document.

    Parameters
    ----------
    data_glob : str, optional
        Glob pattern selecting the document files. Defaults to the test set
        location used by this notebook.
    summ_glob : str, optional
        Glob pattern selecting the summary files. Defaults to the test set
        location used by this notebook.

    Returns
    -------
    feature_vectors : numpy.ndarray
        The per-document results of ``get_feature_vector`` stacked row-wise,
        in sorted document-path order. Has shape ``(0, 13)`` when no document
        matches ``data_glob``.
    labels : list
        The per-document results of ``get_labels`` concatenated in the same
        document order.

    Raises
    ------
    ValueError
        If at least one document has no summary file with the same name.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    data_paths = sorted(glob.glob(data_glob))
    summ_by_name = {os.path.basename(path): path for path in glob.glob(summ_glob)}

    # Validate the pairing up front so no expensive feature extraction runs
    # before an incomplete dataset is reported.
    missing = [path for path in data_paths if os.path.basename(path) not in summ_by_name]
    if missing:
        raise ValueError(
            "No summary file matching %r found for document(s): %s"
            % (summ_glob, ", ".join(missing))
        )

    doc_feature_vectors = []
    labels = []
    for data_path in data_paths:
        summ_path = summ_by_name[os.path.basename(data_path)]
        print(data_path)
        print(summ_path)
        doc_data = get_document_without_heading(data_path)
        summ_data = get_document_without_heading(summ_path, is_summ = True)
        labels.extend(get_labels(doc_data, summ_data))
        doc_feature_vectors.append(get_feature_vector(data_path))

    # Stack once at the end instead of re-copying the growing matrix on every
    # iteration. The empty (0, 13) seed keeps the result a float array with 13
    # columns even when there are no documents, and makes vstack reject any
    # per-document block whose width is not 13.
    feature_vectors = np.vstack([np.zeros((0, 13))] + doc_feature_vectors)
    return feature_vectors, labels

In [63]:
# Build the test set: `f` receives the stacked feature matrix and `l` the label list
# returned by get_feature_vector_with_labels_for_testing().
f,l = get_feature_vector_with_labels_for_testing()

updated_data/Test_Data\1.txt
updated_data/Test_Summ\1.txt
(31,) (31,) (31,)


  # This is added back by InteractiveShellApp.init_path()
  if np.issubdtype(vec.dtype, np.int):


(31, 5) (31, 3) (31, 5)
updated_data/Test_Data\10.txt
updated_data/Test_Summ\10.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Test_Data\100.txt
updated_data/Test_Summ\100.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Test_Data\101.txt
updated_data/Test_Summ\101.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Test_Data\102.txt
updated_data/Test_Summ\102.txt
(7,) (7,) (7,)
(7, 5) (7, 3) (7, 5)
updated_data/Test_Data\103.txt
updated_data/Test_Summ\103.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Test_Data\104.txt
updated_data/Test_Summ\104.txt
(9,) (9,) (9,)
(9, 5) (9, 3) (9, 5)
updated_data/Test_Data\105.txt
updated_data/Test_Summ\105.txt
(24,) (24,) (24,)
(24, 5) (24, 3) (24, 5)
updated_data/Test_Data\106.txt
updated_data/Test_Summ\106.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Test_Data\107.txt
updated_data/Test_Summ\107.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Test_Data\108.txt
updated_data/Test_Summ\108.txt
(52,

(11, 5) (11, 3) (11, 5)
updated_data/Test_Data\172.txt
updated_data/Test_Summ\172.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Test_Data\173.txt
updated_data/Test_Summ\173.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Test_Data\174.txt
updated_data/Test_Summ\174.txt
(9,) (9,) (9,)
(9, 5) (9, 3) (9, 5)
updated_data/Test_Data\175.txt
updated_data/Test_Summ\175.txt
(13,) (13,) (13,)
(13, 5) (13, 3) (13, 5)
updated_data/Test_Data\176.txt
updated_data/Test_Summ\176.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Test_Data\177.txt
updated_data/Test_Summ\177.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Test_Data\178.txt
updated_data/Test_Summ\178.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Test_Data\179.txt
updated_data/Test_Summ\179.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Test_Data\18.txt
updated_data/Test_Summ\18.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Test_Data\180.txt
updated_data/Test

(33, 5) (33, 3) (33, 5)
updated_data/Test_Data\244.txt
updated_data/Test_Summ\244.txt
(4,) (4,) (4,)
(4, 5) (4, 3) (4, 5)
updated_data/Test_Data\245.txt
updated_data/Test_Summ\245.txt
(7,) (7,) (7,)
(7, 5) (7, 3) (7, 5)
updated_data/Test_Data\246.txt
updated_data/Test_Summ\246.txt
(27,) (27,) (27,)
(27, 5) (27, 3) (27, 5)
updated_data/Test_Data\247.txt
updated_data/Test_Summ\247.txt
(33,) (33,) (33,)
(33, 5) (33, 3) (33, 5)
updated_data/Test_Data\248.txt
updated_data/Test_Summ\248.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Test_Data\249.txt
updated_data/Test_Summ\249.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Test_Data\25.txt
updated_data/Test_Summ\25.txt
(10,) (10,) (10,)
(10, 5) (10, 3) (10, 5)
updated_data/Test_Data\250.txt
updated_data/Test_Summ\250.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Test_Data\251.txt
updated_data/Test_Summ\251.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Test_Data\252.txt
updated_data/Test_Summ\

(9, 5) (9, 3) (9, 5)
updated_data/Test_Data\316.txt
updated_data/Test_Summ\316.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Test_Data\317.txt
updated_data/Test_Summ\317.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Test_Data\318.txt
updated_data/Test_Summ\318.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Test_Data\319.txt
updated_data/Test_Summ\319.txt
(8,) (8,) (8,)
(8, 5) (8, 3) (8, 5)
updated_data/Test_Data\32.txt
updated_data/Test_Summ\32.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Test_Data\320.txt
updated_data/Test_Summ\320.txt
(22,) (22,) (22,)
(22, 5) (22, 3) (22, 5)
updated_data/Test_Data\321.txt
updated_data/Test_Summ\321.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Test_Data\322.txt
updated_data/Test_Summ\322.txt
(24,) (24,) (24,)
(24, 5) (24, 3) (24, 5)
updated_data/Test_Data\323.txt
updated_data/Test_Summ\323.txt
(6,) (6,) (6,)
(6, 5) (6, 3) (6, 5)
updated_data/Test_Data\324.txt
updated_data/Test_Summ\324.txt
(

(17, 5) (17, 3) (17, 5)
updated_data/Test_Data\389.txt
updated_data/Test_Summ\389.txt
(38,) (38,) (38,)
(38, 5) (38, 3) (38, 5)
updated_data/Test_Data\39.txt
updated_data/Test_Summ\39.txt
(12,) (12,) (12,)
(12, 5) (12, 3) (12, 5)
updated_data/Test_Data\390.txt
updated_data/Test_Summ\390.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Test_Data\391.txt
updated_data/Test_Summ\391.txt
(24,) (24,) (24,)
(24, 5) (24, 3) (24, 5)
updated_data/Test_Data\392.txt
updated_data/Test_Summ\392.txt
(19,) (19,) (19,)
(19, 5) (19, 3) (19, 5)
updated_data/Test_Data\393.txt
updated_data/Test_Summ\393.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Test_Data\394.txt
updated_data/Test_Summ\394.txt
(25,) (25,) (25,)
(25, 5) (25, 3) (25, 5)
updated_data/Test_Data\395.txt
updated_data/Test_Summ\395.txt
(45,) (45,) (45,)
(45, 5) (45, 3) (45, 5)
updated_data/Test_Data\396.txt
updated_data/Test_Summ\396.txt
(18,) (18,) (18,)
(18, 5) (18, 3) (18, 5)
updated_data/Test_Data\397.txt
updated_dat

(12, 5) (12, 3) (12, 5)
updated_data/Test_Data\57.txt
updated_data/Test_Summ\57.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Test_Data\58.txt
updated_data/Test_Summ\58.txt
(14,) (14,) (14,)
(14, 5) (14, 3) (14, 5)
updated_data/Test_Data\59.txt
updated_data/Test_Summ\59.txt
(11,) (11,) (11,)
(11, 5) (11, 3) (11, 5)
updated_data/Test_Data\6.txt
updated_data/Test_Summ\6.txt
(17,) (17,) (17,)
(17, 5) (17, 3) (17, 5)
updated_data/Test_Data\60.txt
updated_data/Test_Summ\60.txt
(16,) (16,) (16,)
(16, 5) (16, 3) (16, 5)
updated_data/Test_Data\61.txt
updated_data/Test_Summ\61.txt
(15,) (15,) (15,)
(15, 5) (15, 3) (15, 5)
updated_data/Test_Data\62.txt
updated_data/Test_Summ\62.txt
(7,) (7,) (7,)
(7, 5) (7, 3) (7, 5)
updated_data/Test_Data\63.txt
updated_data/Test_Summ\63.txt
(25,) (25,) (25,)
(25, 5) (25, 3) (25, 5)
updated_data/Test_Data\64.txt
updated_data/Test_Summ\64.txt
(23,) (23,) (23,)
(23, 5) (23, 3) (23, 5)
updated_data/Test_Data\65.txt
updated_data/Test_Summ\65.txt
(7,) (

In [64]:
# Show the number of test labels, then the shape of the test feature matrix.
print(len(l), f.shape, sep="\n")

8635
(8635, 13)


In [68]:
# Save the test dataset to Excel files: features -> X_test.xlsx, labels -> y_test.xlsx.
import pandas as pd

## Wrap the label list and the feature array in DataFrames so they can be exported.
## NOTE: the numbering is crossed here - df3 holds the labels (`l`), df4 the features (`f`).
df3 = pd.DataFrame (l)
df4 = pd.DataFrame (f)

## Save to .xlsx files (paths are relative to the notebook's working directory).

filepath3 = 'X_test.xlsx'
filepath4= 'y_test.xlsx'

# Labels (df3) go to y_test.xlsx (filepath4); features (df4) go to X_test.xlsx (filepath3).
# index=False leaves the DataFrame row index out of the sheets.
df3.to_excel(filepath4,index=False)
df4.to_excel(filepath3,index=False)
             
# Alternative kept for reference: export with descriptive column names
# ('Label' for the label sheet, 13 feature names for the feature sheet).
# df3.to_excel(filepath4,header=['Label'] )
# df4.to_excel(filepath3, header=['Para_First','Position','Length','Quote','Doc_First','FirstRel_Doc','FirstRel_Para','PageRankRel','CentroidVar_Uni','CentroidVar_Bi','FreqWord_Uni','FreqWord_Bi','SigTerm_Uni'])

In [None]:
# df.to_excel(filepath, index=False)