<a href="https://colab.research.google.com/github/ShoaibSheriff/NLP/blob/master/Project/Extractive_Summarization_NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import re
import string
import numpy as np
import matplotlib.pyplot as plt
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus.reader.wordnet import NOUN, VERB, ADJ, ADV
from nltk.corpus import brown, stopwords
from nltk.cluster.util import cosine_distance
from operator import itemgetter
from pprint import pprint
%matplotlib

Using matplotlib backend: agg


In [0]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from pprint import pprint
nlp = en_core_web_sm.load()

In [0]:
import zipfile
from google.colab import drive

drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
import nltk
nltk.download('brown')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [0]:
class TextCleaner():
    
    def __init__(self):
        self.stop_words = set(stopwords.words("english"))
        self.punctuations = set(string.punctuation)
        self.pos_tags = {
                NOUN: ['NN', 'NNS', 'NNP', 'NNPS', 'PRP', 'PRP$', 'WP', 'WP$'],
                VERB: ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
                ADJ: ['JJ', 'JJR', 'JJS'],
                ADV: ['RB', 'RBR', 'RBS', 'WRB']
        }


    def _remove_stop_words(self, words):
        return [w for w in words if w not in self.stop_words]
     
    
    def _remove_regex(self):
        self.input_sent = " ".join([w.lower() for w in self.input_sent])
        self.input_sent = re.sub(r"i'm", "i am", self.input_sent)
        self.input_sent = re.sub(r"he's", "he is", self.input_sent)
        self.input_sent = re.sub(r"she's", "she is", self.input_sent)
        self.input_sent = re.sub(r"that's", "that is", self.input_sent)
        self.input_sent = re.sub(r"what's", "what is", self.input_sent)
        self.input_sent = re.sub(r"where's", "where is", self.input_sent)
        self.input_sent = re.sub(r"\'ll", " will", self.input_sent)
        self.input_sent = re.sub(r"\'ve", " have", self.input_sent)
        self.input_sent = re.sub(r"\'re", " are", self.input_sent)
        self.input_sent = re.sub(r"\'d", " would", self.input_sent)
        self.input_sent = re.sub(r"won't", "will not", self.input_sent)
        self.input_sent = re.sub(r"can't", "cannot", self.input_sent)
        self.input_sent = re.sub(r"don't", "do not", self.input_sent)
        patterns = re.finditer("#[\w]*", self.input_sent)
        for pattern in patterns:
            self.input_sent = re.sub(pattern.group().strip(), "", self.input_sent)
        self.input_sent = "".join(ch for ch in self.input_sent if ch not in self.punctuations)
    
    
    def _tokenize(self):
        return word_tokenize(self.input_sent)
    
    
    def _process_content_for_pos(self, words):
        tagged_words = pos_tag(words)
        pos_words = []
        for word in tagged_words:
            flag = False
            for key, value in self.pos_tags.items():
                if word[1] in value:
                    pos_words.append((word[0], key))
                    flag = True
                    break
            if not flag:
                pos_words.append((word[0], NOUN))
        return pos_words
       
                 
    def _remove_noise(self):
        self._remove_regex()
        words = self._tokenize()
        noise_free_words = self._remove_stop_words(words)
        return noise_free_words
    
    
    def _normalize_text(self, words):
        lem = WordNetLemmatizer()
        pos_words = self._process_content_for_pos(words)
        normalized_words = [lem.lemmatize(w, pos=p) for w, p in pos_words]
        return normalized_words
    
    
    def clean_up(self, input_sent):
        self.input_sent = input_sent
        cleaned_words = self._remove_noise()
        cleaned_words = self._normalize_text(cleaned_words)
        return cleaned_words

In [0]:
def pagerank(M, eps=1.0e-8, d=0.85):
    N = M.shape[1]
    v = np.random.rand(N, 1)
    v = v / np.linalg.norm(v, 1)
    last_v = np.ones((N, 1), dtype=np.float32) * np.inf
    M_hat = (d * M) + (((1 - d) / N) * np.ones((N, N), dtype=np.float32))
    
    while np.linalg.norm(v - last_v, 2) > eps:
        last_v = v
        v = np.matmul(M_hat, v)
    return v

In [0]:
def sentence_similarity(sent1, sent2):
    text_cleaner = TextCleaner()
    
    sent1 = text_cleaner.clean_up(sent1)
    sent2 = text_cleaner.clean_up(sent2)
    
    all_words = list(set(sent1 + sent2))
    
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    
    for w in sent1:
        vector1[all_words.index(w)] += 1
    
    for w in sent2:
        vector2[all_words.index(w)] += 1

    
    return 1 - cosine_distance(vector1, vector2)


In [0]:
def build_similarity_matrix(sentences):
    S = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i == j:
                continue
            else:
                S[i][j] = sentence_similarity(sentences[i], sentences[j])
    
    for i in range(len(S)):
        S[i] /= S[i].sum()
    return S

In [0]:
RATIO = 0.6

def get_summary(sentences):
  SUMMARY_SIZE = int(len(sentences) * RATIO)
  # print(SUMMARY_SIZE)
  S = build_similarity_matrix(sentences)
  sentence_ranks = pagerank(S)
  ranked_sentence_indexes = [(item[0], item[1][0]) for item in sorted(enumerate(sentence_ranks), key=lambda item: -item[1])]
  selected_sentences = sorted(ranked_sentence_indexes[:SUMMARY_SIZE], key = lambda item: item[0])
  summary = [sentences[item[0]] for item in selected_sentences]
  return summary, [item[1] for item in selected_sentences]

In [0]:
# i = 60

In [0]:
# stories_d = '/content/drive/My Drive/CSE 538/Project/af8xtr/'
# !rm -rf stories_d
# !mkdir stories_d

def add_entity_weights(sentence, weight, data) :
  # print(sentence)
  doc = nlp(sentence)
  entities = ([(X, X.ent_iob_, X.ent_type_) for X in doc])
  entity_list = []
  for entity in entities:
    if (len(entity[2]) > 0):
      entity_list.append(entity[0])
  for real_entity in set(entity_list): 
    if real_entity.text in data.keys():
      data[real_entity.text] = data.get(real_entity.text) + weight/len(entity_list)
    else:
      data[real_entity.text] = weight/len(entity_list)
  return data

In [0]:

import glob
from tqdm import tqdm
import os.path
from os import path

stories = glob.glob('/content/drive/My Drive/CSE 538/Project/b4xtr/*')
gdrive_out = '/content/drive/My Drive/CSE 538/Project/af8xtr/'

from google.colab import files

for story_path in tqdm(stories):

  file_name_af8r = gdrive_out+story_path[story_path.index('cnn'):-5]+'.story'
  file_name_af8r_imp = gdrive_out+story_path[story_path.index('cnn'):-5]+'.imp'
  if path.exists(file_name_af8r_imp):
    continue

  with open(story_path, 'r') as f:

    entitiy_to_weights = dict()

    lines = f.read().split('\n')
    # pprint(lines)
    main_lines = []
    known_summary = []
    sum_started = False
    for line in lines:
      if (line.startswith('@high')):
        sum_started = True
      if sum_started:
        known_summary.append(line)
      else:
        if (len(lines) > 0) :
          main_lines.append(line)
    main_lines = [x for x in main_lines if len(x) > 0]
    summary, weights = get_summary(main_lines)

    # pprint(summary)
    # pprint(weights)
    iii = 0
    filtered_lines = []
    for line in main_lines:
      if line in summary:
        if (iii < len(weights)):
          add_entity_weights(line, weights[iii], entitiy_to_weights)
        iii = iii + 1
        filtered_lines.append(line)
        filtered_lines.append("")

    filtered_lines.extend(known_summary)
    # print(entitiy_to_weights)
    
    with open(file_name_af8r_imp, 'w') as f_af84_imp:
      for ent in entitiy_to_weights.keys():
        f_af84_imp.write(ent + " " + str(entitiy_to_weights[ent]) + "\n")

    with open(file_name_af8r, 'w') as f_af84:
      for line in filtered_lines:
        # print(line)
        f_af84.write("%s\n" % line)


  sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v))))
 93%|█████████▎| 6575/7054 [2:58:20<1:23:15, 10.43s/it]

In [13]:
!zip -r '/content/drive/My Drive/CSE 538/Project/af8xtr_imp.zip' '/content/drive/My Drive/CSE 538/Project/af8xtr_imp'

  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/ (stored 0%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3978.imp (deflated 61%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3979.imp (deflated 66%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3980.imp (deflated 60%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3981.imp (deflated 56%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3982.imp (deflated 59%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3983.imp (deflated 60%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3984.imp (deflated 62%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3985.imp (deflated 60%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3986.imp (deflated 59%)
  adding: content/drive/My Drive/CSE 538/Project/af8xtr_imp/cnn_test3987.imp (deflated 44%)
  addin