<a href="https://colab.research.google.com/github/BossBosssmile/-Extractive-Summarizer-/blob/main/IS_Nuttawat_6310412004.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Extractive Article Summarizer** [Using Integrated Text Rank and BM25]


## Install and Importing Necessary Packages

In [1]:
pip install rouge-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [2]:
pip install gensim==3.8.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gensim==3.8.3
  Downloading gensim-3.8.3-cp37-cp37m-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 1.8 MB/s 
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim-3.6.0:
      Successfully uninstalled gensim-3.6.0
Successfully installed gensim-3.8.3


In [3]:
# To collect text from a file
from os.path import abspath

# Basic libraries
import numpy as np
import pandas as pd 
import re
from math import log

# Page Rank
import networkx as nx

# nltk
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from gensim import corpora
from gensim.summarization import bm25

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

## Import data

In [4]:
#import pandas as pd
df = pd.read_csv('/content/dataset.csv')
df

Unnamed: 0,Seed_Name,Seed_info
0,Power Plant,Why should I buy Power Plant feminised seeds?\...
1,Amsterdam Amnesia,Why should I buy Amsterdam Amnesia feminised s...
2,Frisian Dew,Why I should buy Frisian Dew seeds ?\nA strong...
3,Durban Poison,Why should I get Durban Poison feminised seeds...
4,Passion #1,Why should I buy Passion #1 feminised seeds?\n...
5,Hollands Hope,Why should I buy Hollands Hope feminised seeds...
6,Durban Dew,Why should I buy Durban Dew feminised cannabis...
7,Skywalker Haze,Why should I buy Skywalker Haze feminised seed...
8,Sugar Bomb Punch,Why should I buy Sugar Bomb Punch feminized se...
9,C-Vibez,Why should I buy C-Vibez feminised seeds?\nShe...


## POS Tagging Function

In [5]:
def get_wordnet_pos(word):
    
    # Map POS tag to first character lemmatize() accepts
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

## Similarity Matrix Functions

In [6]:
def similarity_original(text_data):
    
    sim = np.zeros([len(text_data), len(text_data)]) # Initialization
    for i, sentence_1 in enumerate(text_data):
        for j, sentence_2 in enumerate(text_data):
            sent_1 = set(sentence_1) # Unique words
            sent_2 = set(sentence_2)

            if(i == j):
                sim[i][j] = 0
            else:
                common = float(len(list(sent_1 & sent_2)))
                if(len(sentence_1) and len(sentence_2) > 1):
                    denominator = float(log(len(sentence_1)) + log(len(sentence_2)))
                else:
                    denominator = 1.0
                
                sim[i][j] = common / denominator
    return sim

In [7]:
def similarity_BM(text_data):
    
    dictionary = corpora.Dictionary(text_data) # BAG_OF_WORDS MODEL
    corpus = [dictionary.doc2bow(text) for text in text_data]
    bm25_obj = bm25.BM25(corpus) #object
    
    similarity = []

    for i, sentence in enumerate(text_data):
        query = dictionary.doc2bow(sentence)
        score = bm25_obj.get_scores(query)
        similarity.append(score)
    
    sim = np.array(similarity)    
    return sim

## Main Summarization 

In [8]:

article=df['Seed_info'][0]

In [11]:
 # Tokenization
sentences = sent_tokenize(article) 
sentences_clean = [re.sub(r'[^\w\s]','',sentence.lower()) for sentence in sentences]
print(sentences_clean)
# Stop words removal
stop_words = stopwords.words('english')
sentence_tokens = [[words for words in sentence.split(' ') if words not in stop_words] for sentence in sentences_clean]
print(stop_words)



['why should i buy power plant feminised seeds', 'power plant is an easy plant to grow', 'ideal for beginners but also interesting for experts looking to beat their current record yields', 'this variety has been developed to provide growers with the greatest possible yield', 'after more than 2 decades this sativa dominant hybrid has gained an indisputable reputation for its xxl yields and relatively short flowering time', 'power plant is a fast flowering sativa with a unique and recognisable aroma a strong taste and a very energetic and creative uphigh', 'her buds grow big and long they are full of sticky resin', 'this classic is suitable for any type of grower growing method grow medium and growing conditions', 'it is a true allrounder with high potency and an exceptional yield', 'power plant is a multiple cannabis cup winner\npower plant is a classic cannabis variety from the 90s that has won several more cups in canada in recent years\n\n1st prize sativa lift expo canada vancouver 2

In [14]:
# POS Tagging and Lemmatization
text_data = []
lemmatizer = WordNetLemmatizer() #object
count = 0
for sentence in sentence_tokens:
        sample_list = [word for word in sentence if word]
        #print('1','\n',sample_list)
        tags = pos_tag(sample_list)
        #print('2','\n',tags)
        line = []
        for word, tag in tags: 
            count += 1
            line.append(lemmatizer.lemmatize(word, pos = get_wordnet_pos(tag)))
        text_data.append(line) 

text_data

[['buy', 'power', 'plant', 'feminised', 'seed'],
 ['power', 'plant', 'easy', 'plant', 'grow'],
 ['ideal',
  'beginner',
  'also',
  'interesting',
  'expert',
  'looking',
  'beat',
  'current',
  'record',
  'yield'],
 ['variety',
  'developed',
  'provide',
  'grower',
  'greatest',
  'possible',
  'yield'],
 ['2',
  'decade',
  'sativa',
  'dominant',
  'hybrid',
  'gained',
  'indisputable',
  'reputation',
  'xxl',
  'yield',
  'relatively',
  'short',
  'flowering',
  'time'],
 ['power',
  'plant',
  'fast',
  'flowering',
  'sativa',
  'unique',
  'recognisable',
  'aroma',
  'strong',
  'taste',
  'energetic',
  'creative',
  'uphigh'],
 ['bud', 'grow', 'big', 'long', 'full', 'sticky', 'resin'],
 ['classic',
  'suitable',
  'type',
  'grower',
  'growing',
  'method',
  'grow',
  'medium',
  'growing',
  'condition'],
 ['true', 'allrounder', 'high', 'potency', 'exceptional', 'yield'],
 ['power',
  'plant',
  'multiple',
  'cannabis',
  'cup',
  'winner\npower',
  'plant',
  'cl

In [20]:
# Similarity Matrix
sim_a = similarity_original(text_data)
sim_d = similarity_BM(text_data)
print('similarity original')
print(sim_a,'\n')
print('similarity BM25')
print('\n',sim_d)


similarity original
[[0.         0.62133493 0.         ... 0.         0.         0.        ]
 [0.62133493 0.         0.         ... 0.         0.25562222 0.        ]
 [0.         0.         0.         ... 0.         0.         0.21714724]
 ...
 [0.         0.         0.         ... 0.         0.19703759 0.19703759]
 [0.         0.25562222 0.         ... 0.19703759 0.         0.        ]
 [0.         0.         0.21714724 ... 0.19703759 0.         0.        ]] 

similarity BM25

 [[16.30920762  1.26120724  0.         ...  0.          0.
   0.        ]
 [ 1.18442776 11.48357745  0.         ...  0.          1.92556205
   0.        ]
 [ 0.          0.         30.86689682 ...  0.          0.
   1.33721322]
 ...
 [ 0.          0.          0.         ... 39.09867161  1.82459677
   2.15547286]
 [ 0.          2.67449924  0.         ...  1.42543399 29.21649776
   0.        ]
 [ 0.          0.          1.33721322 ...  1.6839251   0.
  27.6949704 ]]


In [21]:
# Normalization
sim_a = sim_a / sim_a.max()    
sim_d = sim_d / sim_d.max()  
print(sim_a)

[[0.         0.55322982 0.         ... 0.         0.         0.        ]
 [0.55322982 0.         0.         ... 0.         0.22760322 0.        ]
 [0.         0.         0.         ... 0.         0.         0.19334552]
 ...
 [0.         0.         0.         ... 0.         0.17544011 0.17544011]
 [0.         0.22760322 0.         ... 0.17544011 0.         0.        ]
 [0.         0.         0.19334552 ... 0.17544011 0.         0.        ]]


In [22]:
# Combination
similarity_matrix = (sim_a + sim_d)
print(similarity_matrix)

[[0.30085244 0.57649504 0.         ... 0.         0.         0.        ]
 [0.5750787  0.21183508 0.         ... 0.         0.26312365 0.        ]
 [0.         0.         0.569395   ... 0.         0.         0.21801281]
 ...
 [0.         0.         0.         ... 0.72124478 0.20909805 0.21520165]
 [0.         0.27693913 0.         ... 0.20173478 0.53895045 0.        ]
 [0.         0.         0.21801281 ... 0.20650311 0.         0.51088316]]


In [27]:
# Page Rank
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph, max_iter = 700)
print(nx_graph)
print(scores)


Graph with 86 nodes and 1294 edges
{0: 0.01496633224252225, 1: 0.01685209542689019, 2: 0.011047918421937641, 3: 0.01168240652024823, 4: 0.014692795899907076, 5: 0.022215546487032584, 6: 0.00829784953159533, 7: 0.011024253884868516, 8: 0.010284937409649244, 9: 0.01726424017807005, 10: 0.01431776343126126, 11: 0.005498377277834527, 12: 0.0064998389124941635, 13: 0.006491281328320687, 14: 0.004098104172462027, 15: 0.008945890683097871, 16: 0.012682934306948643, 17: 0.009299723112181954, 18: 0.0062475458258434684, 19: 0.02004166604050687, 20: 0.01494214426458609, 21: 0.015576488238308985, 22: 0.006311787616781899, 23: 0.008815692277823826, 24: 0.014639650887478933, 25: 0.009490979518202063, 26: 0.02581355290340731, 27: 0.007346286050965655, 28: 0.00973361789655997, 29: 0.019542578243924236, 30: 0.016125106820311923, 31: 0.008087520268780683, 32: 0.008154848979154045, 33: 0.008832336641382957, 34: 0.007496325103955549, 35: 0.010735762163300374, 36: 0.009955610722489086, 37: 0.01082775874049

## Test summarization 

In [25]:
# Best sentences, (textrank)
ratio = 0.2  
top_sentence = {sentence:scores[index] for index,sentence in enumerate(sentences)}
number = int(len(sentence_tokens)*(ratio))
top = dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:number])


text_list = []

for sent in sentences:
    if sent in top.keys():
        text_list.append(sent)

summary = "\n".join(text_list)

print(summary)


Power Plant is an easy plant to grow.
Power Plant is a fast flowering Sativa with a unique and recognisable aroma, a strong taste and a very energetic and creative up-high.
Power Plant is a multiple cannabis cup winner
Power Plant is a classic cannabis variety from the 90s that has won several more cups in Canada in recent years:

1st Prize Sativa, Lift Expo Canada Vancouver 2016
3rd Prize Hash, Lift Expo Canada Vancouver 2016
2nd Prize Sativa, Prairie Medicinal Harvest Cup Canada 2016
Power Plant has a woody, spicy, earthy and floral aroma.
It is also an easy plant to clone and thanks to these excellent properties Power Plant, together with White Widow and Amnesia Haze, is one of the most cultivated Dutch strains of all time.
Power Plant is an easy to grow cannabis strain with vigorous growth and exceptional yield
Power Plant is a fast flowering Sativa like no other.
Even in the hands of a beginner, Power Plant will be able to produce a large yield.
In the hands of an expert, very exc

#### Evaluation - ROUGE score 
Ref - https://stats.stackexchange.com/questions/301626/interpreting-rouge-scores

In [26]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
scores = scorer.score(df['Seed_info'][0],summary)
scores

{'rouge1': Score(precision=1.0, recall=0.28050397877984085, fmeasure=0.4381149663386846),
 'rougeL': Score(precision=1.0, recall=0.28050397877984085, fmeasure=0.4381149663386846)}

How to interpret the results in short and approximately:

ROUGE-n recall=0.28050397877984085 means that 28% of the n-grams in the reference summary are also present in the generated summary. ROUGE-n precision=1.0 means that 100% of the n-grams in the generated summary are also present in the reference summary.

# Summarization Function

In [29]:
def summarize(string, ratio = 0.2): 
    
    # Tokenization
    sentences = sent_tokenize(string) 
    sentences_clean = [re.sub(r'[^\w\s]','',sentence.lower()) for sentence in sentences]

    # Stop words removal
    stop_words = stopwords.words('english')
    sentence_tokens = [[words for words in sentence.split(' ') if words not in stop_words] for sentence in sentences_clean]

    # POS Tagging and Lemmatization
    text_data = []
    lemmatizer = WordNetLemmatizer() #object
    count = 0
    
    for sentence in sentence_tokens:
        sample_list = [word for word in sentence if word]
        tags = pos_tag(sample_list)
        line = []
        for word, tag in tags: 
            count += 1
            line.append(lemmatizer.lemmatize(word, pos = get_wordnet_pos(tag)))
        text_data.append(line) 

    # Similarity Matrix
    sim_a = similarity_original(text_data)
    sim_d = similarity_BM(text_data)

    # Normalization
    sim_a = sim_a / sim_a.max()    
    sim_d = sim_d / sim_d.max()    
    
    # Combination
    similarity_matrix = (sim_a + sim_d)
    
    # Page Rank
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph, max_iter = 700)

    # Best sentences, (textrank)
    top_sentence = {sentence:scores[index] for index,sentence in enumerate(sentences)}
    number = int(len(sentence_tokens)*(ratio))
    top = dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:number])
    text_list = []

    for sent in sentences:
        if sent in top.keys():
            text_list.append(sent)
    
    summary = "\n".join(text_list)
    return summary

In [30]:
print('-'*1000)
for i in range(len(df['Seed_Name'])):
  print('Conclusion of seed :',df['Seed_Name'][i])
  print('_'*(len('Conclusion of seed :')+len(df['Seed_Name'][i])+1),'\n')
  print(summarize(df['Seed_info'][i]))
  print('-'*1000,'\n')


----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------