### IMPORTING LIBRARIES:

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/solera-task/TASK.xlsx
/kaggle/input/glove6b100dtxt/glove.6B.100d.txt


### GLOVE WORD EMBEDDINGS.

In [2]:
word_embeddings = {}
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding = 'utf-8') as file:
    for line in file:
        coefs = np.asarray(line.split()[1:], dtype = 'float32')
        word_embeddings[line.split()[0]] = coefs

In [3]:
len(word_embeddings)

400000

### Given test data.

In [4]:
data = pd.read_excel('/kaggle/input/solera-task/TASK.xlsx')
data.head()

Unnamed: 0,TEST DATASET,Unnamed: 1
0,,Introduction
1,,Acnesol Gel is an antibiotic that fights bacte...
2,,Ambrodil Syrup is used for treating various re...
3,,Augmentin 625 Duo Tablet is a penicillin-type ...
4,,Azithral 500 Tablet is an antibiotic used to t...


In [5]:
data = pd.DataFrame(data.iloc[1:, 1]).reset_index(drop = True)
data.columns = ['TEXT']
data.head()

Unnamed: 0,TEXT
0,Acnesol Gel is an antibiotic that fights bacte...
1,Ambrodil Syrup is used for treating various re...
2,Augmentin 625 Duo Tablet is a penicillin-type ...
3,Azithral 500 Tablet is an antibiotic used to t...
4,Alkasol Oral Solution is a medicine used in th...


In [6]:
# There are total 1000 text docs.

data.shape

(1000, 1)

In [7]:
data.loc[0, 'TEXT']

'Acnesol Gel is an antibiotic that fights bacteria. It is used to treat acne, which appears as spots or pimples on your face, chest or back. This medicine works by attacking the bacteria that cause these pimples.Acnesol Gel is only meant for external use and should be used as advised by your doctor. You should normally wash and dry the affected area before applying a thin layer of the medicine. It should not be applied to broken or damaged skin. Avoid any contact with your eyes, nose, or mouth. Rinse it off with water if you accidentally get it in these areas. It may take several weeks for your symptoms to improve, but you should keep using this medicine regularly. Do not stop using it as soon as your acne starts to get better. Ask your doctor when you should stop treatment.Common side effects like minor itching, burning, or redness of the skin and oily skin may be seen in some people. These are usually temporary and resolve on their own. Consult your doctor if they bother you or do no

# TEXT SUMMARIZATION

<a> Text Summarization problem can be dealt in two ways:</a><br>
<a> 1. Extractive text summarization.</a><br>
<a> 2. Abstractive text summarization.</a>

#### <i>My Approach for solving the problem follows extractive text summarization, It extracts the most important sentences from the given text and stack them to create a summary. Following are the steps that I took to generate meaningful summary from the given text.</i>

1. The first step is to split the given text into individual sentences.
2. After sentence tokenizing, I performed text preprocessing like removing unwanted characters, stopwords and finally        lower casing the words and concatenating the words back to make a sentence.
3. The next step after text preprocessing is to generate word embeddings by making use of Glove(Global vectors for word      representation).
4. In this step, I generated a similarity matrix using cosine similarity.
5. The similarity matrix is then converted into a graph, where each vertex node represents a sentence and edge rpresents a    similarity score.
6. Finally, summary of given number of sentence is created.

In [8]:
from nltk.tokenize import sent_tokenize
import re
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx


class TextSummariser():
    
    doc = '''
              - This class summarises the given text using glove word embeddings
                and page rank algorithm.
              - Following steps are required for usage:-
                   1. user must instantiate this class by passing the text to
                      summarise as an argument.
                   2. using the object of the class user can call the extract_summary
                      method by passing number of sentences as an argument.
          '''
    
    def __init__(self, doc):
        self.doc = doc
        self.sent_tokens = self.__sentence_tokenizer(self.doc)
        self.preprocessed_docs = self.__preprocess_doc(self.sent_tokens)
        self.vects = self.__get_vectors(self.preprocessed_docs)
        self.sim_mat = self.__get_similarity_matrix(self.sent_tokens, self.vects)
        
        # creating graph of similarity matrix, where each node will denote the sentence
        # and each edge will denote the similarity score of the ith node and jth node.
        #
        #           i -------------------------------------> j
        #   sentence1             similarity score           sentence2
        #
        
        nx_graph = nx.from_numpy_array(self.sim_mat)
        self.scores = nx.pagerank(nx_graph)
    
    def __sentence_tokenizer(self, doc):
        
        '''
            - This method breaks the whole text into sentences or, we can say it 
              tokenise the text into sentences.
              
            - r_type: list
        '''
        
        sentences = sent_tokenize(doc)
        return sentences

    def __preprocess_doc(self, sentences):
        
        '''
            - This method preprocessed the input tokenised sentences. Following are
              preprocessing steps:-
                1. Removing everything from the sentence except alphabets.
                2. Removing stopwords(words that do not carry any information 
                   like a, the, It, by, is, was, etc) from the sentences.
                3. Lower casing each word of the sentence.
            
            - r_type: list
        '''
        
        STOPWORDS = set(tuple(stopwords.words('english')))
        new_sens = []
        for sent in sentences:
            doc = re.sub('[^a-zA-Z]', ' ', sent)   # allowing only alphabets.
            doc = ' '.join([word.lower() for word in doc.split() if word not in STOPWORDS]) # removing stopwords and 
                                                                                            # performing lower casing.
            new_sens.append(doc)
        return new_sens

    def __get_vectors(self, sentences):
        
        '''
            - This method converts words into numbers, and by doing so
              one sentence will become one single vector.
            - This method uses glove(Global vector for word representation)
              for creating word embeddings.
              
            - r_type: list
        '''
        
        sentence_vects = []
        for sent in sentences:
            if len(sent) != 0:
                
                # Below code line creates normalised vectors of each size 100.
                
                temp = sum([word_embeddings.get(w, np.zeros((100,))) for w in sent.split()])/(len(sent.split())+0.001)
            else:
                temp = np.zeros((100,))
            sentence_vects.append(temp)
        return sentence_vects

    def __get_similarity_matrix(self, sentences, sentence_vects):
        
        '''
            - This method creates similarity matrix of sentences.
            - sim_mat[i][j] represents similarity score(calculated using cosine similarity) of sentence
              i and j.
            
            - r_type: matrix (numpy ndarray)
        '''
        sim_mat = np.zeros([len(sentences), len(sentences)])
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i != j:
                    sim_mat[i][j] = cosine_similarity(sentence_vects[i].reshape(1, 100), sentence_vects[j].reshape(1, 100))[0, 0]
        return sim_mat
    
    def extract_summary(self, number_of_sentences = 10):
        
        '''
            - This method generates summary of given sentences.
            
            - r_type: string
        '''
        ranked_sentences = sorted(((self.scores[i%14], s) for i, s in enumerate(self.sent_tokens)), reverse = True)[:number_of_sentences]
        summary = ''.join([i[1] for i in ranked_sentences])
        return summary
        
        
        
# DIRECTIONS: 

ts = TextSummariser(data.loc[999, 'TEXT'])
summary = ts.extract_summary(number_of_sentences = 5)
summary

'This helps to reduce the workload of the heart.Angizem CD 120 Capsule ER may be taken with or without food, but it is better to take it regularly at a fixed time each day as advised by your doctor.Your doctor may want to monitor your blood pressure while using it and you may need frequent blood tests.Drinking alcohol should be avoided while taking this medicine as it may worsen the side effects.Before taking this medicine, let your doctor know if you have any liver or kidney problems.Keep using this medicine even if you feel well.Also, inform your doctor if you have very low blood pressure (hypotension), heart failure, or if you recently had a heart attack and have fluid in your lungs.'

### SUMMARY FOR ALL ENTRIES OF TEST DATA:

In [9]:
data['SUMMARY'] = ''

no = 0
for i in range(data.shape[0]):
    try:
        ts = TextSummariser(data.loc[i, 'TEXT'])
        data.loc[i, 'SUMMARY'] = ts.extract_summary(number_of_sentences = 10)
    except:
        no += 1
        pass
print(f'unsuccessfull attempts: {no}/1000')

unsuccessfull attempts: 0/1000


In [10]:
data

Unnamed: 0,TEXT,SUMMARY
0,Acnesol Gel is an antibiotic that fights bacte...,Ask your doctor when you should stop treatment...
1,Ambrodil Syrup is used for treating various re...,Your doctor should also know about all other m...
2,Augmentin 625 Duo Tablet is a penicillin-type ...,These are usually mild but let your doctor kno...
3,Azithral 500 Tablet is an antibiotic used to t...,Consult your doctor if you find these side eff...
4,Alkasol Oral Solution is a medicine used in th...,Take it regularly and do not stop taking the m...
...,...,...
995,Azapure Tablet belongs to a group of medicines...,Your doctor also needs to know what other medi...
996,Arimidex 1mg Tablet is used alone or with oth...,Your doctor should also know about all other m...
997,Arpimune ME 100mg Capsule is used to prevent y...,You will have frequent medical tests while tak...
998,Amlodac CH Tablet is a combination medicine us...,You also need to tell your doctor what other m...


In [11]:
data.to_excel('test_data_with_summary.xlsx', index = False)