# Text Summerization With Fast Text and Spacy

In [36]:
#IMPORTANT LIBRARIES 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import fasttext
import spacy
import re

# 1.Data Loading

In [37]:
df = pd.read_table("cancermine_sentences.tsv")

In [38]:
df.head(5)

Unnamed: 0,matching_id,pmid,title,journal,journal_short,year,month,day,section,subsection,...,cancer_start,cancer_end,gene_hugo_id,gene_entrez_id,gene_name,gene_normalized,gene_start,gene_end,sentence,formatted_sentence
0,333b030a6302a63ad2d43f83f0b01598,36528839,Roles of RNA-binding proteins in neurological ...,Human Cell,Human Cell,2023,12,18,article,introduction,...,254,284,HGNC:10571,6319,stearoyl-CoA desaturase (SCD),SCD,150,179,"In the line with this evidence, the role of th...","In the line with this evidence, the role of th..."
1,dcbb82e8e463eec67a75d22cc248228f,36450978,Common and rare variant associations with clon...,Nature,Nature,2022,11,30,article,,...,146,159,HGNC:2978,1788,DNMT3A,DNMT3A,184,190,Models estimating event risk on the basis of C...,Models estimating event risk on the basis of C...
2,13ae1534f5aba0a8ade7d319349313d1,33535156,Eyes on coronavirus,Stem Cell Research,Stem Cell Research,2021,1,27,article,introduction,...,91,96,HGNC:1116,682,CD147,BSG,39,44,"In non-small cell lung cancer (NSCLC), CD147 e...","In non-small cell lung cancer (NSCLC), <b>CD14..."
3,c6712385cd77b7273e2aeb12a217ed3e,36648635,World Molecular Imaging Congress 2022,Molecular Imaging and Biology,Molecular Imaging and Biology,2022,1,17,article,conclusions,...,87,102,HGNC:644,367,androgen receptor,AR,62,79,Mitochondrial pyruvate import is a metabolic v...,Mitochondrial pyruvate import is a metabolic v...
4,5659f2a223f7f79ec55e98d6f4869da4,36648635,World Molecular Imaging Congress 2022,Molecular Imaging and Biology,Molecular Imaging and Biology,2022,1,17,article,introduction,...,106,117,HGNC:8038,4922,neurotensin (NTS),NTS,36,53,Accumulating evidence suggests that neurotensi...,Accumulating evidence suggests that <b>neurote...


In [39]:
df.shape

(108057, 25)

# 2.Data Cleaning

In [40]:
# missing values
df['sentence'].isnull().sum()

0

In [41]:
df['sentence'].duplicated().sum()

25785

In [42]:
df = df.drop_duplicates('sentence', keep='last')
df.duplicated().sum()

0

# 3. Exploratory Data Analysis

In [43]:
df['sentence'].describe()

count                                                 82272
unique                                                82272
top       In the line with this evidence, the role of th...
freq                                                      1
Name: sentence, dtype: object

# 4. Data Preprocessing

In [44]:
# eliminating punctuations and symbols
analyzer = spacy.load('en_core_web_sm')
def preprocessor(text):
    example = analyzer(text)
    token_words = []
    processed_words = []
    for words  in example:
        if not words.is_punct:
            token_words.append(str(words))
    return " ".join(token_words)   
print(preprocessor("brasilian are good. at fottball! and studying"))

brasilian are good at fottball and studying


In [None]:
df['cleaned_sentence'] = df['sentence'].apply(preprocessor)

In [None]:
df['cleaned_sentence'][1]

# 5. Extracting processed txt file for unsupervised model training

In [None]:
df.to_csv("cancerfile.txt", columns = ['cleaned_sentence'], header = None, index = False)

# 6. Model Training(Unsupervised)

In [47]:
model = fasttext.train_unsupervised('cancerfile.txt')

In [48]:
# checking how model is well trained by finding nearest neighbors 
check = model.get_nearest_neighbors("cancer")
check

[(0.8482417464256287, 'cancer"'),
 (0.8238462805747986, 'cancerbut'),
 (0.8063907027244568, 'cancerRRM2'),
 (0.7920263409614563, 'breast'),
 (0.7765771150588989, 'cancers"'),
 (0.7683383822441101, 'breastcancer'),
 (0.7436859011650085, 'cancersand'),
 (0.7291085124015808, 'cancer-1'),
 (0.7147127389907837, 'cancerand'),
 (0.7045959234237671, 'cancers')]

In [356]:
model.save_model("fasttext_finalmodel.bin")

In [357]:
input_text = """In most cases, we can never attribute the cause of any cancer to one single factor. The main thing that causes cancer is a substance we know as carcinogens. But how these develop or enters a person’s body will depend on many factors. We can divide the main factors into the following types – biological factors, physical factors, and lifestyle-related factors.

Biological factors involve internal factors such as age, gender, genes, hereditary factors, blood type, skin type, etc. Physical factors refer to environmental exposure of any king to say X-rays, gamma rays, etc. Ad finally lifestyle-related factors refer to substances that introduced carcinogens into our body. These include tobacco, UV radiation, alcohol. smoke, etc. Next, in this essay on cancer lets learn about how we can treat cancer."""

In [358]:
# input paragraph text cleaning and preprocessing
input_processed = preprocessor(input_text)
input_processed = re.sub(r"[^a-zA-Z]"," ",input_processed)
input_processed

'In most cases we can never attribute the cause of any cancer to one single factor The main thing that causes cancer is a substance we know as carcinogens But how these develop or enters a person  s body will depend on many factors We can divide the main factors into the following types biological factors physical factors and lifestyle related factors    Biological factors involve internal factors such as age gender genes hereditary factors blood type skin type etc Physical factors refer to environmental exposure of any king to say X rays gamma rays etc Ad finally lifestyle related factors refer to substances that introduced carcinogens into our body These include tobacco UV radiation alcohol smoke etc Next in this essay on cancer lets learn about how we can treat cancer'

# 7. Converting to sentence embedding

In [359]:
word_embedding_para = model.get_sentence_vector(input_processed)

In [360]:
word_embedding_para

array([ 9.2889154e-03,  7.6127541e-03,  5.2819829e-02, -2.7959950e-02,
       -6.4665808e-05, -5.8702696e-03, -9.1797233e-02, -6.6517390e-02,
        3.2586761e-02, -1.6245896e-02, -2.0829242e-02,  3.1995386e-02,
       -4.3331023e-02, -3.5725866e-02, -1.6707462e-01,  7.3126420e-02,
       -4.1093912e-02,  1.8000711e-02, -5.6655377e-02, -8.4780611e-02,
        5.4462552e-02, -3.3425877e-03, -6.9462828e-02, -6.1196595e-02,
       -7.9739615e-02, -4.1772756e-03, -5.0474573e-03, -2.1156631e-03,
       -3.8637791e-02,  2.9136052e-03, -6.4682141e-02, -1.1133105e-02,
        1.2185012e-02,  1.1899914e-01,  4.1721868e-03, -1.0800358e-01,
       -4.1997347e-02,  6.0606111e-02, -2.7366489e-02, -3.8321968e-02,
        1.3862264e-01,  1.1765123e-01,  1.2285425e-01,  3.9953899e-02,
        6.2426619e-02,  8.4542232e-03, -4.0465001e-02, -1.0646818e-02,
        1.7627342e-02, -5.3850539e-02, -5.9627600e-02, -1.8939821e-02,
       -5.4124266e-02,  3.2087855e-02, -3.0777904e-03, -7.6084281e-03,
      

In [361]:
doc = analyzer(input_text)
processed_sentences = []
for x in doc.sents:
    processed_sentences.append(str(x))
word_embeddings_sent = []
for x in processed_sentences:
    x = x.replace("\n"," ")
    word_embeddings_sent.append(model.get_sentence_vector(x))
print(word_embeddings_sent)    

[array([ 0.01466357,  0.01209411,  0.04492087, -0.02626492,  0.02853228,
        0.00703079, -0.10982415, -0.08300329,  0.0263868 ,  0.00049283,
       -0.00948199,  0.07625341, -0.05468797, -0.01477058, -0.19871804,
        0.05576849, -0.06826639,  0.03755951, -0.03288502, -0.0943509 ,
        0.02895478, -0.01050661, -0.09215454, -0.0593892 , -0.1137737 ,
       -0.03314891, -0.01108863, -0.02265893, -0.05078182, -0.00043879,
       -0.06115587, -0.00230351,  0.02405915,  0.11121704,  0.00503107,
       -0.11163875, -0.05088544,  0.10118215, -0.01761193, -0.09710315,
        0.15200129,  0.13502774,  0.08039293,  0.06203151,  0.05506836,
       -0.02205065, -0.03110491,  0.01771598,  0.03056882, -0.04806802,
       -0.04111072, -0.01371094, -0.02504434,  0.04452137,  0.00544221,
       -0.01220976, -0.08018486, -0.0337398 ,  0.00322814,  0.11317295,
        0.03404204,  0.06647067,  0.00113642,  0.09248691, -0.10956248,
        0.06160248,  0.01551293,  0.06895933,  0.02362953,  0.0

# finding cosine similarities between sentences and input text

In [363]:
#finding cosine similarities between sentences and input text

cos_sim = cosine_similarity([word_embedding_para],two_sent)
similar_sentences = list(enumerate(cos_sim[0]))
sorted_similar_sentences = sorted(similar_sentences, key=lambda x:x[1], reverse=True)

similar_sentences

[(0, 0.93617666), (1, 0.9136757), (2, 0.8132518), (3, 0.81250846)]

# 8. Final output

In [311]:
summary = ""
for sent in similar_sentences[:5]:
    summary = summary + processed_sentences[sent[0]]
summary    
    

'In most cases, we can never attribute the cause of any cancer to one single factor.Ad finally lifestyle-related factors refer to substances that introduced carcinogens into our body.But how these develop or enters a person’s body will depend on many factors.Physical factors refer to environmental exposure of any king to say X-rays, gamma rays, etc.We can divide the main factors into the following types – biological factors, physical factors, and lifestyle-related factors.\n\n'