In [1]:
import pandas as pd

In [2]:
excel_dict = pd.read_excel(r"..\data files\Pubmed5k.xlsx", sheet_name=None)
# excel_dict

In [3]:
excel_dict["random 5k"]

Unnamed: 0,ArticleID,Title,Abstract
0,34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...
1,34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...
2,34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...
3,34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...
4,34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...
...,...,...,...
4994,34444567,Mind the Differences: How Diagnoses and Hospit...,Integrated care pathway (ICP) is a prevailing ...
4995,34444568,The Ethics of Dying: Deciphering Pandemic-Resu...,The objective of medicine is to provide humans...
4996,34444569,Research on Adolescents Regarding the Indirect...,This research involved the participation of 30...
4997,34444571,Pre-Intervention Effects of a Community-Based ...,This study explores the impact of the 'pre-int...


In [4]:
excel_dict["random 5k"].to_csv(r"..\data files\Pubmed5k.csv", header=True, index=False)

In [5]:
df = pd.read_csv(r"..\data files\Pubmed5k.csv", index_col="ArticleID")
df

Unnamed: 0_level_0,Title,Abstract
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1
34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...
34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...
34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...
34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...
34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...
...,...,...
34444567,Mind the Differences: How Diagnoses and Hospit...,Integrated care pathway (ICP) is a prevailing ...
34444568,The Ethics of Dying: Deciphering Pandemic-Resu...,The objective of medicine is to provide humans...
34444569,Research on Adolescents Regarding the Indirect...,This research involved the participation of 30...
34444571,Pre-Intervention Effects of a Community-Based ...,This study explores the impact of the 'pre-int...


# EDA

In [6]:
df.shape

(4999, 2)

In [7]:
# ensure that the column "ArticleID" is really valid to be an ID and not a fake name
df.index.nunique()

4999

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4999 entries, 34153941 to 34444572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Title     4999 non-null   object
 1   Abstract  4999 non-null   object
dtypes: object(2)
memory usage: 246.2+ KB


In [9]:
df.isna().mean()

Title       0.0
Abstract    0.0
dtype: float64

#### there is no null values

In [10]:
df.nunique()

Title       4999
Abstract    4989
dtype: int64

#### there are 10 values repeated in the column of "Abstract"

In [11]:
df.describe()

Unnamed: 0,Title,Abstract
count,4999,4999
unique,4999,4989
top,Stable Coordination Variability in Overground ...,[Figure: see text].
freq,1,6


In [12]:
df[df.duplicated()]

Unnamed: 0_level_0,Title,Abstract
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1


# Preprocessing

Text Analysis is a major application field for machine learning algorithms. However the raw data, a sequence of symbols cannot be fed directly to the algorithms themselves as most of them expect numerical feature vectors with a fixed size rather than the raw text documents with variable length.

We will perform the following text preprocessing steps:-
- Convert the text into lowercase
- Split text into words (Tokenize)
- Remove the stop loss words
- Remove the Punctuation, any symbols, and special characters
- Normalize the word (I’ll be using Lemmatization for normalization)


In [13]:
# to replace abbreviation with it's original text
import spacy
import scispacy
from scispacy.abbreviation import AbbreviationDetector

nlp = spacy.load('en_core_sci_lg')
nlp.add_pipe("abbreviation_detector")

def replace_acronyms(text):
    doc = nlp(text)
    altered_tok = [tok.text for tok in doc]
    for abrv in doc._.abbreviations:
        altered_tok[abrv.start] = str(abrv._.long_form)

    return(" ".join(altered_tok))

In [14]:
import re
import string
import nltk
from tqdm import tqdm



# nltk lemmatization method
from nltk.stem import WordNetLemmatizer 
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# nltk.download('words')
# eng_words = set(nltk.corpus.words.words())
eng_words = list(nlp.vocab.strings)

# # nltk method for tokenization
# from nltk.tokenize import word_tokenize
# nltk.download('punkt')


# stop words
# i make stop words is the set of the stop words from the 4 libraries to increase number of stop words, i.e accept all the definitions
# of the stop words from all libraries to remove higher numbers of stop words
# nltk.download('stopwords')
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from gensim.parsing.preprocessing import STOPWORDS 
stop_words = list(set(list(nlp.Defaults.stop_words)+         #spacy
                      list(ENGLISH_STOP_WORDS)+              #scikit-learn
                      list(STOPWORDS)+                       #gensim
                      list(stopwords.words('english'))))     #nltk


wnl = WordNetLemmatizer()

In [15]:
# this step may take alot of time, you can reduce the mount of time by 
# using """nlp = spacy.load('en_core_sci_sm')""" instead of """nlp = spacy.load('en_core_sci_lg')""" that i used

def clean_txt(txt):
    """this function is to clean the text"""
    
    # replace the abbreviation wit it's origin
    txt = replace_acronyms(txt)
    
    # remove punctuation
    txt = re.sub('\W+',' ', txt) 
    # txt = (txt.translate(str.maketrans('', '', string.punctuation))) # remove the punctuation, it's the fastest method 
    
    # lower all characters
    txt = txt.lower()
    
    # splite the words after every space(tokenize)
    txt = txt.split()    
    # txt = word_tokenize(txt) # nltk tokenization alternative

    # remove stop words and do lemmatization
    txt = [wnl.lemmatize(word) for word in txt if ((word.isalpha()) and (word not in stop_words))]    
    
    # remove non-english words
    # this step may take alot of time, you can reduce the mount of time by 
    # using """nlp = spacy.load('en_core_sci_sm')""" instead of """nlp = spacy.load('en_core_sci_lg')""" that i used
    txt = [w for w in txt if w in eng_words]
    
    # convert list of tokens to text
    # txt = " ".join(txt)
    
    # returning the result
    return txt

In [16]:
# this step may take alot of time, you can reduce the mount of time by 
# using """nlp = spacy.load('en_core_sci_sm')""" instead of """nlp = spacy.load('en_core_sci_lg')""" that i used,

# if you find that this take along time, i did this step and save the results as a csv file, so you can use it directly in the next cell
df["Abstract_processed"] = [clean_txt(txt) for txt in tqdm(df.Abstract)]
df["Abstract_processed"].head()

  global_matches = self.global_matcher(doc)
100%|██████████████████████████████████████████████████████████████████████████████| 4999/4999 [04:41<00:00, 17.77it/s]


ArticleID
34153941    [coordination, variability, coordination, vari...
34153942    [clinical, scenario, dynamic, knee, valgus, dy...
34153964    [methodology, reported, ass, real, world, epid...
34153968    [outcome, acute, ischemic, stroke, acute, isch...
34153978    [hearing, loss, child, result, developmental, ...
Name: Abstract_processed, dtype: object

In [23]:
df = pd.read_csv(r"..\data files\Pubmed5k_processed.csv", index_col="ArticleID")
df

Unnamed: 0_level_0,Title,Abstract,Abstract_processed
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...,"['coordination', 'variability', 'coordination'..."
34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...,"['clinical', 'scenario', 'dynamic', 'knee', 'v..."
34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...,"['methodology', 'reported', 'ass', 'real', 'wo..."
34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...,"['outcome', 'acute', 'ischemic', 'stroke', 'ac..."
34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...,"['hearing', 'loss', 'child', 'result', 'develo..."
...,...,...,...
34444567,Mind the Differences: How Diagnoses and Hospit...,Integrated care pathway (ICP) is a prevailing ...,"['integrated', 'care', 'pathway', 'integrated'..."
34444568,The Ethics of Dying: Deciphering Pandemic-Resu...,The objective of medicine is to provide humans...,"['objective', 'medicine', 'provide', 'human', ..."
34444569,Research on Adolescents Regarding the Indirect...,This research involved the participation of 30...,"['research', 'involved', 'participation', 'chi..."
34444571,Pre-Intervention Effects of a Community-Based ...,This study explores the impact of the 'pre-int...,"['study', 'explores', 'impact', 'pre', 'interv..."


In [24]:
df[df['Abstract_processed'].duplicated(keep=False)]

Unnamed: 0_level_0,Title,Abstract,Abstract_processed
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34237945,Studium anabolické aktivity suchých extrakt&#3...,This article presents the results of the study...,"['article', 'present', 'result', 'study', 'ana..."
34669439,DNA binding to TLR9 expressed by red blood cel...,[Figure: see text].,"['figure', 'text']"
34669440,Peptide-based urinary monitoring of fibrotic n...,[Figure: see text].,"['figure', 'text']"
34669441,A rapid assay provides on-site quantification ...,[Figure: see text].,"['figure', 'text']"
34669442,Fatal enhanced respiratory syncytial virus dis...,[Figure: see text].,"['figure', 'text']"
34669443,Macrophage migration inhibitory factor drives ...,[Figure: see text].,"['figure', 'text']"
34669444,"Development of ICT01, a first-in-class, anti-B...",[Figure: see text].,"['figure', 'text']"
34258890,Closing gaps in the care of patients with hear...,No abstract present.,"['abstract', 'present']"
34258891,Too much of a good thing in ischemic mitral: l...,No abstract present.,"['abstract', 'present']"
34258892,COVID-19 infection and cardiometabolic complic...,No abstract present.,"['abstract', 'present']"


In [25]:
# df.drop(df[df['Abstract_processed'].duplicated()].index, axis=0, inplace=True)
df.drop_duplicates(subset=["Abstract_processed"], inplace=True)

In [26]:
df["words_count"] = [len(i) for i in df['Abstract_processed']]
df["words_count"].unique()

array([1421, 1941, 1821, ..., 2037,  963, 1824], dtype=int64)

In [27]:
# df.to_csv(r"..\data files\Pubmed5k_processed.csv", index=True, header=True)
df = pd.read_csv(r"..\data files\Pubmed5k_processed.csv", index_col="ArticleID")
df

Unnamed: 0_level_0,Title,Abstract,Abstract_processed,words_count
ArticleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
34153941,Stable Coordination Variability in Overground ...,Coordination variability (CV) is commonly anal...,"['coordination', 'variability', 'coordination'...",1421
34153942,Weak Hip Strength Increases Dynamic Knee Valgu...,Clinical Scenario: Dynamic knee valgus (DKV) i...,"['clinical', 'scenario', 'dynamic', 'knee', 'v...",1941
34153964,Current and Future Projections of Amyotrophic ...,Various methodologies have been reported to as...,"['methodology', 'reported', 'ass', 'real', 'wo...",1821
34153968,Disparities between Asian and Non-Asian Thromb...,As outcomes for acute ischemic stroke (AIS) va...,"['outcome', 'acute', 'ischemic', 'stroke', 'ac...",2358
34153978,Maternal Factors Predicting Loss to Follow-Up ...,Because hearing loss in children can result in...,"['hearing', 'loss', 'child', 'result', 'develo...",1339
...,...,...,...,...
34444567,Mind the Differences: How Diagnoses and Hospit...,Integrated care pathway (ICP) is a prevailing ...,"['integrated', 'care', 'pathway', 'integrated'...",1456
34444568,The Ethics of Dying: Deciphering Pandemic-Resu...,The objective of medicine is to provide humans...,"['objective', 'medicine', 'provide', 'human', ...",1286
34444569,Research on Adolescents Regarding the Indirect...,This research involved the participation of 30...,"['research', 'involved', 'participation', 'chi...",850
34444571,Pre-Intervention Effects of a Community-Based ...,This study explores the impact of the 'pre-int...,"['study', 'explores', 'impact', 'pre', 'interv...",1653


### This script written at April-2022 by Ahmad salama
- salama4ai@gmail.com
- www.linkedin.com/in/salama4ai