# Data cleaning before model training

In [None]:
# Mount Google Drive so files stored there are reachable from this Colab runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

In [None]:
# Load dataset: skip the file's own first row and use our column names instead.
# NOTE(review): the previews below show sequences such as "ï¬", which looks like
# UTF-8 text decoded as latin-1 — confirm the file's real encoding.
csv_path = "/content/drive/MyDrive/Data/alldata_1_for_kaggle.csv"
columns = ["Serial Number", "Class Labels", "Research Paper Text"]
df = pd.read_csv(
    csv_path,
    encoding="latin-1",
    names=columns,
    header=None,
    skiprows=[0],
)

In [None]:
# Preview the freshly loaded DataFrame
df

Unnamed: 0,Serial Number,Class Labels,Research Paper Text
0,0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,4,Thyroid_Cancer,This study aimed to investigate serum matrix ...
...,...,...,...
7565,7565,Colon_Cancer,we report the case of a 24yearold man who pres...
7566,7566,Colon_Cancer,among synchronous colorectal cancers scrcs rep...
7567,7567,Colon_Cancer,the heterogeneity of cancer cells is generally...
7568,7568,Colon_Cancer,"""adipogenesis is the process through which mes..."


In [None]:
# Number of rows before de-duplication
df.shape[0]

7570

In [None]:
# Show unique values: one row per distinct class label, indexed by the row where it first appears
df["Class Labels"].drop_duplicates()

0      Thyroid_Cancer
281      Colon_Cancer
539       Lung_Cancer
Name: Class Labels, dtype: object

In [None]:
# Distinct paper texts; the reported Length is the number of unique texts in the dataset
df["Research Paper Text"].drop_duplicates()

0       Thyroid surgery in  children in a single insti...
1       " The adopted strategy was the same as that us...
2       coronary arterybypass grafting thrombosis ï¬b...
3        Solitary plasmacytoma SP of the skull is an u...
4        This study aimed to investigate serum matrix ...
                              ...                        
6863    "Missense mutation distribution in the exons a...
6929    "versus gemcitabine/carboplatin in advanced no...
7040     Keloids are pathological scars that grow over...
7485    the anization of cells into multiple membranou...
7497    several immunotherapeutic strategies that harn...
Name: Research Paper Text, Length: 996, dtype: object

In [None]:
# Drop duplicates: keep one row per distinct paper text.
# .copy() makes the result an independent frame, so assigning to its columns
# later does not raise SettingWithCopyWarning.
df = df.drop_duplicates(subset=["Research Paper Text"]).copy()

In [None]:
# Number of rows left after de-duplication
df.shape[0]

996

In [None]:
# Preview the de-duplicated DataFrame (original row labels are kept, so the index has gaps)
df

Unnamed: 0,Serial Number,Class Labels,Research Paper Text
0,0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...
3,3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,4,Thyroid_Cancer,This study aimed to investigate serum matrix ...
...,...,...,...
6863,6863,Lung_Cancer,"""Missense mutation distribution in the exons a..."
6929,6929,Lung_Cancer,"""versus gemcitabine/carboplatin in advanced no..."
7040,7040,Thyroid_Cancer,Keloids are pathological scars that grow over...
7485,7485,Colon_Cancer,the anization of cells into multiple membranou...


In [None]:
# Full text of the row labelled 0.
# NOTE(review): the saved output below is already lower-cased and stripped of
# stopwords, so this cell appears to have been run after the cleaning cell —
# re-run the notebook top to bottom to refresh it.
df.loc[0, "Research Paper Text"]

'thyroid surgery child single institution osama ibrahim almosallama ali aseerib ahmed alhumaida ali alzahranic saif alsobhib saud alshanafeybfrom adepartment surgery college medicine qassim university buraidah al qassim saudi arabia bdepartment surgery king faisal specialist hospital research center riyadh saudi arabia cdepartment medicine king faisal specialist hospital research center riyadh saudi arabia correspondence dr osama ibrahim almosallam department surgery college medicine qassim university po box buraidah al qassim saudi arabia osama_iaahotmailcom orcid orcid0000000290367564 citation almosallam oi aseeri alhumaid alzahrani alsobhi alshanafey thyroid surgery child single institution ann saudi med received january accepted may published august copyright copyright annals saudi medicine saudi arabia access creative common attributionnoncommercialnoderivatives international license cc byncnd detail accessed httpcreativecommons licensesbyncnd40funding nonebackground data thyroid 

Dropping duplicate paper texts reduces the dataset from 7570 rows to 996 unique entries.

## Data preprocessing functions

In [None]:
!pip install contractions unidecode nltk

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.8/110.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected p

In [None]:
import re
from contractions import fix
import nltk
from unidecode import unidecode
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by the cleaning functions below.
# 'punkt_tab' is included because newer NLTK releases load it (instead of
# 'punkt') for word_tokenize; on older releases the extra download is harmless.
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def toLowerCase(data):
  """Return ``data`` with every cased character converted to lower case."""
  lowered = data.lower()
  return lowered

In [None]:
def removeHyperlinksAndHtmlTags(data):
    """Strip HTML-like tags and hyperlinks from ``data``.

    Two substitutions run in order:
      1. every ``<...>`` span (matched non-greedily; ``.`` does not cross a
         newline, so a tag broken over two lines is left alone) is deleted;
      2. every ``://...`` run (with an optional ``http``/``https`` prefix) and
         every ``www.`` run, up to the next whitespace, is deleted.

    Parameters:
        data: the text to clean.

    Returns:
        The text with the matched spans removed. Surrounding whitespace is
        kept, so removing a link can leave two consecutive spaces.
    """
    # Raw string literals: '\S' and '\.' are invalid escape sequences in a
    # plain literal and trigger a warning on current Python versions.
    without_tags = re.sub(r'<.*?>+', '', data)
    return re.sub(r'(https|http)?://\S+|www\.\S+', '', without_tags)

In [None]:
def removePunctuation(data):
  """Delete every character that is neither a word character nor whitespace.

  Word characters (letters, digits and the underscore) and whitespace are
  kept; nothing is inserted in place of the removed characters.
  """
  non_word_run = re.compile(r'[^\w\s]+')
  return non_word_run.sub('', data)

In [None]:
def removeNewlinesAndTabs(data):
  """Replace newline and tab markers in ``data`` with single spaces.

  Handles the literal two-character sequence backslash + ``n`` as well as real
  newline, carriage-return and tab characters. Each marker becomes one space,
  so a ``\\r\\n`` pair yields two spaces.

  Parameters:
      data: the text to clean.

  Returns:
      The text with every marker replaced by a space.
  """
  # The literal "\\n" goes first, although the order does not matter here:
  # none of the markers is a substring of another.
  for marker in ("\\n", "\n", "\r", "\t"):
    data = data.replace(marker, " ")
  return data

In [None]:
# Expands contractions, e.g. `you're` becomes `you are`
def fixesContractions(data):
  """Return ``data`` with contractions expanded by ``contractions.fix``."""
  expanded = fix(data)
  return expanded

In [None]:
# Handle accented characters
def handleAccentedCharacters(data):
  """Return ``data`` transliterated by ``unidecode``."""
  transliterated = unidecode(data)
  return transliterated

In [None]:
# remove stopwords such as i, me, my, myself, ...
_STOPWORD_CACHE = {}


def _englishStopwords():
    """Return the NLTK English stopword set, reading the corpus only on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def removeStopwords(data):
    """Drop English stopwords from ``data``.

    The text is split with ``word_tokenize``; a token is discarded when its
    lower-cased form is in the NLTK English stopword list. The remaining
    tokens are joined with single spaces.

    Parameters:
        data: the text to filter.

    Returns:
        The filtered text as one space-separated string.
    """
    # The stopword set is the same for every call, so it is loaded once and
    # reused instead of being rebuilt for each row.
    stopwords_list = _englishStopwords()
    tokens = word_tokenize(data)
    return ' '.join(token for token in tokens if token.lower() not in stopwords_list)

In [None]:
# Reduce each word to its lemma (dictionary form)
def lemmatization(data):
    """Lemmatize every whitespace-separated token of ``data``.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments, and the results are joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(map(lemmatizer.lemmatize, data.split()))

##  Clean dataset

In [None]:
# Cleaning steps, applied in this order. Steps that rely on punctuation run
# BEFORE removePunctuation:
#   - removeHyperlinksAndHtmlTags needs '<', '>', ':', '/' and '.' to find its targets;
#   - removeNewlinesAndTabs must see the literal "\n" marker before the
#     backslash is stripped;
#   - fixesContractions needs the apostrophe ("you're"), otherwise it has
#     nothing to expand;
#   - handleAccentedCharacters runs first so that any punctuation it may emit
#     is stripped as well.
cleaning_steps = [
    toLowerCase,
    removeHyperlinksAndHtmlTags,
    removeNewlinesAndTabs,
    fixesContractions,
    handleAccentedCharacters,
    removePunctuation,
    removeStopwords,
    lemmatization,
]

cleaned_text = df['Research Paper Text']
for cleaning_step in cleaning_steps:
    cleaned_text = cleaned_text.apply(cleaning_step)

# assign() returns a new frame rather than writing into a possibly-sliced one,
# which avoids SettingWithCopyWarning.
df = df.assign(**{'Research Paper Text': cleaned_text})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Research Paper Text'] = df['Research Paper Text'].apply(toLowerCase)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Research Paper Text'] = df['Research Paper Text'].apply(removeHyperlinksAndHtmlTags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Research Paper Text'] = df['Research Pa

In [None]:
# Preview the cleaned DataFrame
df

Unnamed: 0,Serial Number,Class Labels,Research Paper Text
0,0,Thyroid_Cancer,thyroid surgery child single institution osama...
1,1,Thyroid_Cancer,adopted strategy used prior year based four ex...
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ibri...
3,3,Thyroid_Cancer,solitary plasmacytoma sp skull uncommon clinic...
4,4,Thyroid_Cancer,study aimed investigate serum matrix metallopr...
...,...,...,...
6863,6863,Lung_Cancer,missense mutation distribution exon functional...
6929,6929,Lung_Cancer,versus gemcitabinecarboplatin advanced nonsmal...
7040,7040,Thyroid_Cancer,keloid pathological scar grow time extend beyo...
7485,7485,Colon_Cancer,anization cell multiple membranous compartment...


## Create a new CSV for the cleaned dataset

In [None]:
# Write the cleaned frame to Drive. to_csv returns None when given a path, so
# its result is not stored. The row index is written as the first, unnamed
# column (pandas' default).
cleaned_csv_path = '/content/drive/MyDrive/Data/CleanedMedicalDataset.csv'
df.to_csv(cleaned_csv_path)