In [18]:
import chardet
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string

In [2]:
#With utf-8, there was an issue with loading, also latin-1 had too many wrong characters, so
# I had to use chardet to find the correct encoding


# Detect the encoding of the file
with open('../Data/Medical Dataset.csv', 'rb') as f:
    result = chardet.detect(f.read())

# Get the detected encoding
detected_encoding = result['encoding']


In [3]:
detected_encoding

'MacRoman'

In [4]:
import pandas as pd

# Read the CSV file with the detected encoding
df = pd.read_csv('../Data/Medical Dataset.csv', encoding='MacRoman')


In [5]:
df.iloc[-3]['a']

'the heterogeneity of cancer cells is generally accepted and astem celllike subpopulation that is called ÔøΩcancer stem cellsÔøΩcscs has been identiÔ¨Åed in various types of malignanttumors although the lack of consensus on the deÔ¨Ånitioncscs are widely recognized as a small subpopulation amongcancer cells with the properties of selfrenewal and tumor initiation as cscs play a critical role in the recurrence andmetastasis of cancer   targeting the cscs is thought to bea promising approach for curing cancera large number of past studies have tried to identify andcharacterize the cscs as normal tissuespeciÔ¨Åc stem cellsare considered as the main origin of cancer   the cscsare also thought to be inherited at least partially the characterization of normal tissuespeciÔ¨Åc stem cells thereforemany studies on the identiÔ¨ÅcationpuriÔ¨Åcation of cscs havesimply shared markers of hematopoietic stem cells includingthe most popularly used cell surface markers of cd44 andcd133 [ ] cd44 is a type 

In [6]:
df.isna().sum()

Unnamed: 0    0
0             0
a             0
dtype: int64

In [10]:
df.duplicated().sum()

6574

In [8]:
df = df.drop('Unnamed: 0', axis=1)
df = df.rename(columns={'0': 'label', 'a':'text'})

In [9]:
df

Unnamed: 0,label,text
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis Ô¨Åb...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,Thyroid_Cancer,This study aimed to investigate serum matrix ...
...,...,...
7565,Colon_Cancer,we report the case of a 24yearold man who pres...
7566,Colon_Cancer,among synchronous colorectal cancers scrcs rep...
7567,Colon_Cancer,the heterogeneity of cancer cells is generally...
7568,Colon_Cancer,"""adipogenesis is the process through which mes..."


In [12]:

encoder = LabelEncoder()
df['label']=encoder.fit_transform(df['label'])

In [16]:
#Removing HTML tags from the whole dataset
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

df['text'] = df['text'].apply(remove_html_tags)

In [17]:
# Since there was no space after each full stop, NLTK sent_tokenize considered 2 sentences as 1
def insert_space_after_full_stop(text):
    return text.replace('.', '. ')
df['text'] = df['text'].apply(insert_space_after_full_stop)

In [19]:
def lemmatization(text):
    lm= WordNetLemmatizer()
    text = ' '.join([lm.lemmatize(word, pos='v') for word in text.split()])
    return text
df['text'] = df['text'].apply(lemmatization)

In [20]:
df

Unnamed: 0,label,text
0,2,Thyroid surgery in children in a single instit...
1,2,""" The adopt strategy be the same as that use i..."
2,2,coronary arterybypass graft thrombosis Ô¨Åbrin...
3,2,Solitary plasmacytoma SP of the skull be an un...
4,2,This study aim to investigate serum matrix met...
...,...,...
7565,0,we report the case of a 24yearold man who pres...
7566,0,among synchronous colorectal cancers scrcs rep...
7567,0,the heterogeneity of cancer cells be generally...
7568,0,"""adipogenesis be the process through which mes..."


In [21]:
ps = PorterStemmer();
def text_transform(text):
    text= text.lower()
    text = nltk.word_tokenize(text)
    y=[]
    for i in text:
        if i.isalnum():
            y.append(i)
    
    text = y[:]
    y.clear()
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation:
            y.append(i)
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i))
        
    
    return " ".join(y)

In [31]:
df['text'] = df['text'].apply(text_transform)

In [32]:
df

Unnamed: 0,label,text
0,2,thyroid surgeri children singl institut osama ...
1,2,adopt strategi use prior year base four exclus...
2,2,coronari arterybypass graft thrombosi mutation...
3,2,solitari plasmacytoma sp skull uncommon clinic...
4,2,studi aim investig serum matrix metalloprotein...
...,...,...
7565,0,report case 24yearold man present chief compla...
7566,0,among synchron colorect cancer scrc report pre...
7567,0,heterogen cancer cell gener accept astem celll...
7568,0,adipogenesi process mesenchymalstem cell msc c...


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7570 entries, 0 to 7569
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   7570 non-null   int64 
 1   text    7570 non-null   object
dtypes: int64(1), object(1)
memory usage: 118.4+ KB


In [34]:
df.to_csv('../Data/CleanedMedicalDataset.csv', index = False)