In [1]:
#imports
import pandas as pd
import regex as re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

The creator of [this](https://www.kaggle.com/code/firozchowdury/encoding-fix-and-data-pre-processing) notebook determined the best encoding to read in this dataset is MacRoman.

In [2]:
#read in data
df = pd.read_csv('alldata_1_for_kaggle.csv', encoding='MacRoman', index_col=0)
df.head()

Unnamed: 0,0,a
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis Ô¨Åb...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,Thyroid_Cancer,This study aimed to investigate serum matrix ...


From the creater of the notebook, I also know that the dataset contains a number of duplicates. I will confirm duplicates exist and drop them from the dataframe.

In [3]:
#7570 total entries
df.shape

(7570, 2)

In [4]:
#over 6500 are duplicates
df.duplicated().sum()

6574

In [5]:
#no more dupes
df = df.drop_duplicates()
df.shape

(996, 2)

In [6]:
#change column names
df.columns = ['label', 'text']
df.head()

Unnamed: 0,label,text
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis Ô¨Åb...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,Thyroid_Cancer,This study aimed to investigate serum matrix ...


From the initial look at the dataset, I can see that there are special characters in the text. I will use regex to filter out these characters.

In [7]:
#use regexp to remove all characters except letters and spaces
regexp = r"[^a-zA-Z\s']"

df.text = df.text.str.replace(regexp, "", regex=True)

df.head()

Unnamed: 0,label,text
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,The adopted strategy was the same as that use...
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis brin...
3,Thyroid_Cancer,Solitary plasmacytoma SP of the skull is an u...
4,Thyroid_Cancer,This study aimed to investigate serum matrix ...


It's not perfect, but it cleans it up nicely. Next, I will remove capitalization and stopwords, and reduce the length of the words using the PorterStemmer because of the length of the text documents.

In [8]:
#lowercase text
df.text = df.text.str.lower()
df.head()

Unnamed: 0,label,text
0,Thyroid_Cancer,thyroid surgery in children in a single insti...
1,Thyroid_Cancer,the adopted strategy was the same as that use...
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis brin...
3,Thyroid_Cancer,solitary plasmacytoma sp of the skull is an u...
4,Thyroid_Cancer,this study aimed to investigate serum matrix ...


In [9]:
#unpack stopwords and write function to apply to text data
unpacked_stopwords = stopwords.words('english')
ps = PorterStemmer()

def remove_stopwords(article, stopword_list=unpacked_stopwords, stemmer=ps):

    tok_article = word_tokenize(article)
    approved_words = []

    for word in tok_article:
        if word in stopword_list:
            continue
        else:
            stem = ps.stem(word)
            approved_words.append(stem)

    return " ".join(approved_words)

In [10]:
#remove stopwords
df.text = df.text.apply(remove_stopwords)
df.head()

Unnamed: 0,label,text
0,Thyroid_Cancer,thyroid surgeri children singl institut osama ...
1,Thyroid_Cancer,adopt strategi use prior year base four exclus...
2,Thyroid_Cancer,coronari arterybypass graft thrombosi brin bri...
3,Thyroid_Cancer,solitari plasmacytoma sp skull uncommon clinic...
4,Thyroid_Cancer,studi aim investig serum matrix metalloprotein...


In [11]:
#check for class balance
df.label.value_counts()

Lung_Cancer       452
Thyroid_Cancer    283
Colon_Cancer      261
Name: label, dtype: int64

In [12]:
#pretty good balance, save clean copy to csv
df.to_csv('clean_stemmed.csv')