In [1]:
# Importing the required libraries

# Data manipulation
import numpy as np
import pandas as pd

# Text processing
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally before
# stopwords.words(...) is called further down in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/bruno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Reading the training data
training_variants = pd.read_csv("data_files/training_variants")

# The text file is delimited by the two-character token "||". A multi-character
# separator is treated by pandas as a regular expression (hence the escaped
# pipes and engine="python"). The pattern is a raw string so that "\|" is not
# parsed as an (invalid) Python string escape sequence.
# The file's own first row is skipped and replaced by explicit column names.
training_text = pd.read_csv(
    "data_files/training_text",
    sep=r"\|\|",
    engine="python",
    header=None,
    skiprows=1,
    names=["ID", "Text"],
)

# Merging: keep every variant row, attaching its text when one exists
training_data = pd.merge(training_variants, training_text, on="ID", how="left")

# Filling missing values in the text column with "<Gene> <Variation>"
# (the right-hand side is aligned on the index, so only the masked rows are written)
training_data.loc[training_data["Text"].isna(), "Text"] = training_data["Gene"] + " " + training_data["Variation"]

training_data.head()

Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [3]:
# Reading the test data
test_variants = pd.read_csv("data_files/test_variants")

# The text file is delimited by the two-character token "||". A multi-character
# separator is treated by pandas as a regular expression (hence the escaped
# pipes and engine="python"). The pattern is a raw string so that "\|" is not
# parsed as an (invalid) Python string escape sequence.
# The file's own first row is skipped and replaced by explicit column names.
test_text = pd.read_csv(
    "data_files/test_text",
    sep=r"\|\|",
    engine="python",
    header=None,
    skiprows=1,
    names=["ID", "Text"],
)

# Merging: keep every variant row, attaching its text when one exists
test_data = pd.merge(test_variants, test_text, on="ID", how="left")

# Filling missing values in the text column with "<Gene> <Variation>"
# (the right-hand side is aligned on the index, so only the masked rows are written)
test_data.loc[test_data["Text"].isna(), "Text"] = test_data["Gene"] + " " + test_data["Variation"]

test_data.head()

Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,NAGLU,P521L,Abstract Background Non-small cell lung canc...
2,2,PAH,L333F,Abstract Background Non-small cell lung canc...
3,3,ING1,A148D,Recent evidence has demonstrated that acquired...
4,4,TMEM216,G77A,Oncogenic mutations in the monomeric Casitas B...


In [4]:
# Set the text preprocessing function
# English stop words taken from the NLTK corpus; preprocess_text reads this
# module-level name when it filters tokens.
Stopwords = stopwords.words('english')
def preprocess_text(text, stop_words=None):
    """Normalise a raw document into a lowercase, stop-word-free string.

    Steps, in order: lowercase, strip HTML-like tags, drop punctuation,
    split on whitespace, discard stop words, re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the module-level ``Stopwords``
        collection is used.

    Returns
    -------
    str
        The remaining tokens separated by single spaces.
    """
    # Hash-based membership test instead of scanning a list for every token.
    stop_set = frozenset(Stopwords if stop_words is None else stop_words)

    # Convert to lowercase
    text = text.lower()

    # Removing html tags. This has to happen BEFORE punctuation is stripped:
    # once '<', '>' and '/' are gone the tag pattern can no longer match and the
    # tag names would leak into the text (e.g. "<b>cat</b>" -> "bcatb").
    # The pattern only accepts '<' (optionally '</') followed directly by a
    # letter, so comparisons such as "p < 0.05" or "<0.01" are left alone.
    text = re.sub(r'</?[a-z][^<>]*>', '', text)

    # Removing punctuation (anything that is neither a word char nor whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # split() already collapses runs of whitespace, so no separate
    # whitespace-normalisation pass is needed before filtering stop words.
    return " ".join(word for word in text.split() if word not in stop_set)

In [5]:
# Preprocessing the corpus
# Run the same cleaning routine over the text column of both splits.
for corpus in (training_data, test_data):
    corpus["Text"] = corpus["Text"].apply(preprocess_text)

In [6]:
# Saving the preprocessed data
# Each frame is written to its own CSV, without the positional index column.
for frame, destination in (
    (training_data, "data_files/training_data_preprocessed.csv"),
    (test_data, "data_files/test_data_preprocessed.csv"),
):
    frame.to_csv(destination, index=False)