Anggota Kelompok :

1. Reyza Rahmatsyah - 2540122716
2. Stefan Bondito Giovanno - 2540122041
3. Gregory Nicolla - 2501962340

# Libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.metrics import classification_report, confusion_matrix

# Import Data

In [2]:
import gdown
import zipfile
url = 'https://drive.google.com/drive/folders/1DwqeAZWO7qhog0Ar4btPMoL7EZljYQDX?usp=sharing'
gdown.download_folder(url)

Retrieving folder contents


Processing file 1JG2i33getB48m0Eel14oUguV_cI3JI6U archive.zip
Processing file 1RJWEjAD3U5CWyUz4X9ZpH4EAbcAWPjD0 stratified_sample.zip


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From (original): https://drive.google.com/uc?id=1JG2i33getB48m0Eel14oUguV_cI3JI6U
From (redirected): https://drive.google.com/uc?id=1JG2i33getB48m0Eel14oUguV_cI3JI6U&confirm=t&uuid=267a5610-e534-4771-844c-2595510196cb
To: /content/Dataset/archive.zip
100%|██████████| 58.7M/58.7M [00:01<00:00, 49.6MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1RJWEjAD3U5CWyUz4X9ZpH4EAbcAWPjD0
From (redirected): https://drive.google.com/uc?id=1RJWEjAD3U5CWyUz4X9ZpH4EAbcAWPjD0&confirm=t&uuid=0f319668-f53d-40cb-ad2b-dd2244292917
To: /content/Dataset/stratified_sample.zip
100%|██████████| 62.8M/62.8M [00:01<00:00, 57.7MB/s]
Download completed


['/content/Dataset/archive.zip', '/content/Dataset/stratified_sample.zip']

In [3]:
!unzip /content/Dataset/stratified_sample.zip

Archive:  /content/Dataset/stratified_sample.zip
 extracting: stratified_sample.csv   


# Read Data

In [4]:
df = pd.read_csv('stratified_sample.csv', encoding='latin-1')
df.head()

Unnamed: 0,Class,Description
0,Lung_Cancer,"""(a) shows a frame from the 4D acquisition fro..."
1,Colon_Cancer,""" tumor associated macrophages tam constitut..."
2,Thyroid_Cancer,Thyroid surgery in children in a single insti...
3,Thyroid_Cancer,peripheral serum metabolomic profiles inform ...
4,Lung_Cancer,"""Accordingly the protease inhibitor E-64d part..."


# 1. Data Preprocessing

## a. Label Encoding

In [5]:
## Label Encoder
label_encode = {"Class": {"Lung_Cancer":0,
                          "Colon_Cancer":1,
                          "Thyroid_Cancer":2}}
df = df.replace(label_encode)

In [None]:
df.head()

Unnamed: 0,Class,Description
0,0,"""(a) shows a frame from the 4D acquisition fro..."
1,1,""" tumor associated macrophages tam constitut..."
2,2,Thyroid surgery in children in a single insti...
3,2,peripheral serum metabolomic profiles inform ...
4,0,"""Accordingly the protease inhibitor E-64d part..."


## b. Data Cleaning

In [6]:
## Simple Cleaning Function
def cleaning(dataframe):
    df_clean = dataframe.str.lower()
    df_clean = [re.sub(r"\d+","",i )for i in df_clean]
    df_clean = [re.sub(r'[^\w]', ' ', i)for i in df_clean]
    df_clean = [re.sub(r'_',' ',i)for i in df_clean]
    df_clean = [re.sub(r'\s+',' ',i)for i in df_clean]

    return df_clean

In [7]:
clean_desc = cleaning(df['Description'])

In [8]:
df.insert(2, 'Clean Description', clean_desc)

In [9]:
df.head()

Unnamed: 0,Class,Description,Clean Description
0,0,"""(a) shows a frame from the 4D acquisition fro...",a shows a frame from the d acquisition from p...
1,1,""" tumor associated macrophages tam constitut...",tumor associated macrophages tam constitute t...
2,2,Thyroid surgery in children in a single insti...,thyroid surgery in children in a single instit...
3,2,peripheral serum metabolomic profiles inform ...,peripheral serum metabolomic profiles inform ...
4,0,"""Accordingly the protease inhibitor E-64d part...",accordingly the protease inhibitor e d partia...


## c. Data Splitting

In [10]:
## Splitting
train_df, test_df = train_test_split(df, test_size = 0.2, random_state = 41)
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

## d. Tokenization

In [11]:
 nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
def tokenizing(text):
  return word_tokenize(text)

train_df['tokenized'] = train_df['Clean Description'].apply(tokenizing)
test_df['tokenized'] = test_df['Clean Description'].apply(tokenizing)

In [13]:
train_df.head()

Unnamed: 0,Class,Description,Clean Description,tokenized
0,0,"""Actin was probed by antibody from Sigma (cata...",actin was probed by antibody from sigma catal...,"[actin, was, probed, by, antibody, from, sigma..."
1,1,high throughput methods in biological and bi...,high throughput methods in biological and bio...,"[high, throughput, methods, in, biological, an..."
2,0,"""Background: This study evaluated the efficacy...",background this study evaluated the efficacy ...,"[background, this, study, evaluated, the, effi..."
3,0,"""The LSCC tissue microarrays contain 75 matche...",the lscc tissue microarrays contain matched p...,"[the, lscc, tissue, microarrays, contain, matc..."
4,1,""" ovarian cancer is the leading cause of canc...",ovarian cancer is the leading cause of cancer...,"[ovarian, cancer, is, the, leading, cause, of,..."


In [14]:
test_df.head()

Unnamed: 0,Class,Description,Clean Description,tokenized
0,2,"""Accumulating evidence has revealed the critic...",accumulating evidence has revealed the critic...,"[accumulating, evidence, has, revealed, the, c..."
1,0,"""The probandÂs father (II-5) and sister (III-...",the probandâ s father ii and sister iii were ...,"[the, probandâ, s, father, ii, and, sister, ii..."
2,1,""" it is estimated that around Â of patients ...",it is estimated that around â of patients wit...,"[it, is, estimated, that, around, â, of, patie..."
3,2,Microbial colonisation of the gastrointestinal...,microbial colonisation of the gastrointestinal...,"[microbial, colonisation, of, the, gastrointes..."
4,0,"""In this study we identified a novel pathway f...",in this study we identified a novel pathway f...,"[in, this, study, we, identified, a, novel, pa..."


## e. Filtering

In [15]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
stopwords_list = set(stopwords.words('english'))

## Function to Remove Stopwords
def stopword_removal(tokenized):
  return [word for word in tokenized if word.lower() not in stopwords_list]

train_df['filtered'] = train_df['tokenized'].apply(stopword_removal)
test_df['filtered'] = test_df['tokenized'].apply(stopword_removal)

In [17]:
train_df.head()

Unnamed: 0,Class,Description,Clean Description,tokenized,filtered
0,0,"""Actin was probed by antibody from Sigma (cata...",actin was probed by antibody from sigma catal...,"[actin, was, probed, by, antibody, from, sigma...","[actin, probed, antibody, sigma, catalog, gene..."
1,1,high throughput methods in biological and bi...,high throughput methods in biological and bio...,"[high, throughput, methods, in, biological, an...","[high, throughput, methods, biological, biomed..."
2,0,"""Background: This study evaluated the efficacy...",background this study evaluated the efficacy ...,"[background, this, study, evaluated, the, effi...","[background, study, evaluated, efficacy, safet..."
3,0,"""The LSCC tissue microarrays contain 75 matche...",the lscc tissue microarrays contain matched p...,"[the, lscc, tissue, microarrays, contain, matc...","[lscc, tissue, microarrays, contain, matched, ..."
4,1,""" ovarian cancer is the leading cause of canc...",ovarian cancer is the leading cause of cancer...,"[ovarian, cancer, is, the, leading, cause, of,...","[ovarian, cancer, leading, cause, cancerrelate..."


In [18]:
test_df.head()

Unnamed: 0,Class,Description,Clean Description,tokenized,filtered
0,2,"""Accumulating evidence has revealed the critic...",accumulating evidence has revealed the critic...,"[accumulating, evidence, has, revealed, the, c...","[accumulating, evidence, revealed, critical, r..."
1,0,"""The probandÂs father (II-5) and sister (III-...",the probandâ s father ii and sister iii were ...,"[the, probandâ, s, father, ii, and, sister, ii...","[probandâ, father, ii, sister, iii, unaffected..."
2,1,""" it is estimated that around Â of patients ...",it is estimated that around â of patients wit...,"[it, is, estimated, that, around, â, of, patie...","[estimated, around, â, patients, early, stage,..."
3,2,Microbial colonisation of the gastrointestinal...,microbial colonisation of the gastrointestinal...,"[microbial, colonisation, of, the, gastrointes...","[microbial, colonisation, gastrointestinal, tr..."
4,0,"""In this study we identified a novel pathway f...",in this study we identified a novel pathway f...,"[in, this, study, we, identified, a, novel, pa...","[study, identified, novel, pathway, sp, mediat..."


# 2. Text Representation

## a. Word2Vec - Skipgram

In [19]:
import gensim
from gensim.models import Word2Vec

In [20]:
## Create Skipgram Model
skipgram_model = gensim.models.Word2Vec(train_df['tokenized'],
                                        min_count = 3,
                                        vector_size = 50,
                                        window = 5, sg=1)

In [21]:
## Making the vocabulary for skipgram from train data
vocabulary_skipgram = skipgram_model.wv.index_to_key
print(vocabulary_skipgram)



In [22]:
word_vec_dict={}
for word in vocabulary_skipgram:
    word_vec_dict[word]=skipgram_model.wv.get_vector(word)
print("The no of key-value pairs : ",len(word_vec_dict))

The no of key-value pairs :  119486


In [23]:
import tensorflow as tf
from keras.preprocessing.text import one_hot, Tokenizer

In [24]:
word_vectors = skipgram_model.wv
vocab_size = len(word_vectors.key_to_index) + 1 # +1 for padding
embed_dim=word_vectors.vector_size

tok = Tokenizer()
tok.fit_on_texts(train_df["filtered"])
sequences = tok.texts_to_sequences(train_df["filtered"])
max_length= max(len(seq) for seq in sequences) # Max length of sentence

In [25]:
## Making the embed_matrix
embed_matrix=np.zeros(shape=(vocab_size, embed_dim))
for word,i in tok.word_index.items():
    embed_vector = word_vec_dict.get(word)
    if embed_vector is not None:
        embed_matrix[i]=embed_vector

In [26]:
from keras.preprocessing.sequence import pad_sequences
pad_rev = pad_sequences(sequences, maxlen=max_length, padding='post')
pad_rev.shape

(2000, 4537)

In [27]:
embed_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.00898825,  0.42603001,  0.41066921, ..., -0.44812906,
        -0.5493499 ,  0.24328987],
       [-0.47797915, -0.36705509,  0.33232462, ...,  0.51817495,
        -0.08687739,  0.13122916],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

# 3. Data Modeling

## ANN Model

### a. Training

In [32]:
model2 = keras.Sequential()
model2.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length,
                    weights=[embed_matrix], embeddings_initializer=Constant(embed_matrix),
                    trainable=False))

model2.add(layers.Dense(128, activation='relu'))
model2.add(layers.Dropout(0.5))
model2.add(layers.Dense(64, activation='relu'))
model2.add(layers.Dropout(0.5))
model2.add(layers.Dense(32, activation='relu'))
model2.add(layers.Dropout(0.5))

model2.add(layers.Flatten())
model2.add(layers.Dense(3, activation="softmax"))
print(model2.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4537, 50)          5974350   
                                                                 
 dense_4 (Dense)             (None, 4537, 128)         6528      
                                                                 
 dropout (Dropout)           (None, 4537, 128)         0         
                                                                 
 dense_5 (Dense)             (None, 4537, 64)          8256      
                                                                 
 dropout_1 (Dropout)         (None, 4537, 64)          0         
                                                                 
 dense_6 (Dense)             (None, 4537, 32)          2080      
                                                                 
 dropout_2 (Dropout)         (None, 4537, 32)         

In [33]:
model2.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [34]:
model2.fit(pad_rev, train_df["Class"], epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7db41e0f6740>

In [35]:
train_predict2 = model2.predict(pad_rev)
train_pred_res2 = np.argmax(train_predict2, axis=1)



In [36]:
print(classification_report(train_df['Class'], train_pred_res2))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       571
           1       0.96      0.97      0.96       701
           2       0.99      0.93      0.96       728

    accuracy                           0.96      2000
   macro avg       0.96      0.96      0.96      2000
weighted avg       0.96      0.96      0.96      2000



### b. Testing

In [37]:
## Tokenize word for test data
word_token_test = [word_tokenize(i) for i in test_df['Clean Description']]
# Remove stopwords from test
filtered_tokens_test = [[word for word in sublist if word not in stopwords_list] for sublist in word_token_test]
tok.fit_on_texts(filtered_tokens_test)
encd_rev_test = tok.texts_to_sequences(filtered_tokens_test)
pad_rev_test= pad_sequences(encd_rev_test, maxlen=max_length, padding='post')
pad_rev_test.shape

(500, 4537)

In [38]:
test_predict2 = model2.predict(pad_rev_test)
res_pred2 = np.argmax(test_predict2, axis=1)




In [39]:
res_pred2

array([2, 0, 1, 2, 0, 0, 2, 0, 0, 1, 0, 2, 1, 0, 2, 1, 1, 0, 0, 2, 0, 2,
       2, 2, 1, 1, 2, 1, 0, 0, 0, 1, 0, 2, 2, 1, 0, 0, 2, 1, 2, 0, 0, 1,
       0, 1, 2, 2, 1, 1, 1, 2, 1, 0, 0, 2, 1, 1, 1, 1, 2, 0, 2, 2, 1, 0,
       1, 1, 1, 0, 0, 1, 2, 0, 1, 0, 2, 2, 2, 0, 2, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 2, 1, 1, 0, 1, 1, 0, 0, 2, 1, 1, 0, 1, 2,
       1, 1, 2, 1, 2, 0, 0, 2, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 2, 2, 0, 2, 1, 2, 0, 2, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 0,
       1, 0, 1, 0, 1, 0, 2, 2, 0, 0, 0, 2, 0, 2, 1, 1, 0, 1, 0, 0, 0, 2,
       2, 2, 1, 1, 0, 0, 0, 2, 0, 1, 1, 2, 2, 0, 1, 2, 1, 0, 2, 2, 0, 0,
       1, 0, 2, 2, 2, 0, 2, 1, 1, 2, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 1, 0, 1, 0, 0, 2, 0, 2, 2, 1, 0, 1,
       1, 1, 1, 2, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 0, 0, 1, 2, 0, 2,
       2, 0, 0, 0, 1, 1, 0, 2, 2, 1, 0, 0, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 2, 2, 2, 1, 0, 0, 2, 0, 2, 2, 0, 0,

### c. Pred Test Data

In [40]:
print('\nClassification Report 2\n')
print(classification_report(test_df['Class'], res_pred2))


Classification Report 2

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       149
           1       0.66      0.84      0.74       151
           2       0.89      0.60      0.72       200

    accuracy                           0.77       500
   macro avg       0.79      0.79      0.78       500
weighted avg       0.80      0.77      0.77       500



## LSTM Model

In [28]:
from tensorflow import keras
from tensorflow.keras import layers
from keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Activation, Flatten

### a. Training

In [None]:
model = keras.Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_dim, input_length=max_length,
                    weights=[embed_matrix], embeddings_initializer=Constant(embed_matrix),
                    trainable=False))

model.add(layers.LSTM(128, return_sequences=True))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.5))

model.add(layers.Flatten())
model.add(layers.Dense(3, activation="softmax"))
print(model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 4537, 50)          5974350   
                                                                 
 lstm_4 (LSTM)               (None, 4537, 128)         91648     
                                                                 
 dropout_7 (Dropout)         (None, 4537, 128)         0         
                                                                 
 dense_12 (Dense)            (None, 4537, 64)          8256      
                                                                 
 dropout_8 (Dropout)         (None, 4537, 64)          0         
                                                                 
 dense_13 (Dense)            (None, 4537, 32)          2080      
                                                                 
 dropout_9 (Dropout)         (None, 4537, 32)         

In [None]:
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [None]:
model.fit(pad_rev, train_df["Class"], epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7921bbef2350>

In [None]:
train_predict = model.predict(pad_rev)



In [None]:
train_pred_res = np.argmax(train_predict, axis=1)

In [None]:
print('\nClassification Report\n')
print(classification_report(train_df['Class'], train_pred_res))


Classification Report

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       571
           1       0.96      0.90      0.93       701
           2       0.92      0.96      0.94       728

    accuracy                           0.95      2000
   macro avg       0.95      0.95      0.95      2000
weighted avg       0.95      0.95      0.95      2000



### b. Testing

In [None]:
## Tokenize word for test data
word_token_test = [word_tokenize(i) for i in test_df['Clean Description']]
# Remove stopwords from test
filtered_tokens_test = [[word for word in sublist if word not in stopwords_list] for sublist in word_token_test]

In [None]:
tok.fit_on_texts(filtered_tokens_test)
encd_rev_test = tok.texts_to_sequences(filtered_tokens_test)

In [None]:
pad_rev_test= pad_sequences(encd_rev_test, maxlen=max_length, padding='post')
pad_rev_test.shape

(500, 4537)

### c. Pred Test Data

In [None]:
test_predict = model.predict(pad_rev_test)
res_pred = np.argmax(test_predict, axis=1)




In [None]:
res_pred

array([1, 0, 1, 2, 0, 1, 1, 0, 0, 2, 0, 2, 1, 0, 2, 2, 1, 0, 0, 2, 0, 2,
       1, 1, 1, 2, 0, 2, 2, 1, 0, 2, 0, 1, 2, 2, 0, 0, 1, 2, 2, 0, 0, 1,
       0, 1, 2, 2, 1, 1, 1, 2, 1, 0, 0, 2, 2, 2, 2, 1, 2, 0, 1, 1, 1, 0,
       2, 1, 1, 0, 0, 2, 2, 0, 1, 0, 2, 1, 2, 0, 2, 2, 2, 0, 1, 1, 1, 0,
       2, 2, 2, 2, 1, 0, 1, 1, 2, 1, 1, 0, 1, 1, 0, 0, 2, 1, 1, 0, 1, 2,
       1, 1, 2, 1, 2, 0, 0, 2, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 2, 2, 1,
       0, 2, 2, 0, 2, 1, 2, 0, 2, 1, 0, 1, 2, 2, 1, 1, 1, 0, 1, 1, 2, 0,
       2, 0, 1, 0, 2, 0, 2, 0, 1, 0, 0, 2, 0, 1, 2, 1, 0, 1, 0, 2, 1, 1,
       2, 2, 1, 1, 0, 0, 0, 2, 0, 1, 2, 2, 1, 0, 1, 2, 1, 0, 2, 1, 0, 0,
       1, 0, 2, 2, 2, 0, 1, 1, 2, 2, 1, 0, 1, 0, 0, 0, 2, 2, 1, 1, 0, 1,
       0, 2, 0, 0, 1, 2, 1, 1, 1, 2, 1, 0, 1, 0, 2, 1, 0, 1, 2, 2, 0, 1,
       2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 0, 1, 1, 1, 2, 2, 1, 2, 2, 1, 0, 2,
       2, 2, 0, 0, 1, 1, 0, 2, 2, 2, 2, 0, 2, 2, 0, 1, 1, 1, 1, 1, 1, 2,
       1, 2, 1, 1, 2, 2, 1, 0, 0, 2, 0, 2, 2, 0, 0,

In [None]:
print('\nClassification Report\n')
print(classification_report(test_df['Class'], res_pred))


Classification Report

              precision    recall  f1-score   support

           0       0.89      0.92      0.90       149
           1       0.65      0.68      0.66       151
           2       0.74      0.69      0.72       200

    accuracy                           0.76       500
   macro avg       0.76      0.77      0.76       500
weighted avg       0.76      0.76      0.76       500

