In [None]:
# Install additional packages to handle lemmatization.
!pip install symspellpy vaderSentiment

Collecting symspellpy
  Downloading symspellpy-6.7.7-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting editdistpy>=0.1.3 (from symspellpy)
  Downloading editdistpy-0.1.3.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: editdistpy
  Building wheel for editdistpy (pyproject.toml) ... [?25l[?25hdone
  Created wheel for editdistpy: filename=editdistpy-0.1.3-cp310-c

In [None]:
# Make the project folder the working directory, then echo the resulting path.
# NOTE(review): the path looks like a Colab Google Drive mount; no mount step is visible
# in this notebook, so the drive presumably has to be mounted beforehand — confirm.
import os

project_root = "/content/drive/MyDrive/ML Project 2"
os.chdir(project_root)
os.getcwd()

'/content/drive/MyDrive/ML Project 2'

## Import dependencies

In [None]:
from utility.paths import DataPath
from preprocessing import Preprocessing
from models.gru import GRU

import pandas as pd
from tqdm.auto import tqdm

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Build one Preprocessing object per data split.
# The training object receives the TRAIN_NEG_FULL and TRAIN_POS_FULL paths; the test
# object receives the TEST path and is flagged with is_test=True.
train_sources = [DataPath.TRAIN_NEG_FULL, DataPath.TRAIN_POS_FULL]
train_prep = Preprocessing(train_sources)

test_sources = [DataPath.TEST]
test_prep = Preprocessing(test_sources, is_test=True)

In [None]:
# Declare params for GRU.
# Value passed as `max_length` when the GRU model object is constructed.
MAX_LEN = 256
# Forwarded to `gru.train(...)` as `batch_size` and `epochs` respectively.
BATCH_SIZE = 128
EPOCHS = 10
# NOTE(review): not referenced by any other visible cell of this notebook — confirm
# whether the GRU class takes the embedding size from somewhere else.
EMBEDDING_DIM = 100 # Since we're using GloVe

In [None]:
# Instantiate the project's GRU wrapper with the weight path, the submission path and
# the maximum sequence length declared above.
gru = GRU(
    weight_path=DataPath.GRU_WEIGHT,
    submission_path=DataPath.GRU_SUBMISSION,
    max_length=MAX_LEN,
)

## Preprocessing

In [None]:
# Apply the preprocessing steps requested by the GRU object, in the order it yields them.
# Every step is the name of a method on the Preprocessing object; the method is looked up
# by name and called without arguments.
train_steps = gru.preprocessing()
for step_name in tqdm(train_steps, desc="Preprocessing train data"):
    run_step = getattr(train_prep, step_name)
    run_step()

# Same procedure for the test data, using the steps returned for is_train=False.
test_steps = gru.preprocessing(is_train=False)
for step_name in tqdm(test_steps, desc="Preprocessing test data"):
    run_step = getattr(test_prep, step_name)
    run_step()

Preprocessing train data:   0%|          | 0/14 [00:00<?, ?it/s]

Executing: `drop_duplicates`
Executing: `remove_ending`
Executing: `remove_extra_space`


100%|██████████| 2268591/2268591 [00:01<00:00, 1156548.06it/s]


Executing: `remove_space_around_emoji`
Executing: `remove_extra_space`


100%|██████████| 2268591/2268591 [00:01<00:00, 1178470.91it/s]


Executing: `reconstruct_emoji`


100%|██████████| 2268591/2268591 [00:25<00:00, 88042.22it/s] 


Executing: `remove_extra_space`


100%|██████████| 2268591/2268591 [00:02<00:00, 862362.81it/s]


Executing: `emoji_to_tag`


100%|██████████| 2268591/2268591 [00:37<00:00, 59732.05it/s]


Executing: `reconstruct_last_emoji`
Executing: `num_to_tag`
Executing: `hashtag_to_tag`
Executing: `repeat_symbols_to_tag`
Executing: `elongate_to_tag`
Executing: `remove_extra_space`


100%|██████████| 2268591/2268591 [00:02<00:00, 1024107.87it/s]


Preprocessing test data:   0%|          | 0/13 [00:00<?, ?it/s]

Executing: `remove_ending`
Executing: `remove_extra_space`


100%|██████████| 10000/10000 [00:00<00:00, 766418.89it/s]

Executing: `remove_space_around_emoji`





Executing: `remove_extra_space`


100%|██████████| 10000/10000 [00:00<00:00, 798960.70it/s]


Executing: `reconstruct_emoji`


100%|██████████| 10000/10000 [00:00<00:00, 93891.96it/s]


Executing: `remove_extra_space`


100%|██████████| 10000/10000 [00:00<00:00, 858626.38it/s]


Executing: `emoji_to_tag`


100%|██████████| 10000/10000 [00:00<00:00, 72363.25it/s]


Executing: `reconstruct_last_emoji`
Executing: `num_to_tag`
Executing: `hashtag_to_tag`
Executing: `repeat_symbols_to_tag`
Executing: `elongate_to_tag`
Executing: `remove_extra_space`


100%|██████████| 10000/10000 [00:00<00:00, 486990.61it/s]


In [None]:
# Retrieve the preprocessed df.
# NOTE(review): `__get__` is invoked here as an ordinary zero-argument method. In Python
# that name normally belongs to the descriptor protocol, so a plainly named accessor on
# Preprocessing would be less surprising — confirm against the class definition.
# Presumably both calls return pandas DataFrames (`.sample` / `.to_csv` are called on the
# results in the export cell that follows).
train_data = train_prep.__get__()
test_data = test_prep.__get__()

In [None]:
# Export the dataframes as CSV files without the index column.
# The training rows are shuffled first (presumably so the file is not ordered by source
# file — confirm). A fixed seed makes the shuffle, and therefore the exported file,
# identical on every Restart & Run All.
SHUFFLE_SEED = 42
train_data = train_data.sample(frac=1, random_state=SHUFFLE_SEED)
train_data.to_csv(DataPath.GRU_TRAIN, index=False)

# The test rows are exported as they are (no shuffling).
test_data.to_csv(DataPath.GRU_TEST, index=False)

## Training

In [None]:
# Read the training dataframe back from the CSV written by the export cell (rows already
# shuffled there) and preview its first rows.
train_df = pd.read_csv(DataPath.GRU_TRAIN)
train_df.head()

Unnamed: 0,text,label
0,i couldnt be happier <heart>,1.0
1,""" <user> why am i so <elong> tired "" maybe yu ...",0.0
2,yay ! ! ! rt <user> im gettting my award today...,1.0
3,<user> okie dokie ! try to stick around i wann...,1.0
4,not gonna speak to anyone at work the next cou...,0.0


In [None]:
# Discard every row that contains a missing value, then print a summary of the frame.
train_df = train_df.dropna()
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2268591 entries, 0 to 2268590
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   text    object 
 1   label   float64
dtypes: float64(1), object(1)
memory usage: 34.6+ MB


### Splitting data

In [None]:
# Extract the model inputs (`text` column) and targets (`label` column) as arrays
# to feed into the GRU.
X = train_df['text'].values
y = train_df['label'].values

### Training Loop

In [None]:
# Update vocabulary for GRU embedding
# (built from the same training texts that are passed to `train` below; presumably it has
# to run before training — confirm against the GRU class).
gru.update_vocabulary(X)

# Start the training process
# Batch size and number of epochs come from the constants declared earlier in the notebook.
gru.train(X, y, batch_size=BATCH_SIZE, epochs=EPOCHS)

Executing: `update_vocabulary`
Vocabulary size: 439824
Executing: `padding`
Executing: `padding`
Executing: `generate_embedding_matrix`


Loading GloVe: 0it [00:00, ?it/s]

Found 1193514 word vectors


Generating embedding matrix:   0%|          | 0/439822 [00:00<?, ?it/s]

Converted 172879 words (266943 missing)
Executing: `build_model`
Model summary
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 256, 100)          43982400  
                                                                 
 bidirectional (Bidirection  (None, 200)               121200    
 al)                                                             
                                                                 
 dense (Dense)               (None, 100)               20100     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                                 
Total params: 44123801 (168.32 MB)
Trainable params: 141401 (552.35 KB)
Non-trainable params: 43982400 (167.78 MB)
____________________________________________________________

  saving_api.save_model(


Saving weights


## Predicting

In [None]:
# Read preprocessed test data back from the CSV written by the export cell and preview
# its first rows.
test_df = pd.read_csv(DataPath.GRU_TEST)
test_df.head()

Unnamed: 0,ids,text
0,1,sea doo pro sea scooter <sadface> sports with ...
1,2,<user> shucks well i work all week so now i ca...
2,3,i cant stay away from bug thats my baby
3,4,<user> no ma'am ! ! ! lol im perfectly fine an...
4,5,"whenever i fall asleep watching the tv , i alw..."


In [None]:
# Retrieve the `text` column for predicting.
# A text that is empty in the CSV is read back by pandas as NaN. The training frame gets
# rid of such rows with dropna, but test rows presumably all need a prediction, so missing
# texts are replaced with empty strings instead of being dropped (row count is unchanged).
X_test = test_df["text"].fillna("")
# NOTE(review): per the closing note of this notebook, predict produces a submission file
# under `submissions/gru` — confirm against the GRU class.
gru.predict(X_test)

Executing: `padding`


The resulting submission file (`submission_2023-12-18_14:04:34.csv`) can be found in `submissions/gru`. Once submitted to AICrowd, it yields a score of `0.865` and a secondary score of `0.866` (Submission #247060).