In [1]:
from models.gru import GRU
from preprocessing import Preprocessing
from utility.paths import DataPath

import pandas as pd
import numpy as np
from tqdm.auto import tqdm

2023-12-18 09:11:59.458954: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/thainamhoang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thainamhoang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thainamhoang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/thainamhoang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Build one Preprocessing wrapper per dataset: the training wrapper is given the
# negative and positive training files, the test wrapper is flagged with is_test.
train_prep = Preprocessing(
    [
        DataPath.TRAIN_NEG_FULL,
        DataPath.TRAIN_POS_FULL,
    ]
)
test_prep = Preprocessing(
    [DataPath.TEST],
    is_test=True,
)

In [3]:
# Hyper-parameters for the GRU run.
MAX_LEN = 120  # handed to the GRU wrapper as max_length
BATCH_SIZE = 128
EPOCHS = 10
EMBEDDING_DIM = 100  # chosen to match the GloVe embeddings in use

In [4]:
# Instantiate the project's GRU wrapper with its weight and submission paths,
# and with max_length set to MAX_LEN.
gru = GRU(
    weight_path=DataPath.GRU_WEIGHT,
    submission_path=DataPath.GRU_SUBMISSION,
    max_length=MAX_LEN,
)

In [5]:
# Ask the GRU wrapper which preprocessing steps it wants (as method names) and
# invoke each one, in order, on the training wrapper and then on the test wrapper.
train_steps = gru.preprocessing()
for step_name in tqdm(train_steps, desc="Preprocessing train data"):
    apply_step = getattr(train_prep, step_name)
    apply_step()

test_steps = gru.preprocessing(is_train=False)
for step_name in tqdm(test_steps, desc="Preprocessing test data"):
    apply_step = getattr(test_prep, step_name)
    apply_step()

Preprocessing train data:   0%|          | 0/12 [00:00<?, ?it/s]

Executing: `drop_duplicates`
Executing: `remove_ending`
Executing: `remove_extra_space`


100%|█████████████████████████████████████████████████████████████████████| 2268591/2268591 [00:03<00:00, 575771.10it/s]


Executing: `remove_space_before_symbol`


100%|█████████████████████████████████████████████████████████████████████| 2268591/2268591 [00:08<00:00, 255542.41it/s]


Executing: `remove_extra_space`


100%|█████████████████████████████████████████████████████████████████████| 2268591/2268591 [00:04<00:00, 564294.26it/s]


Executing: `reconstruct_last_emoji`
Executing: `emoji_to_tag`


100%|██████████████████████████████████████████████████████████████████████| 2268591/2268591 [00:26<00:00, 85703.19it/s]


Executing: `num_to_tag`
Executing: `hashtag_to_tag`
Executing: `repeat_symbols_to_tag`
Executing: `elongate_to_tag`
Executing: `remove_extra_space`


100%|█████████████████████████████████████████████████████████████████████| 2268591/2268591 [00:04<00:00, 524206.35it/s]


Preprocessing test data:   0%|          | 0/11 [00:00<?, ?it/s]

Executing: `remove_ending`
Executing: `remove_extra_space`


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 396556.99it/s]


Executing: `remove_space_before_symbol`


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 223453.10it/s]


Executing: `remove_extra_space`


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 526532.34it/s]


Executing: `reconstruct_last_emoji`
Executing: `emoji_to_tag`


100%|██████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 80297.58it/s]


Executing: `num_to_tag`
Executing: `hashtag_to_tag`
Executing: `repeat_symbols_to_tag`
Executing: `elongate_to_tag`
Executing: `remove_extra_space`


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 400174.03it/s]


In [6]:
# Retrieve the preprocessed DataFrames from the two Preprocessing objects.
# NOTE(review): `__get__` is called here as a plain accessor on the project's
# Preprocessing class — presumably it returns the wrapper's internal frame; confirm.
train_data = train_prep.__get__()
test_data = test_prep.__get__()

In [7]:
# Export the preprocessed frames to CSV (without the index column).
# NOTE(review): the training frame only receives a fresh 0..n-1 index here; no
# shuffling is performed in this cell — add one explicitly if it is required.
train_data = train_data.reset_index(drop=True)
train_data.to_csv(DataPath.GRU_TRAIN, index=False)

test_data.to_csv(DataPath.GRU_TEST, index=False)

In [8]:
# Reload a preprocessed training CSV from disk so it can be compared with train_data.
# NOTE(review): hard-coded path — presumably the same file as DataPath.GRU_TRAIN
# written earlier; confirm, and prefer the DataPath constant if so.
prep_train = pd.read_csv("data/preprocessed/gru/train_preprocessed.csv")

In [9]:
# Discard rows containing missing values from both the in-memory frame and the
# copy reloaded from CSV, rebinding each name to the cleaned frame.
train_data = train_data.dropna()
prep_train = prep_train.dropna()

In [10]:
# Summarise the in-memory training frame (row count, columns, dtypes).
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2268591 entries, 0 to 2268590
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   text    object
 1   label   int64 
dtypes: int64(1), object(1)
memory usage: 34.6+ MB


In [11]:
# Same summary for the frame reloaded from CSV, for comparison with train_data.
prep_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2268591 entries, 0 to 2268590
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   text    object 
 1   label   float64
dtypes: float64(1), object(1)
memory usage: 34.6+ MB


In [12]:
# Load the training files again through a fresh Preprocessing object, without
# running any preprocessing step, and summarise the frame after dropping rows
# with missing values.
og = Preprocessing(
    [DataPath.TRAIN_NEG_FULL, DataPath.TRAIN_POS_FULL]
).__get__()
og.dropna(inplace=True)
og.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500000 entries, 0 to 2499999
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   text    object
 1   label   int64 
dtypes: int64(1), object(1)
memory usage: 38.1+ MB


In [15]:
# Print each text from the in-memory frame directly above the text stored under
# the same index label in the reloaded frame, starting at label 1000, so the two
# can be compared by eye.
memory_text = train_data["text"]
reloaded_text = prep_train["text"]
window_size = len(memory_text[1000:1200])
for row_label in range(1000, 1000 + window_size):
    print(memory_text[row_label])
    print(reloaded_text[row_label])

<url> more than this ( vevo lift brought to you by mcdonald's
<url> more than this ( vevo lift brought to you by mcdonald's
i want to grow bigger ( fatter ) like so so badly
i want to grow bigger ( fatter ) like so so badly
i just woke up so <elong> sad <user>
i just woke up so <elong> sad <user>
everyday i had to bring food for my mates
everyday i had to bring food for my mates
palm lifedrive directsync - wall charger bundle ( uk outlet plug this sync and charge bundle includes one wall
palm lifedrive directsync - wall charger bundle ( uk outlet plug this sync and charge bundle includes one wall
am i not faithful enough to you . i don't deserve this !
am i not faithful enough to you . i don't deserve this !
<user> <user> <user> hahah you already got someone in our group already ! still want me ?
<user> <user> <user> hahah you already got someone in our group already ! still want me ?
my best friend <user> has another best friend
my best friend <user> has another best friend
i hate the

In [17]:
# List the distinct label values present in the reloaded training frame.
prep_train["label"].unique()

array([0., 1.])

In [18]:
import tensorflow as tf

In [20]:
# Load the saved Keras model stored under ./weights/gru_1.
# NOTE(review): the path is hard-coded; presumably it corresponds to
# DataPath.GRU_WEIGHT used when constructing the GRU wrapper — confirm.
model = tf.keras.saving.load_model("./weights/gru_1")

2023-12-18 14:15:32.592294: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 40 outputs. Output shapes may be inaccurate.
2023-12-18 14:15:32.882486: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 40 outputs. Output shapes may be inaccurate.
2023-12-18 14:15:32.893682: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond' has 4 outputs but the _output_shapes attribute specifies shapes for 40 outputs. Output shapes may be inaccurate.
2023-12-18 14:15:33.144558: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 40 outputs. Output shapes may be inaccurate.
2023-12-18 14:15:33.156132: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond' has 4 outputs but the _output_sh

KeyboardInterrupt: 

In [2]:
import pandas as pd

# Read a previously generated GRU submission file and preview its first rows.
sub = pd.read_csv(
    filepath_or_buffer="submissions/gru/submission_2023-12-18_14_04_34.csv",
)
sub.head(n=5)

Unnamed: 0,Id,Prediction
0,0,-1
1,1,-1
2,2,-1
3,3,1
4,4,-1


In [3]:
# Shift the 0-based Ids to 1-based.
# The guard makes this cell safe to re-run: once the smallest Id is no longer 0
# the shift is skipped instead of being applied a second time.
if sub["Id"].min() == 0:
    sub["Id"] = sub["Id"] + 1
sub.head()

Unnamed: 0,Id,Prediction
0,1,-1
1,2,-1
2,3,-1
3,4,1
4,5,-1


In [4]:
# Write the corrected submission back over the original file.
# "Unnamed: 0" is the old row index that read_csv picked up as a regular column;
# drop it so it is not written into the submission. errors="ignore" keeps the
# cell working when the column is already absent.
sub.drop(columns="Unnamed: 0", errors="ignore").to_csv(
    "submissions/gru/submission_2023-12-18_14_04_34.csv", index=False
)