# Path

In [None]:
# Global path: root folder from which every other path in this cell is derived.
# NOTE(review): "/content/drive/MyDrive" looks like a Google Drive mount inside
# Colab -- confirm, and adjust this single constant when running elsewhere.
GLOBAL_PATH = "/content/drive/MyDrive/CS433"

# GloVe: text file of pre-trained word vectors (one word and its vector per line)
GLOVE_PATH = f"{GLOBAL_PATH}/data/glove.twitter.27B.100d.txt"

# Train full: negative / positive training tweets, "full" variant of the files
TRAIN_NEG_FULL_PATH = f"{GLOBAL_PATH}/data/train_neg_full.txt"
TRAIN_POS_FULL_PATH = f"{GLOBAL_PATH}/data/train_pos_full.txt"

# Train: negative / positive training tweets
TRAIN_NEG_PATH = f"{GLOBAL_PATH}/data/train_neg.txt"
TRAIN_POS_PATH = f"{GLOBAL_PATH}/data/train_pos.txt"

# Test: tweets to predict (read as "<id>,<text>" lines in submission mode)
TEST_PATH = f"{GLOBAL_PATH}/data/test_data.txt"

# Preprocessed data: CSV locations for the preprocessed train / test sets
TRAIN_PREP_PATH = f"{GLOBAL_PATH}/data/preprocessed/train_gru.csv"
TEST_PREP_PATH = f"{GLOBAL_PATH}/data/preprocessed/test_gru.csv"

# Weight: location used by the models to save and reload their trained state
WEIGHT_PATH = f"{GLOBAL_PATH}/weight"

# Preprocessing

In [None]:
!pip install symspellpy

Collecting symspellpy
  Downloading symspellpy-6.7.7-py3-none-any.whl (2.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting editdistpy>=0.1.3 (from symspellpy)
  Downloading editdistpy-0.1.3.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.2/57.2 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: editdistpy
  Building wheel for editdistpy (pyproject.toml) ... [?25l[?25hdone
  Created wheel for editdistpy: filename=editdistpy-0.1.3-cp310-cp310-linux_x86_64.whl size=187460 sha256=78ecfbfeeb9df2ca63154b84ec16b1cbfcb9daf21e7e7b11822c8f1008f6ef7f
  Stored in directory: /root/.cache/pip/wheels/88/6a/a6/a1283cc145323a1fb3d475bd158ee60b248ab1985230d266fc
Success

In [None]:
import pkg_resources
import nltk
import re
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from symspellpy import SymSpell # Fuzzy search and word correction

In [None]:
# Mapping {tag: [emoticon spellings]} used by the preprocessing classes to
# rewrite emoticons as tags (e.g. ':)' -> '<smile>').
# Backslashes are escaped inside the literals ('\\o/' is the text \o/).
# NOTE(review): alphabetic emoticons are listed in lowercase only (':d', ':p');
# confirm whether the text is lowercased before these are matched.
EMOTICONS_GLOVE = {
  '<smile>': [':-]', '0:3', '8)', ":'-)", '>^_^<', '(^_^)', "(';')", ':*',
    '(^^)/', ':)', ':>', '(*_*)', '(^^)v', '=3', ':}', ';^)', ':->', '^_^;',
    '=)', '(^o^)', '*)', '(^.^)', '^_^', '\\o/', '^5', '(__)', '(#^.^#)', '0:)',
    '(^^)', ';]', ':-*', ':^)', ':3', '(+_+)', ';)', ":')", '(:', ':-3', ':-}',
    ';-)', ':-)', ':]', '*-)', 'o/\\o', '=]', '(^_-)', '8-)', ':o)', ':c)',
    '(^_^)/', '(o.o)', ':o', '>:)', '8-0', ':-0', ';3', '>:3', '3:)', ':-o',
    '}:)', 'o_0', '^^;', 'xx', 'xxx', '^o^', ':d', ' c:'],
  '<lolface>': [':-p', ':p', ':b', ':-b', 'x-p', '=p'],
  '<heart>': ['<3'],
  '<neutralface>': ['=\\', '>:/', '(..)', '(._.)', ':-/', ':|', '>.<', ':-.',
    "('_')", '=/', ':/', ':#', '(-_-)', 'o-o', 'o_o', ':$', '>:\\', ':@', ':-|',
    '><>', '(-.-)', ':\\', '<+', ':-@'],
  '<sadface>': [';(', '(~_~)', ':c', ':[', ':-&', ':(', '>:[', ':&', ':-c',
    ';n;', ":'(", ';;', ':-[', ';-;', '%)', ':<', '<\\3', ':{', ';_;', '=(',
    'v.v', 'm(__)m', '</3', ":'-(", ':-<']
}


In [None]:
# NLTK resources needed by the preprocessing code in this notebook:
#   - averaged_perceptron_tagger: model behind `nltk.pos_tag`
#   - stopwords: the English stop-word list
#   - wordnet: data used by `WordNetLemmatizer`
nltk.download("averaged_perceptron_tagger")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def function_name(func):
    """
    Decorator that prints the name of ``func`` every time it is called.

    The wrapper forwards all positional and keyword arguments unchanged and
    returns whatever ``func`` returns.

    :param func: the callable to wrap
    :return: a wrapper that keeps ``func``'s ``__name__`` and ``__doc__``
    """
    # Local import so this cell does not depend on the notebook's import cell.
    import functools

    # Without `wraps`, every decorated method would report itself as "wrapper"
    # and lose its docstring, which breaks introspection and `help()`.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        print(f"Executing function: `{func.__name__}`")
        return func(*args, **kwargs)

    return wrapper

In [None]:
class Preprocessing:
    """
    Loads tweets into a pandas DataFrame and exposes text-cleaning steps.

    Every step rewrites the ``text`` column of the wrapped DataFrame and is
    decorated with ``function_name``, so it announces itself when called.
    """

    def __init__(self, path_ls: list[str], is_submission: bool = False) -> None:
        """
        Builds the DataFrame from text files.

        - ``is_submission=False``: every file in ``path_ls`` is read line by
          line; the rows get the position of their file in ``path_ls`` as
          ``label`` and a ``raw_text`` copy of the untouched text is kept.
        - ``is_submission=True``: only ``path_ls[0]`` is read and every line is
          split at its first comma into ``ids`` and ``text``.

        :param path_ls: paths of the .txt files to load
        :param is_submission: whether the file holds "<id>,<text>" lines
        """
        if is_submission:
            with open(path_ls[0]) as f:
                content = f.read().splitlines()

            # Only the first comma separates the id; the tweet may contain more.
            ids, text = [], []
            for line in content:
                tweet_id, _, tweet = line.partition(",")
                ids.append(tweet_id)
                text.append(tweet)

            self.__df = pd.DataFrame({"ids": ids, "text": text})
        else:
            frames = []

            for i, path in enumerate(path_ls):
                with open(path) as f:
                    content = f.read().splitlines()

                frames.append(
                    pd.DataFrame({"text": content, "label": np.ones(len(content)) * i})
                )

            # `DataFrame.append` no longer exists in pandas >= 2.0; a single
            # concat is also cheaper than growing the frame file by file.
            if frames:
                self.__df = pd.concat(frames, ignore_index=True)
            else:
                self.__df = pd.DataFrame(columns=["text", "label"])

            self.__df["raw_text"] = self.__df["text"]

    def get(self) -> pd.DataFrame:
        """Returns the wrapped DataFrame."""
        return self.__df

    @function_name
    def lower(self):
        """Converts every tweet to lowercase."""
        self.__df["text"] = self.__df["text"].str.lower()

    @function_name
    def drop_duplicates(self):
        """Drops rows whose ``text`` is duplicated (the index is left as is)."""
        # Explicit copy so that later column assignments work on an independent
        # frame instead of on a selection of the previous one.
        self.__df = self.__df.drop_duplicates(subset=["text"]).copy()

    @function_name
    def remove_tags(self):
        """Removes <...> tags (e.g. <user>, <url>) and a trailing '...'."""
        self.__df["text"] = self.__df["text"].str.replace(r"<[\w]*>", "", regex=True)
        self.__df["text"] = self.__df["text"].str.strip()
        self.__df["text"] = self.__df["text"].str.replace(r"\.{3}$", "", regex=True)

    @function_name
    def remove_number(self):
        """Removes every digit."""
        self.__df["text"] = self.__df["text"].str.replace(r"\d", "", regex=True)

    @function_name
    def remove_punctuation(self):
        """Removes every character that is neither a word character nor whitespace."""
        self.__df["text"] = self.__df["text"].str.replace(r"[^\w\s]", "", regex=True)

    @function_name
    def remove_stopword(self):
        """Removes English stop words (tweets are split on whitespace)."""
        stopword_list = set(stopwords.words("english"))
        self.__df["text"] = self.__df["text"].apply(
            lambda text: " ".join(
                [word for word in str(text).split() if word not in stopword_list]
            )
        )

    @function_name
    def remove_elong(self):
        """Shortens elongated words: a character repeated 3+ times is kept once."""
        self.__df["text"] = self.__df["text"].apply(
            lambda x: str(re.sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2", x))
        )

    @function_name
    def remove_whitespace(self):
        """Collapses runs of whitespace, strips the tweets and resets the index."""
        self.__df["text"] = self.__df["text"].str.replace(r"\s{2,}", " ", regex=True)
        self.__df["text"] = self.__df["text"].str.strip()
        self.__df.reset_index(inplace=True, drop=True)

    @function_name
    def remove_space_between_emoticons(self):
        """
        Joins emoticons written with a space between each character
        (e.g. ': )' -> ':)') and surrounds purely symbolic emoticons with
        spaces (e.g. 'hello:)' -> 'hello :) ').
        """
        # Getting list of all emoticons
        emo_list = [el for value in EMOTICONS_GLOVE.values() for el in value]

        # Putting a space between each character in each emoticon
        emo_with_spaces = "|".join(re.escape(" ".join(emo)) for emo in emo_list)

        # Getting all emoticons that don't contain any alphanumeric character
        all_non_alpha_emo = "|".join(
            re.escape(emo)
            for emo in emo_list
            if not any(char.isalpha() or char.isdigit() for char in emo)
        )

        # Removing spaces between emoticons
        self.__df["text"] = self.__df["text"].str.replace(
            emo_with_spaces, lambda t: t.group().replace(" ", ""), regex=True
        )

        # Adding space between a word and an emoticon
        self.__df["text"] = self.__df["text"].str.replace(
            rf"({all_non_alpha_emo})", r" \1 ", regex=True
        )

    @function_name
    def reconstruct_ending_emoticon(self):
        """Replaces trailing ')' characters with ':)' and trailing '(' with ':('."""
        # Both rules must go through `.str.replace`: `Series.replace` matches
        # whole cell values and would silently skip the "(" rule.
        self.__df["text"] = self.__df["text"].str.replace(r"\)+$", ":)", regex=True)
        self.__df["text"] = self.__df["text"].str.replace(r"\(+$", ":(", regex=True)

    @function_name
    def emoticons_to_tags(self):
        """Replaces every emoticon of ``EMOTICONS_GLOVE`` with its tag."""
        # Dictionary like {tag: regex matching any of its emoticons}
        union_re = {}
        for tag, emo_list in EMOTICONS_GLOVE.items():
            # Getting emoticons as they are
            union_re[tag] = "|".join(re.escape(emo) for emo in emo_list)

        # Function to be called for each tweet
        def _inner(text, _union_re):
            for tag, tag_re in _union_re.items():
                text = re.sub(tag_re, " " + tag + " ", text)
            return text

        # Applying for each tweet
        self.__df["text"] = self.__df["text"].apply(lambda x: _inner(str(x), union_re))

    @function_name
    def final_emoticons_to_tag(self):
        """Replaces trailing ')' with '<smile>' and trailing '(' with '<sadface>'."""
        self.__df["text"] = self.__df["text"].str.replace(r"\)+$", "<smile>", regex=True)
        self.__df["text"] = self.__df["text"].str.replace(r"\(+$", "<sadface>", regex=True)

    @function_name
    def hashtags_to_tags(self):
        """Rewrites '#word' as '<hashtag> word'."""
        self.__df["text"] = self.__df["text"].str.replace(
            r"#(\S+)", r"<hashtag> \1", regex=True
        )

    @function_name
    def numbers_to_tags(self):
        """Replaces numbers (optionally signed, with . , : separators) with '<number>'."""
        self.__df["text"] = self.__df["text"].str.replace(
            r"[-+]?[.\d]*[\d]+[:,.\d]*", r"<number>", regex=True
        )

    @function_name
    def repeat_to_tags(self):
        """Rewrites runs of '!', '?' or '.' as one symbol followed by '<repeat>'."""
        self.__df["text"] = self.__df["text"].str.replace(
            r"([!?.]){2,}", r"\1 <repeat>", regex=True
        )

    @function_name
    def elongs_to_tags(self):
        """Rewrites elongated words as the shortened word followed by '<elong>'."""
        self.__df["text"] = self.__df["text"].str.replace(
            r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>", regex=True
        )


In [None]:
class NewPreprocessing:
    """
    Preprocesses the data and can even perform feature extraction.

    Attributes:
      __data: A pandas dataframe with the data (at least one column called text).
    """

    def __init__(self, list_: list, submission=False):
        """
        Builds the Pandas DataFrame.
          - If submission is False, a list of 2 elements is expected.
            The first one must be the negative tweets.
            The second one must be the positive tweets.
            The final DataFrame is composed of `text` and `label` columns.
          - If submission is True, a list of 1 element is expected.
            The final DataFrame is composed of `ids` and `text` columns.

        :param list_: a list of .txt files to be converted in DataFrame
        :type list_: list
        :param submission: specify the type of DataFrame (train or test data)
        :type submission: bool
        :raises ValueError: if `list_` does not have the expected length
        """

        if not submission:
            # Failing here is clearer than the AttributeError every later
            # method would raise because `__data` was never created.
            if len(list_) != 2:
                raise ValueError(
                    f"Expected 2 files (negative, positive), got {len(list_)}"
                )

            frames = []

            # Reading the content of each file in the list
            for i, file_name in enumerate(list_):
                with open(file_name) as f:
                    content = f.read().splitlines()

                # Creating a DataFrame putting as label the position in the input list
                frames.append(
                    pd.DataFrame(
                        {"text": content, "label": np.ones(len(content)) * i}
                    )
                )

            # `DataFrame.append` no longer exists in pandas >= 2.0
            self.__data = pd.concat(frames, ignore_index=True)

        else:
            if len(list_) != 1:
                raise ValueError(f"Expected 1 file, got {len(list_)}")

            # Reading the content
            with open(list_[0]) as f:
                content = f.read().splitlines()

            # Only the first comma separates the id from the tweet's content
            ids, texts = [], []
            for line in content:
                tweet_id, _, tweet = line.partition(",")
                ids.append(tweet_id)
                texts.append(tweet)

            # Creating the DataFrame
            self.__data = pd.DataFrame({"ids": ids, "text": texts})

    # UTILITY METHODS

    def get(self):
        """
        Returns the DataFrame.

        :return: the DataFrame
        :rtype: pandas.DataFrame
        """
        return self.__data

    def logging(self):
        """
        Prints the first 10 rows in the dataframe stored in self.__data.
        """
        print("Logging:")
        print(self.__data["text"].head(10))

    def save_raw(self):
        """
        Creates a column in the dataframe as copy of `text` column
          to keep the original data.

        Must be called before anything else!
        """
        print("Saving raw tweet...")

        self.__data["raw"] = self.__data["text"]

    # PREPROCESSING METHODS

    def drop_duplicates(self):
        """
        Removes duplicates in the dataframe according to text column.
        """
        print("Dropping duplicates...")

        # Explicit copy so that later column assignments work on an independent
        # frame instead of on a selection of the previous one.
        self.__data = self.__data.drop_duplicates(subset=["text"]).copy()

    def remove_tags(self):
        """
        Removes tags (<user>, <url>) and final '...' characters (long tweets)
        """
        print("Removing tags...")

        self.__data["text"] = self.__data["text"].str.replace(
            r"<[\w]*>", "", regex=True
        )
        self.__data["text"] = self.__data["text"].str.strip()
        self.__data["text"] = self.__data["text"].str.replace(
            r"\.{3}$", "", regex=True
        )

    def convert_hashtags(self):
        """
        Removes '#' at the beginning of a hashtag and splits the hashtag
        into words (e.g.: #helloguys --> hello guys).
        """
        print("Converting hashtags...")

        # The helper must be looked up on this class: private names are
        # mangled with the name of the class they are written in.
        self.__data["text"] = self.__data["text"].str.replace(
            r"(#)(\w+)",
            lambda text: NewPreprocessing.__word_segmentation(str(text.group(2))),
            regex=True,
        )

    def slangs_to_words(self):
        """
        Extends slangs to sequence of words.

        The mapping is read from `./utility/slang.txt`, one `SLANG=expansion`
        entry per line; lines without '=' are ignored.
        """
        print("Converting slangs to words...")

        # Reading the slangs from file
        with open("./utility/slang.txt") as f:
            chat_words_str = f.read().splitlines()

        # Mapping {slang: slang_expanded}
        chat_words_map_dict = {}

        for line in chat_words_str:
            parts = line.split("=")

            # Skipping blank or malformed lines instead of raising IndexError
            if len(parts) < 2:
                continue

            # Slang -> slang expanded
            chat_words_map_dict[parts[0]] = parts[1]

        # Function to be called for each tweet
        def chat_words_conversion(text):
            # Slangs are looked up in uppercase; other words are kept as they are
            return " ".join(
                chat_words_map_dict.get(w.upper(), w) for w in text.split()
            )

        # Calling `chat_words_conversion` for each tweet
        self.__data["text"] = self.__data["text"].apply(
            lambda text: chat_words_conversion(str(text))
        )

    def final_parenthesis(self):
        """
        Substitutes the final parenthesis of a tweet with a positive or negative smile.
        More on this in the report.
        """

        print("Substituting final paranthesis...")

        self.__data["text"] = self.__data["text"].str.replace(
            r"\)+$", ":)", regex=True
        )
        self.__data["text"] = self.__data["text"].str.replace(
            r"\(+$", ":(", regex=True
        )

    def final_parenthesis_to_tags(self):
        """
        Substitutes the final parenthesis of a tweet with a positive or negative smile tag.
        More on this in the report.
        """
        print("Substituting final paranthesis with tags...")

        self.__data["text"] = self.__data["text"].str.replace(
            r"\)+$", " <smile> ", regex=True
        )
        self.__data["text"] = self.__data["text"].str.replace(
            r"\(+$", " <sadface> ", regex=True
        )

    def remove_numbers(self):
        """
        Removes numbers from each tweet
        """

        print("Removing numbers...")
        self.__data["text"] = self.__data["text"].str.replace(r"\d", "", regex=True)

    def remove_punctuation(self):
        """
        Removes everything that is not alphanumeric and not a space.
        """

        print("Removing punctuation...")
        self.__data["text"] = self.__data["text"].str.replace(
            r"[^\w\s]", "", regex=True
        )

    def to_lower(self):
        """
        Converts each tweet to lowercase.
        """

        print("Converting to lowercase...")
        self.__data["text"] = self.__data["text"].str.lower()

    def correct_spelling(self):
        """
        Corrects spelling of each tweet.
        """

        print("Correcting spelling...")
        self.__data["text"] = self.__data["text"].apply(
            lambda text: NewPreprocessing.__correct_spelling(text)
        )

    def lemmatize(self):
        """
        Performs the lemmatization.
        """

        print("Performing lemmatization...")
        self.__data["text"] = self.__data["text"].apply(NewPreprocessing.__lemmatize)

    def remove_stopwords(self):
        """
        Removes english stopwords.
        """

        print("Removing stopwords...")

        # Getting english stopwords set
        stopwords_ = set(stopwords.words("english"))

        # Removing stopwords for each tweet
        self.__data["text"] = self.__data["text"].apply(
            lambda text: " ".join(
                [word for word in str(text).split() if word not in stopwords_]
            )
        )

    def empty_tweets(self):
        """
        Adds tag <EMPTY> for empty tweets.
        """

        print("Marking empty tweets...")
        self.__data["text"] = self.__data["text"].str.replace(
            r"^\s*$", "<EMPTY>", regex=True
        )

    def remove_elongs(self):
        """
        Removes elongs. (e.g.: hellooooo -> hello)
        """

        print("Removing elongs...")
        self.__data["text"] = self.__data["text"].apply(
            lambda text: str(re.sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2", text))
        )

    def correct_spacing_indexing(self):
        """
        Deletes double or more spaces and corrects indexing.

        Must be called after calling the above methods.
        Most of the above methods just delete a token. However since tokens are
        surrounded by whitespaces, they will often result in having more than one
        space between words.

        The only exception is for `remove_space_between_emoticons` method.
        Should be called before and after calling that method.
        It could exist ':  )' which that method doesn't recognize.
        """

        print("Correcting spacing...")

        # Removing double spaces
        self.__data["text"] = self.__data["text"].str.replace(
            r"\s{2,}", " ", regex=True
        )

        # Stripping text
        self.__data["text"] = self.__data["text"].str.strip()

        # Correcting the indexing
        self.__data.reset_index(inplace=True, drop=True)

    def remove_space_between_emoticons(self):
        """
        Removes spaces between emoticons (e.g.: ': )' --> ':)').
        Adds a space between a word and an emoticon (e.g.: 'hello:)' --> 'hello :)')
        """

        print("Removing space between emoticons...")

        # Getting list of all emoticons
        emo_list = [el for value in EMOTICONS_GLOVE.values() for el in value]

        # Putting a space between each character in each emoticon
        emo_with_spaces = "|".join(re.escape(" ".join(emo)) for emo in emo_list)

        # Getting all emoticons that don't contain any alphanumeric character
        all_non_alpha_emo = "|".join(
            re.escape(emo)
            for emo in emo_list
            if not any(char.isalpha() or char.isdigit() for char in emo)
        )

        # Removing spaces between emoticons
        self.__data["text"] = self.__data["text"].str.replace(
            emo_with_spaces, lambda t: t.group().replace(" ", ""), regex=True
        )

        # Adding space between a word and an emoticon
        self.__data["text"] = self.__data["text"].str.replace(
            rf"({all_non_alpha_emo})", r" \1 ", regex=True
        )

    def emoticons_to_tags(self):
        """
        Convert emoticons (with or without spaces) into tags
          according to the pretrained stanford glove model
          (e.g.: :) ---> <smile> and so on)
        """

        print("Converting emoticons to tags...")

        # Dictionary like {tag: regex matching any of its emoticons}
        union_re = {}
        for tag, emo_list in EMOTICONS_GLOVE.items():
            # Getting emoticons as they are
            union_re[tag] = "|".join(re.escape(emo) for emo in emo_list)

        # Function to be called for each tweet
        def inner(text, _union_re):
            for tag, tag_re in _union_re.items():
                text = re.sub(tag_re, " " + tag + " ", text)
            return text

        # Applying for each tweet
        self.__data["text"] = self.__data["text"].apply(
            lambda text: inner(str(text), union_re)
        )

    def hashtags_to_tags(self):
        """
        Convert hashtags. (e.g.: #hello ---> <hashtag> hello)
        """

        print("Converting hashtags to tags...")
        self.__data["text"] = self.__data["text"].str.replace(
            r"#(\S+)", r"<hashtag> \1", regex=True
        )

    def numbers_to_tags(self):
        """
        Convert numbers into tags. (e.g.: 34 ---> <number>)
        """

        print("Converting numbers to tags...")
        self.__data["text"] = self.__data["text"].str.replace(
            r"[-+]?[.\d]*[\d]+[:,.\d]*", r"<number>", regex=True
        )

    def repeat_to_tags(self):
        """
        Convert repetitions of '!' or '?' or '.' into tags.
          (e.g.: ... ---> . <repeat>)
        """

        print("Converting repetitions of symbols to tags...")
        self.__data["text"] = self.__data["text"].str.replace(
            r"([!?.]){2,}", r"\1 <repeat>", regex=True
        )

    def elongs_to_tags(self):
        """
        Convert elongs into tags. (e.g.: hellooooo ---> hello <elong>)
        """

        print("Converting elongated words to tags...")
        self.__data["text"] = self.__data["text"].str.replace(
            r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>", regex=True
        )

    def remove_endings(self):
        """
        Remove ... <url> which represents the ending of tweet
        """

        print("Removing tweet ending when the tweet is cropped...")
        self.__data["text"] = self.__data["text"].str.replace(
            r"\.{3} <url>$", "", regex=True
        )

    # STATIC METHODS (private, used internally)

    # Instance of `SymSpell` class, created on first use and shared afterwards
    symspell = None

    @staticmethod
    def __get_symspell():
        """
        Instantiates a `SymSpell` object.

        :return: instantiated object
        :rtype: SymSpell
        """

        # If is not already instantiated
        if NewPreprocessing.symspell is None:
            # Instantiating `SymSpell`
            NewPreprocessing.symspell = SymSpell()

            # Getting dictionary for single words
            dictionary_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt"
            )
            NewPreprocessing.symspell.load_dictionary(
                dictionary_path, term_index=0, count_index=1
            )

            # Getting dictionary for bigram (two words)
            bigram_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
            )
            NewPreprocessing.symspell.load_bigram_dictionary(
                bigram_path, term_index=0, count_index=2
            )

        return NewPreprocessing.symspell

    @staticmethod
    def __word_segmentation(text):
        """
        Tries to put spaces between word in a text (used for hashtag).
          (e.g.: helloguys --> hello guys))

        :param text: Text to be converted (typically an hashtag)
        :type text: str
        :return: Processed text
        :rtype: str
        """

        # `max_edit_distance = 0` avoids that `SymSpell` corrects spelling.
        result = NewPreprocessing.__get_symspell().word_segmentation(
            text, max_edit_distance=0
        )
        return result.segmented_string

    @staticmethod
    def __correct_spelling(text):
        """
        Corrects spelling of a word (e.g.: helo -> hello)

        :param text: Text to be converted
        :type text: str
        :return: Processed text
        :rtype: str
        """

        # `max_edit_distance = 2` tells `SymSpell` to check at a maximum distance
        #  of 2 in the vocabulary. Only words with at most 2 letters wrong will be corrected.
        result = NewPreprocessing.__get_symspell().lookup_compound(
            text, max_edit_distance=2
        )

        return result[0].term

    @staticmethod
    def __get_wordnet_tag(nltk_tag):
        """
        Returns type of word according to nltk pos tag.

        :param nltk_tag: nltk pos tag of a single word
        :type nltk_tag: str
        :return: type of a word
        :rtype: str
        """

        if nltk_tag.startswith("V"):
            return wordnet.VERB
        elif nltk_tag.startswith("N"):
            return wordnet.NOUN
        elif nltk_tag.startswith("J"):
            return wordnet.ADJ
        elif nltk_tag.startswith("R"):
            return wordnet.ADV
        else:
            # This is the default in WordNetLemmatizer, when no pos tag is passed
            return wordnet.NOUN

    @staticmethod
    def __lemmatize(text):
        """
        Performs lemmatization using nltk pos tag and `WordNetLemmatizer`.

        :param text: Text to be processed
        :type text: str
        :return: processed text
        :rtype: str
        """

        nltk_tagged = nltk.pos_tag(text.split())
        lemmatizer = WordNetLemmatizer()

        return " ".join(
            [
                lemmatizer.lemmatize(w, NewPreprocessing.__get_wordnet_tag(nltk_tag))
                for w, nltk_tag in nltk_tagged
            ]
        )


# Abstract model

In [None]:
from sklearn.model_selection import train_test_split
from abc import ABC, abstractmethod

In [None]:
class AbstractModel(ABC):
    """
    Base class of the models: declares the methods every model implements and
    provides helpers to split the data and to write the submission file.
    """

    def __init__(self, weights_path: str):
        """
        :param weights_path: location used by the model to save / load its state
        """
        # Single leading underscore on purpose: a double underscore would be
        # name-mangled to `_AbstractModel__weights_path` and subclasses, which
        # read `self._weights_path`, could not see it.
        self._weights_path = weights_path

    @abstractmethod
    def get_preprocessing_methods(self, is_test: bool = False):
        """Returns the preprocessing steps the model needs."""
        pass

    @abstractmethod
    def fit_predict(self, X, y, ids_test, X_test, prediction_path):
        """Trains on (X, y), then predicts X_test and writes the result to prediction_path."""
        pass

    @abstractmethod
    def predict(self, ids, X, path):
        """Predicts X and writes the result to path."""
        pass

    @staticmethod
    def _create_submission(ids: list[int], predictions: list[int], path: str):
        """
        Writes a CSV file with the columns `Id` and `Prediction`.

        :param ids: ids of the samples
        :param predictions: predicted labels; every 0 is written as -1
        :param path: destination of the CSV file
        """
        # Generating the submission file
        submission = pd.DataFrame(columns=["Id", "Prediction"],
                                data={"Id": ids, "Prediction": predictions})

        # For many models the labels are 0 or 1. Replacing 0s with -1s.
        # Assigning the result back (instead of `inplace=True` on the selected
        # column) guarantees the DataFrame itself is updated.
        submission["Prediction"] = submission["Prediction"].replace(0, -1)

        # Saving the file
        submission.to_csv(path, index=False)

    @staticmethod
    def _split_data(X: pd.DataFrame, y: pd.DataFrame, test_size: float = 0.2, random_state: int = 42, **kwargs) -> tuple:
        """
        Splits the data with scikit-learn's `train_test_split`.

        :param X: features
        :param y: labels
        :param test_size: value forwarded as `test_size`
        :param random_state: seed forwarded as `random_state`
        :param kwargs: extra keyword arguments forwarded to `train_test_split`
        :return: (X_train, X_test, y_train, y_test)
        """
        print("Splitting data in train and test set...")
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, **kwargs)

        return X_train, X_test, y_train, y_test

# GRU

In [None]:
!pip install tensorflow keras



In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.initializers import Constant
from tensorflow.keras import layers

In [None]:
class Gru(AbstractModel):
    """
    Bidirectional GRU classifier on top of frozen, pre-trained GloVe vectors.

    `update_vocabulary` must be called before `fit_predict` / `predict`: it
    fits the tokenizer and sets the vocabulary size used for the embedding
    matrix, and neither of those methods calls it.
    """

    def __init__(self,
                 weights_path: str = WEIGHT_PATH,
                 glove_path: str = GLOVE_PATH,
                 max_tweet_length: int = 120,
                 embedding_dim: int = 100):
        """
        :param weights_path: prefix of the location where the model is saved / loaded
        :param glove_path: path of the GloVe text file
        :param max_tweet_length: length, in tokens, every tweet is padded / truncated to
        :param embedding_dim: length of the word vectors in the GloVe file
        """
        super().__init__(weights_path)

        # `fit_predict` and `predict` read `self._weights_path`; setting it
        # here makes sure that attribute exists on this instance.
        self._weights_path = weights_path

        self.__tokenizer = Tokenizer(oov_token="<unk>")
        self.__model = tf.keras.Sequential()
        self.__max_tweet_length = max_tweet_length
        self.__embedding_dim = embedding_dim
        self.__glove_path = glove_path

        # Size of the vocabulary, it will be updated according to the input data
        self.__vocab_size = 0

    def update_vocabulary(self, X: pd.DataFrame):
        """
        Fits the tokenizer on the texts in X and updates the vocabulary size.

        :param X: the texts (one tweet per element)
        """
        # Updates the default internal vocabulary according to the words in X
        self.__tokenizer.fit_on_texts(X)

        # Updating the vocabulary length.
        # NOTE: the +2 is due to some special reserved tokens that are in the vocabulary
        # but not in the tweets
        self.__vocab_size = len(self.__tokenizer.word_index) + 2

    def __convert_data(self, X: pd.DataFrame):
        """
        Converts texts to integer sequences padded to `max_tweet_length`.

        :param X: the texts (one tweet per element)
        :return: the padded sequences
        """
        # Creating the numerical tokens and padding each tweet to max_tweet_length
        X_tokens = self.__tokenizer.texts_to_sequences(X)

        # NOTE: padding = "post" means that the pad is after each sequence
        # (each tweet) and not before
        X_pad = pad_sequences(
            X_tokens,
            maxlen=self.__max_tweet_length,
            padding="post"
        )

        return X_pad

    def __generate_embedding_matrix(self):
        """
        Builds the embedding matrix for the tokenizer's vocabulary.

        Row `i` holds the GloVe vector of the word with index `i`; words that
        are missing from the GloVe file keep a zero vector.

        :return: array of shape (vocab_size, embedding_dim)
        """
        # Getting the vocabulary from the tokenizer
        word_index = self.__tokenizer.word_index

        # Creating a dictionary from the embedding file. Keys = words in the
        # embedding file, Values = their respective vector
        embeddings_index = {}

        # Explicit encoding: the file contains non-ASCII tokens and must not
        # depend on the platform's default encoding.
        with open(self.__glove_path, encoding="utf-8") as f:
            for line in f:
                word, coefs = line.split(maxsplit=1)
                embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

        # Printing the number of words found in the file
        print("Found %s word vectors." % len(embeddings_index))

        # Generating the embedding matrix
        embedding_matrix = np.zeros((self.__vocab_size, self.__embedding_dim))

        # These two variables will hold the number of words in the vocabulary
        # that are found in the file, and the number of the ones that are not.
        hits = 0
        misses = 0

        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)

            # The lookup result must be handled inside the loop, once per
            # word; outside of it only the last word would get its vector.
            # Words not found in embedding index will be represented as a zero-vector.
            # This includes the representation for "padding" and "OOV"
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                misses += 1

        # Printing the number of found / not found words
        print("Converted %d words (%d misses)" % (hits, misses))

        return embedding_matrix

    def __build_model(self, embedding_matrix):
        """
        Creates and compiles the network.

        :param embedding_matrix: initial (frozen) weights of the embedding layer
        """
        # Starting from an empty model so that calling `fit_predict` again
        # does not stack a second copy of the layers on the previous ones.
        self.__model = tf.keras.Sequential()

        # Creating the model with all its layers.
        # NOTE: mask_zero must be true because 0 is a special character
        # used as padding, as mentioned before.
        # The Embedding layer is not trainable since we loaded the vectors from a pre-trained file,
        # as mentioned before
        self.__model.add(layers.Embedding(
            input_dim=self.__vocab_size,
            output_dim=self.__embedding_dim,
            embeddings_initializer=Constant(embedding_matrix),
            input_length=self.__max_tweet_length,
            mask_zero=True,
            trainable=False
        ))

        # NOTE: since we are using GRU as a RNN, we need to define two types of dropouts: the
        # first one is used for the first operation on the inputs (when data
        # "enters" in GRU) the second one is used for the recurrences Units
        self.__model.add(layers.Bidirectional(
            layers.GRU(units=100, dropout=0.2, recurrent_dropout=0, activation="tanh",
                       recurrent_activation="sigmoid", unroll=False, use_bias=True,
                       reset_after=True)))
        self.__model.add(layers.Dense(100, activation="relu"))
        self.__model.add(layers.Dense(1, activation="sigmoid"))

        # Compiling the model. The optimizer is Adam with standard lr (0.001)
        self.__model.compile(
            loss="binary_crossentropy",
            optimizer=tf.keras.optimizers.Adam(),
            metrics=["accuracy"])

        # `summary()` prints by itself; wrapping it in print() adds "None"
        self.__model.summary()

    def get_preprocessing_methods(self, is_test: bool = False):
        """
        Returns the names of the preprocessing steps, in the order they must run.

        :param is_test: True for the data to predict, False for the training data
        :return: list of method names
        """
        methods = []

        if not is_test:
            # Dropping duplicates tweets only in the training set
            methods.append("drop_duplicates")

        methods.extend([
            "remove_tags",
            "remove_whitespace",
            "remove_space_between_emoticons",
            "remove_whitespace",
            "emoticons_to_tags",
            "final_emoticons_to_tag",
            "numbers_to_tags",
            "hashtags_to_tags",
            "repeat_to_tags",
            "elongs_to_tags",
            "lower",
            "remove_whitespace"
        ])

        return methods

    def fit_predict(self,
                    X: pd.DataFrame,
                    y: pd.DataFrame,
                    ids_test: np.ndarray,
                    X_test: np.ndarray,
                    prediction_path: str,
                    batch_size: int = 128,
                    epochs: int = 10):
        """
        Trains the model, saves it and writes the predictions for X_test.

        :param X: training texts
        :param y: training labels
        :param ids_test: ids of the samples to predict
        :param X_test: texts to predict
        :param prediction_path: destination of the submission file
        :param batch_size: batch size used for training
        :param epochs: number of training epochs
        """
        # Splitting train and validation data
        X_train, X_val, y_train, y_val = AbstractModel._split_data(X, y)

        # Converting train and validation data to sequences (vectors)
        X_train_pad = self.__convert_data(X_train)
        X_val_pad = self.__convert_data(X_val)

        # Generating the embedding matrix from the training data
        embedding_matrix = self.__generate_embedding_matrix()

        # Building the model
        self.__build_model(embedding_matrix)

        print("Training the model...")
        self.__model.fit(X_train_pad, y_train,
                         batch_size=batch_size, epochs=epochs,
                         validation_data=(X_val_pad, y_val))

        print("Saving the model...")
        # NOTE(review): the location is built by plain concatenation, so a
        # weights_path without a trailing separator gives "<weights_path>model"
        # -- confirm this is the intended location.
        self.__model.save(f"{self._weights_path}model")

        print("Making the prediction...")
        self.predict(ids_test, X_test, prediction_path, from_weights=False)

    def predict(self, ids, X, path, from_weights=True):
        """
        Predicts the labels of X (1 or -1) and writes the submission file.

        Only the network is reloaded when `from_weights` is True; the texts
        are still converted with this instance's tokenizer.

        :param ids: ids of the samples
        :param X: texts to predict
        :param path: destination of the submission file
        :param from_weights: whether to load the saved model before predicting
        """
        if from_weights:
            # Loading weights
            self.__model = tf.keras.models.load_model(f"{self._weights_path}model")

        # Converting input data
        X_pad = self.__convert_data(X)

        # `ravel` keeps a 1-D array even for a single sample, where `squeeze`
        # would return a 0-d array that cannot be used as a DataFrame column.
        predictions = np.ravel(self.__model.predict(X_pad))
        preds = np.where(predictions >= 0.5, 1, -1)
        print(preds)

        # Creating and saving the file
        AbstractModel._create_submission(ids, preds, path)

In [None]:
class NewGRU(AbstractModel):
    """
    Bidirectional GRU neural network on top of a frozen, pre-trained GloVe embedding.
    The embedding file has been created by Stanford University, and it's based on tweets.

    Expected call order: ``update_vocabulary`` on the training tweets first, then
    ``fit_predict`` (or ``predict`` alone to reuse a previously saved model).
    """

    def __init__(self, weights_path=WEIGHT_PATH, glove_path=GLOVE_PATH, max_tweet_length=120,
                embedding_dim=100):
        """
        :param weights_path: weights path of the model. Model's parameters will be loaded and saved from this path.
        :type weights_path: str
        :param glove_path: path of the glove file.
        :type glove_path: str
        :param max_tweet_length: maximum (estimated) length of a tweet in words.
        The value is deliberately generous so that tweets are not truncated.
        :type max_tweet_length: int, optional
        :param embedding_dim: the embedding dimension. Every word is represented by a vector of this length
        in the embedding space. It must match the dimension of the vectors in the embedding file.
        :type embedding_dim: int, optional
        """
        super().__init__(weights_path)

        self.__tokenizer = Tokenizer(oov_token='<unk>')
        self.__model = tf.keras.Sequential()
        self.__max_tweet_length = max_tweet_length
        self.__embedding_dim = embedding_dim
        self.__glove_path = glove_path

        # Size of the vocabulary, it will be updated according to the input data
        self.__vocab_size = 0

    def update_vocabulary(self, X):
        """
        Method used to update (create) the vocabulary of the tokenizer.
        Must be called before ``fit_predict``: the embedding matrix and the
        Embedding layer are sized from the vocabulary computed here.

        :param X: Collection of documents, in our case tweets (one string per entry).
        :type X: numpy.ndarray
        """

        print('Updating vocabulary...')

        # Updates the default internal vocabulary according to the words in X
        self.__tokenizer.fit_on_texts(X)

        # Updating the vocabulary length.
        # NOTE: the +2 is due to some special reserved tokens that are in the vocabulary
        # but not in the tweets
        self.__vocab_size = len(self.__tokenizer.word_index) + 2

    def __convert_data(self, X):
        """
        Converts the tweets in numerical tokens.
        Each word in the tweet is substituted with its index in the vocabulary.
        Each tweet is then padded (or cut by ``pad_sequences``) to
        ``max_tweet_length`` tokens, with 0 as special padding value.

        :param X: Collection of documents, in our case tweets (one string per entry).
        :type X: numpy.ndarray

        :return: Numpy array with shape (len(X), max_tweet_length)
        :rtype: numpy.ndarray
        """

        print('Converting data...')

        # Creating the numerical tokens
        X_tokens = self.__tokenizer.texts_to_sequences(X)

        # NOTE: padding = 'post' means that the pad is after each sequence
        # (each tweet) and not before
        X_pad = pad_sequences(
            X_tokens,
            maxlen=self.__max_tweet_length,
            padding='post'
        )

        return X_pad

    def __generate_embedding_matrix(self):
        """
        Generates the word embedding matrix according to the words in the vocabulary.
        Each word is represented by a vector with length equal to embedding_dim.
        The embedding is done according to a model pretrained on twitter data
        (https://nlp.stanford.edu/projects/glove/). Only the words in the vocabulary that
        are found in the pretrained model are taken into account; the rows of all
        the other words are left at zero.

        :return: The embedding matrix. Each row corresponds to a word in the vocabulary.
        The index of the row is the index of the word in the vocabulary.
        :rtype: numpy.ndarray
        """

        print('Generating embedding matrix...')

        # Getting the vocabulary from the tokenizer
        word_index = self.__tokenizer.word_index

        # Creating a dictionary from the embedding file. Keys = words in the embedding file,
        # Values = their respective vector
        embeddings_index = {}

        # The encoding is explicit so that parsing does not depend on the platform default
        with open(self.__glove_path, encoding='utf-8') as f:
            for line in f:
                # Each line is "<word> <coef_1> ... <coef_n>"
                word, coefs = line.split(maxsplit=1)
                coefs = np.fromstring(coefs, "f", sep=" ")
                embeddings_index[word] = coefs

        # Printing the number of words found in the file
        print("Found %s word vectors." % len(embeddings_index))

        # Generating the embedding matrix
        embedding_matrix = np.zeros((self.__vocab_size, self.__embedding_dim))

        # These two variables will hold the number of words in the vocabulary
        # that are found in the file, and the number of the ones that are not.
        hits = 0
        misses = 0

        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)

            # The lookup result must be handled inside the loop, once per word;
            # otherwise only the last word of the vocabulary gets a vector.
            # Words not found in embedding index will be represented as a zero-vector.
            # This includes the representation for "padding" and "OOV"
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                misses += 1

        # Printing the number of found / not found words
        print("Converted %d words (%d misses)" % (hits, misses))

        return embedding_matrix

    def __build_model(self, embedding_matrix):
        """
        Method used to build and compile the GRU (Bidirectional) model.

        :param embedding_matrix: The embedding matrix used to initialise the Embedding layer
        of the model, as returned by the embedding-matrix generation step.
        :type embedding_matrix: numpy.ndarray
        """

        print('Building model...')

        # Always start from an empty model: adding the layers to an already built
        # (or previously loaded) model would stack a second copy of them on top.
        self.__model = tf.keras.Sequential()

        # Creating the model with all its layers.
        # NOTE: mask_zero must be true because 0 is a special value
        # used as padding.
        # The Embedding layer is not trainable since the vectors come from a pre-trained file.
        self.__model.add(layers.Embedding(
            input_dim=self.__vocab_size,
            output_dim=self.__embedding_dim,
            embeddings_initializer=Constant(embedding_matrix),
            input_length=self.__max_tweet_length,
            mask_zero=True,
            trainable=False))

        # NOTE: GRU exposes two dropouts: `dropout` is applied to the inputs,
        # `recurrent_dropout` to the recurrent state.
        self.__model.add(layers.Bidirectional(
            layers.GRU(units=100, dropout=0.2, recurrent_dropout=0, activation='tanh',
                       recurrent_activation='sigmoid', unroll=False, use_bias=True,
                       reset_after=True)))
        self.__model.add(tf.keras.layers.Dense(100, activation='relu'))
        self.__model.add(layers.Dense(1, activation='sigmoid'))

        # Compiling the model. The optimizer is Adam with its default settings
        self.__model.compile(
            loss='binary_crossentropy',
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['accuracy'])

        # summary() prints by itself; wrapping it in print() would add a stray "None"
        self.__model.summary()

    def get_preprocessing_methods(self, istest=False):
        """
        Returns the names of the preprocessing methods to apply, in order.

        :param istest: True for the test set, False for the training set.
        :type istest: bool, optional
        :return: ordered list of preprocessing method names.
        :rtype: list
        """
        methods = []

        if not istest:
            # Dropping duplicates tweets only in the training set
            methods.append('drop_duplicates')

        methods.extend([
            'remove_endings',
            'correct_spacing_indexing',
            'remove_space_between_emoticons',
            'correct_spacing_indexing',
            'emoticons_to_tags',
            'final_parenthesis_to_tags',
            'numbers_to_tags',
            'hashtags_to_tags',
            'repeat_to_tags',
            'elongs_to_tags',
            'to_lower',
            'correct_spacing_indexing'
        ])

        return methods

    def fit_predict(self, X, Y, ids_test, X_test, prediction_path, batch_size=128,
                    epochs=10):
        """
        Fits (train) the model, and makes a prediction on the test data.

        :param X: datapoint matrix. Will be splitted into training and validation data.
        :type X: numpy.ndarray
        :param Y: labels of the datapoints.
        :type Y: numpy.ndarray
        :param ids_test: the ids of the test datapoints, necessary to make a prediction.
        :type ids_test: numpy.ndarray
        :param X_test: the matrix containing the test datapoints for the prediction.
        :type X_test: numpy.ndarray
        :param prediction_path: path of the prediction file.
        :type prediction_path: str
        :param batch_size: size of the mini-batches used when training the model.
        :type batch_size: int, optional
        :param epochs: number of epochs used when training the model.
        :type epochs: int, optional
        """

        # Splitting train and validation data
        X_train, X_val, Y_train, Y_val = AbstractModel._split_data(X, Y)

        # Converting train and validation data to sequences (vectors)
        X_train_pad = self.__convert_data(X_train)
        X_val_pad = self.__convert_data(X_val)

        # Generating the embedding matrix from the tokenizer vocabulary
        embedding_matrix = self.__generate_embedding_matrix()

        # Building the model
        self.__build_model(embedding_matrix)

        print('Training the model...')
        self.__model.fit(
            X_train_pad,
            Y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=(X_val_pad, Y_val))

        # NOTE(review): the path is built by plain concatenation, so `_weights_path`
        # presumably needs to end with a path separator - confirm against AbstractModel.
        print('Saving the model...')
        self.__model.save(f'{self._weights_path}model')

        print('Making the prediction...')
        self.predict(ids_test, X_test, prediction_path, from_weights=False)

    def predict(self, ids, X, path, from_weights=True):
        """
        Performs the predictions. Usually called within the fit_predict method.

        :param ids: ids of testing data.
        :type ids: numpy.ndarray
        :param X: matrix of the testing datapoints.
        :type X: numpy.ndarray
        :param path: specifies where to store the submission file
        :type path: str
        :param from_weights: if True the model is loaded from the weights path,
            otherwise the model currently held by the instance is used.
        :type from_weights: bool, optional
        """

        if from_weights:
            # Loading the previously saved model
            self.__model = tf.keras.models.load_model(f'{self._weights_path}model')

        # Converting input data
        X_pad = self.__convert_data(X)

        # ravel() keeps a 1-D array even for a single tweet, whereas squeeze()
        # would collapse an (1, 1) output to a 0-d array.
        predictions = self.__model.predict(X_pad).ravel()

        # Sigmoid outputs of at least 0.5 are labelled 1, the others -1
        preds = np.where(predictions >= 0.5, 1, -1)
        print(preds)

        # Creating and saving the file
        AbstractModel._create_submission(ids, preds, path)

# Run

In [None]:
from time import strftime

In [None]:
@function_name
def run_preprocessing(cls: AbstractModel, train_path: str, test_path: str, is_full: bool = False):
    """
    Preprocesses the train and test data with the methods requested by a model
    and stores the results as CSV files.

    :param cls: model class; it is instantiated without arguments to obtain
        the names of the preprocessing methods.
    :param train_path: where the shuffled, preprocessed training data is written.
    :param test_path: where the preprocessed test data is written.
    :param is_full: if True the full training files are used, otherwise the small ones.
    """
    if is_full:
        path_ls = [TRAIN_NEG_FULL_PATH, TRAIN_POS_FULL_PATH]
    else:
        path_ls = [TRAIN_NEG_PATH, TRAIN_POS_PATH]

    train_preprocessing = Preprocessing(path_ls)
    test_preprocessing = Preprocessing([TEST_PATH], is_submission=True)

    # Training data first, then test data; every method name is resolved on the
    # preprocessing object and called without arguments
    for preprocessing, test_flag in ((train_preprocessing, False), (test_preprocessing, True)):
        for method_name in cls().get_preprocessing_methods(is_test=test_flag):
            step = getattr(preprocessing, method_name)
            step()

    # The training rows are shuffled before being written
    shuffled_train = train_preprocessing.get().sample(frac=1)
    shuffled_train.to_csv(train_path, index=False)

    test_preprocessing.get().to_csv(test_path, index=False)

In [None]:
def run_new_preprocessing(
    csr: AbstractModel, train_preprocessed_path=TRAIN_PREP_PATH, test_preprocessed_path=TEST_PREP_PATH, full_data=True
):
    """
    Applies the preprocessing methods requested by the chosen classifier to the
      train and test data, then stores both results as CSV files

    :param csr: chosen classifier (child of AbstractModel)
    :type csr: AbstractModel
    :param train_preprocessed_path: path where the preprocessed train data is written
    :type train_preprocessed_path: str
    :param test_preprocessed_path: path where the preprocessed test data is written
    :type test_preprocessed_path: str
    :param full_data: if False, the small dataset (200K rows) is used
    :type full_data: bool, optional
    """

    # Pick the raw training files
    dataset_files = (
        [TRAIN_NEG_FULL_PATH, TRAIN_POS_FULL_PATH]
        if full_data
        else [TRAIN_NEG_PATH, TRAIN_POS_PATH]
    )

    train_preprocessing = NewPreprocessing(dataset_files, submission=False)
    test_preprocessing = NewPreprocessing([TEST_PATH], submission=True)

    # Run the requested steps: training data first, test data afterwards
    for preprocessing, test_flag in ((train_preprocessing, False), (test_preprocessing, True)):
        for method_name in csr.get_preprocessing_methods(istest=test_flag):
            step = getattr(preprocessing, method_name)
            step()

    # Shuffle the training rows, then write both files
    shuffled_train = train_preprocessing.get().sample(frac=1)
    shuffled_train.to_csv(train_preprocessed_path, index=False)

    test_preprocessing.get().to_csv(test_preprocessed_path, index=False)


In [None]:
from enum import Enum


class Models(Enum):
    """Enumeration of the models that can be selected by name."""

    gru = 'gru'

    def __str__(self):
        """
        Gives the string form of a member, which is simply its value

        :return: value of Enumeration
        :rtype: str
        """
        return self.value

    def get_model_name(self):
        """
        Maps the member to the class that implements the corresponding model

        :return: class/string with respect to the value of the Enumeration
        :rtype: object
        """

        # Built on every call so that NewGRU is only looked up when needed
        return {Models.gru: NewGRU}[self]

In [None]:
def execute(
    weights_path: str = WEIGHT_PATH,
    is_prep: bool = False,
    train_preprocessed_path: str = TRAIN_PREP_PATH,
    test_preprocessed_path: str = TEST_PREP_PATH,
    submission_path: str = f"{GLOBAL_PATH}/submission",
    full_data: bool = True,
):
    """
    Creates a submission file using the GRU classifier.
      Unless the data is declared as already preprocessed, the raw data is
      preprocessed and stored first. The preprocessed data is then loaded,
      the model is fitted and the predictions on the test data are saved.

    :param weights_path: path to load/store the weights
    :type weights_path: str
    :param is_prep: if True, the preprocessed files are expected to exist already
      and the preprocessing step is skipped
    :type is_prep: bool, optional
    :param train_preprocessed_path: path to load/store the train preprocessed data
    :type train_preprocessed_path: str
    :param test_preprocessed_path: path to load/store the test preprocessed data
    :type test_preprocessed_path: str
    :param submission_path: prefix of the submission file path
    :type submission_path: str
    :param full_data: if False, the small dataset (200K rows) is used when preprocessing
    :type full_data: bool, optional
    """

    classifier = Models["gru"].get_model_name()(weights_path)

    # Specifying the columns of the DataFrame
    usecols_train = ["text", "label"]
    usecols_test = ["ids", "text"]

    if not is_prep:
        # full_data has to be forwarded, otherwise the full dataset is always used
        run_new_preprocessing(
            classifier,
            train_preprocessed_path,
            test_preprocessed_path,
            full_data=full_data,
        )

    # Loading preprocessed data
    train_preprocessed = pd.read_csv(train_preprocessed_path, usecols=usecols_train)
    test_preprocessed = pd.read_csv(test_preprocessed_path, usecols=usecols_test)

    # Dropping null rows from training data
    train_preprocessed = train_preprocessed.dropna()

    X, Y = train_preprocessed["text"].values, train_preprocessed["label"].values

    # Test rows cannot be dropped (every id needs a prediction). An empty text is
    # read back from the CSV as NaN, so it is replaced by an empty string to keep
    # every entry a string for the tokenizer.
    X_test, test_ids = (
        test_preprocessed["text"].fillna("").values,
        test_preprocessed["ids"].values,
    )

    # Updating the vocabulary of the GRU classifier according to the training data
    classifier.update_vocabulary(X)

    # NOTE(review): submission_path is used as a plain string prefix, so the default
    # yields ".../submissionsubmission-<timestamp>.csv" - confirm this is intended.
    classifier.fit_predict(
        X,
        Y,
        test_ids,
        X_test,
        f'{submission_path}submission-{strftime("%Y-%m-%d_%H:%M:%S")}.csv'
    )


In [None]:
# Run the whole pipeline with the default arguments
# (is_prep=False, so the raw data is preprocessed first).
execute()

  self.__data = self.__data.append(df).reset_index(drop=True)
  self.__data = self.__data.append(df).reset_index(drop=True)


Dropping duplicates...
Removing tweet ending when the tweet is cropped...


  self.__data["text"] = self.__data["text"].str.replace(r"\.{3} <url>$", "")


Correcting spacing...


  self.__data["text"] = self.__data["text"].str.replace("\s{2,}", " ")


Removing space between emoticons...


  self.__data["text"] = self.__data["text"].str.replace(
  self.__data["text"] = self.__data["text"].str.replace(


Correcting spacing...
Converting emoticons to tags...
Substituting final paranthesis with tags...


  self.__data["text"] = self.__data["text"].str.replace("\)+$", " <smile> ")
  self.__data["text"] = self.__data["text"].str.replace("\(+$", " <sadface> ")


Converting numbers to tags...


  self.__data["text"] = self.__data["text"].str.replace(


Converting hashtags to tags...


  self.__data["text"] = self.__data["text"].str.replace(


Converting repetitions of symbols to tags...


  self.__data["text"] = self.__data["text"].str.replace(


Converting elongated words to tags...


  self.__data["text"] = self.__data["text"].str.replace(


Converting to lowercase...
Correcting spacing...
Removing tweet ending when the tweet is cropped...
Correcting spacing...
Removing space between emoticons...
Correcting spacing...
Converting emoticons to tags...
Substituting final paranthesis with tags...
Converting numbers to tags...
Converting hashtags to tags...
Converting repetitions of symbols to tags...
Converting elongated words to tags...
Converting to lowercase...
Correcting spacing...
Updating vocabulary...
Splitting data in train and test set...
Converting data...
['<hashtag> boston $ <number> for a standard commuter <number> laptop bag from rickshaw bagworks ( $ <number> value the right bag c . <repeat> <url> <hashtag> groupon'
 'hey <user> follow me please it means so much for me <number>'
 "<user> haha they love my beauty skills don't they ! ahaa . it hasn't failed we never spend money ! carly , chels , seanie + sammi , come !"
 ... 'leg and body cramp'
 '<user> heyy follow me back please . <repeat> it would make my day i

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Converting data...
['galco royal guard inside the pant holster for <number> <number> <number>/<number>- inch colt , para , springfield ( natural , left-hand the'
 '<user> yes , i have to talk to u again <neutralface> i did something . you gonna kill me like always jajajaja'
 '<user> alzheimers very moving apparently . will pick it up on iplayer .'
 ... 'cant <elong> wait for my pizzza to get here !'
 '<user> fraser or harris tomorrow ? shall we stay after school tuesday and wednesday ? and yeah same i saw her in plymouth ! ! !'
 'so happy for my babygirl <user> for getting asked to prom yayayayay go patrick go !']


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Generating embedding matrix...
Found 1193514 word vectors.
Converted 1 words (0 misses)
Building model...
Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 100)          43982400  
                                                                 
 bidirectional_1 (Bidirecti  (None, 200)               121200    
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 100)               20100     
                                                                 
 dense_3 (Dense)             (None, 1)                 101       
                                                                 
Total params: 44123801 (168.32 MB)
Trainable params: 141401 (552.35 KB)
Non-trainable params: 43982400 (167.78 MB)
_______________________________

KeyboardInterrupt: ignored