<center>
    <h1><b>Feed Forward Neural Networks for Natural Language Processing<b></h1>
</center>

### Download GloVe Embeddings

In [2]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

dataset_base_filepath = '../Data/Raw' # "/kaggle/input/ai-2-deep-learning-for-nlp-homework-1"
images_base_filepath = '../imgs' # "/kaggle/input/images"

# Load all the dataset files using pandas and store inside some dataframe variables
train_df = pd.read_csv(f'{dataset_base_filepath}/train_dataset.csv')
val_df = pd.read_csv(f'{dataset_base_filepath}/val_dataset.csv')
test_df = pd.read_csv(f'{dataset_base_filepath}/test_dataset.csv')

# Reduce the data sizes
size=1
train_df = train_df.sample(frac=size)
val_df = val_df.sample(frac=size)
# test_df = test_df.sample(frac=size)

print(train_df.shape, val_df.shape, test_df.shape)

(148388, 3) (42396, 3) (21199, 2)


In [3]:
train_df.head()

Unnamed: 0,ID,Text,Label
34518,67309,@suicidalcats That's cool. I've never been the...,1
143195,90927,"No, I'm not coming on to you. I am quoting Ela...",0
106657,196113,"@musicalverse Ah well, at least we both got th...",1
97983,83600,"Morning!! Todays the game, and the Amount Boyz...",1
114890,138451,@mot_mot lol no prob in that...so do i. especi...,0


In [4]:
import os
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
import contractions

nltk.download('wordnet')
nltk.download('omw-1.4')

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# !unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

def clean_text(text: str) -> str:
    text = text.lower() # Convert to lowercase
    text = contractions.fix(text) # Expand contractions
    text = re.sub(r"http\S+|www\S+|https\S+", "", text) # Remove url links
    text = re.sub(r"@\w+", "", text) # Remove mentions
    text = re.sub(r"#(\w+)", "", text) # Remove hastags
    text = re.sub(r"(?<!\.)\.(?!\.)|[^\w\s\?\!]", "", text)
    text = re.sub(r"\.\.\.", " ... ", text)  # Ensure '...' is treated as a single token
    
    text = re.sub(r"&[^;\s]+;", "", text) # Remove special characters
    text = re.sub(r"\s+", " ", text) # Remove extra spaces
    
    text = re.sub(r"\d+", "", text)
    
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words] # Apply stemming
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words] # Apply lemmatization
    text = " ".join(lemmatized_words)
    
    return text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Antonis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Antonis\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
import time

# Apply the cleaning function to every dataset file and create a new column with the modified text
start_time = time.time()
train_df["Cleaned_text"] = train_df["Text"].apply(clean_text)
val_df["Cleaned_text"] = val_df["Text"].apply(clean_text)
test_df["Cleaned_text"] = test_df["Text"].apply(clean_text)
end_time = time.time()

print(f"Datasets were cleaned in {end_time - start_time} seconds.")

train_df.head(10)

Datasets were cleaned in 46.984538555145264 seconds.


Unnamed: 0,ID,Text,Label,Cleaned_text
34518,67309,@suicidalcats That's cool. I've never been the...,1,that is cool i have never been there though _ ...
143195,90927,"No, I'm not coming on to you. I am quoting Ela...",0,no i am not come on to you i am quot elain fro...
106657,196113,"@musicalverse Ah well, at least we both got th...",1,ah well at least we both got the same wrong an...
97983,83600,"Morning!! Todays the game, and the Amount Boyz...",1,morning!! today the game and the amount boyz p...
114890,138451,@mot_mot lol no prob in that...so do i. especi...,0,lol no prob in thatso do i especi now that my ...
87958,109124,Good Morning Twitterland! Heading off to the g...,0,good morn twitterland! head off to the gym hav...
63081,200481,@JayEv3ryDay Awww... and I miss him lol,0,a and i miss him lol
40214,25025,Watching Atonement on HBO. It's so sad. Keira...,0,watch aton on hbo it is so sad keira knightli ...
108059,142794,damn. I lost my @mordecai account to someone e...,0,damn i lost my account to someon el that wa fast
50540,9413,@TIBlockhead this is so great...i think it's g...,0,thi is so greati think it is go to be a phenom...


In [6]:
import nltk
from nltk import data
from nltk.tokenize import word_tokenize
import time

warnings.filterwarnings("ignore", category=UserWarning, module="nltk")

def remove_stopwords(text: str) -> str:
    # Remove the stopwords from the original text
    stop_words = set(
        ["i", "to", "the", "is", "a", "you", "my", "and", 
         "it", "am", "for", "in", "of", "that", "on", "so", "me"]
    )
    word_tokens = word_tokenize(text)
    filtered_words = [word for word in word_tokens if word not in stop_words]

    return " ".join(filtered_words)

start_time = time.time()
train_df['Cleaned_text'] = train_df['Cleaned_text'].apply(remove_stopwords)
val_df['Cleaned_text'] = val_df['Cleaned_text'].apply(remove_stopwords)
test_df['Cleaned_text'] = test_df['Cleaned_text'].apply(remove_stopwords)
end_time = time.time()


print(f"Stopwords were removed in {end_time - start_time} seconds.")

Stopwords were removed in 18.365330934524536 seconds.
