## 1. Importing necessary libraries

In [14]:
!pip install tensorflow_datasets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [15]:
import pandas as pd
import numpy as np
import seaborn as sns

import re
import string
# import tensorflow as tf
import tensorflow_datasets as tfds
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfTransformer
from keras_preprocessing.text import text_to_word_sequence
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

## 2. Import Preprocessor module

### Basic cleaning and preprocessing

In [16]:
# Function to clean and tokenize
def clean_text(text, tokenizer, stopwords):
    """Normalise a raw string and split it into filtered tokens.

    Args:
        text: Value to clean; it is converted with ``str()`` and lowercased.
        tokenizer: Callable that receives the cleaned string and returns an
            iterable of tokens.
        stopwords: Container of tokens to discard (membership is tested
            with ``in``).

    Returns:
        List of tokens that are not in ``stopwords``, for which
        ``str.isdigit()`` is false, and that are longer than one character.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"\[(.*?)\]", ""),            # drop bracketed spans such as "[+XYZ chars]"
        (r"\s+", " "),                 # collapse every whitespace run to one space
        (r"\w+…|…", ""),               # drop "…" together with the word it cuts off
        (r"(?<=\w)-(?=\w)", " "),      # split words joined by a dash
        # Punctuation is deleted, not replaced by a space, so text such as
        # "a/b" comes out as the single token "ab".
        (f"[{re.escape(string.punctuation)}]", ""),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return [
        token
        for token in tokenizer(cleaned)
        if token not in stopwords and not token.isdigit() and len(token) > 1
    ]

### Writing to csv file

In [18]:
def file_write(X, y, dataname):
    """Pair the items of ``X`` and ``y`` and save them as ``../data/<dataname>.csv``.

    The pairs are built with ``zip(X, y)``, so pairing stops at the shorter
    of the two inputs. The CSV is written with pandas' default settings.

    Note: a later cell in this notebook defines another ``file_write`` that
    takes an extra ``output_dir`` parameter; once that cell has run, it
    replaces this definition.

    Args:
        X: Iterable supplying the first element of each row.
        y: Iterable supplying the second element of each row.
        dataname: File name (without extension) of the CSV to create.
    """
    paired_rows = [pair for pair in zip(X, y)]
    pd.DataFrame(paired_rows).to_csv(f'../data/{dataname}.csv')
    
    

## 3. Implementation

**IMDb Reviews** is a large dataset for binary sentiment classification. It consists of 50,000 highly polar English-language reviews, split evenly between the training and test sets.

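The dataset contains additional unlabelled data. A negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10. No more than 30 reviews are included per movie.

> **Note:** the code cells below do not load IMDb Reviews; they load the `aayya/sst2-augmented` dataset with `load_dataset`, so the description above does not match the data that is actually processed.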

In [4]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Fetch the "aayya/sst2-augmented" dataset; `ds` and `dataset` are two names
# for the same object.
# (Earlier version of this cell: tfds.load('imdb_reviews', as_supervised=False))
ds = dataset = load_dataset("aayya/sst2-augmented")

In [22]:
# Bind each split of `ds` to its own notebook-level name.
# `validate_datset` keeps its existing spelling so other cells that use the
# name keep working.
(
    orig_dataset,
    orig_eda_dataset,
    orig_eda_embedding_dataset,
    orig_eda_embedding_wordnet_dataset,
    validate_datset,
) = (
    ds['train_orig'],
    ds['train_orig_eda'],
    ds['train_orig_eda_embedding'],
    ds['train_orig_eda_embedding_wordnet'],
    ds['val'],
)

dict_keys(['train_orig', 'train_orig_eda', 'train_orig_eda_embedding', 'train_orig_eda_embedding_wordnet', 'val'])

In [32]:
# train_dat = pd.DataFrame(dataset['train'])
# test_dat = pd.DataFrame(dataset['test'])

# X_train, y_train = pd.DataFrame([s for s in train_dat['sentence']], columns=['text']), train_dat['label']
# X_test, y_test = pd.DataFrame([s for s in test_dat['sentence']], columns=['text']), test_dat['label']

def get_Xy(datadict, dataname):
    """Split one entry of ``datadict`` into a text frame and its labels.

    Args:
        datadict: Mapping from split name to an object that can be indexed
            with ``'sentence'`` and ``'label'``.
        dataname: Name of the split to read; it is formatted into a string
            before the lookup.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a DataFrame with a single ``'text'``
        column holding the sentences and ``y`` is the split's ``'label'``
        value, returned as-is.
    """
    split = datadict[f'{dataname}']
    sentences = list(split['sentence'])
    features = pd.DataFrame(sentences, columns=['text'])
    labels = split['label']
    return features, labels
    

In [30]:
def remove_stopwords(X):
    """Replace every entry of ``X['text']`` with its cleaned token list.

    Each value is passed to ``clean_text`` together with ``word_tokenize``
    and the English stopword set from ``stopwords.words("english")``.

    Args:
        X: DataFrame with a ``'text'`` column. The column is overwritten on
            the frame that was passed in.

    Returns:
        The same ``X`` object, with ``'text'`` now holding the values
        returned by ``clean_text``.
    """
    english_stops = set(stopwords.words("english"))

    def _to_tokens(raw_text):
        return clean_text(raw_text, word_tokenize, english_stops)

    X['text'] = X['text'].map(_to_tokens)
    return X
    

In [None]:
# stop_words = set(stopwords.words("english"))

# X_train['text'] = X_train['text'].map(lambda x:clean_text(x, word_tokenize, stop_words))
# X_test['text'] = X_test['text'].map(lambda x:clean_text(x, word_tokenize, stop_words))

In [None]:
# X_train

Unnamed: 0,text
0,"[rock, destined, 21st, century, new, conan, go..."
1,"[gorgeously, elaborate, continuation, lord, ri..."
2,"[singercomposer, bryan, adams, contributes, sl..."
3,"[think, america, would, enough, plucky, britis..."
4,"[yet, act, still, charming]"
...,...
8539,"[real, snooze]"
8540,[surprises]
8541,"[seen, hippie, turned, yuppie, plot, enthusias..."
8542,"[fans, walked, muttering, words, like, horribl..."


In [None]:
# y_train

0       0.69444
1       0.83333
2       0.62500
3       0.50000
4       0.72222
         ...   
8539    0.11111
8540    0.22222
8541    0.75000
8542    0.13889
8543    0.34722
Name: label, Length: 8544, dtype: float64

In [29]:
def file_write(X, y, dataname, output_dir='/mnt/disk1/hieupcvp/RNN/SentimenalAnalysis/data'):
    """Write the tokenised texts and their labels to ``<output_dir>/<dataname>.csv``.

    Each token list in ``X['text']`` is joined with single spaces so the CSV
    holds one plain string per row; a value that is already a string is
    written unchanged. The file has two columns, ``text`` and ``label``, and
    no index column.

    NOTE(review): the default ``output_dir`` is an absolute path specific to
    one machine; pass ``output_dir`` explicitly when running elsewhere.

    Args:
        X: DataFrame with a ``'text'`` column of token sequences (or strings).
        y: Labels to store in the ``label`` column.
        dataname: File name (without extension) of the CSV to create.
        output_dir: Directory that receives the file; it is created if it
            does not exist yet.

    Raises:
        ValueError: If ``X`` is not a DataFrame or has no ``'text'`` column.
    """
    import os  # local import: `os` is not imported by the notebook's import cell

    if not (isinstance(X, pd.DataFrame) and 'text' in X.columns):
        raise ValueError("X must be a DataFrame with a 'text' column.")

    def _as_line(tokens):
        # A plain string is already one line of text; joining it would put a
        # space between every character.
        return tokens if isinstance(tokens, str) else ' '.join(tokens)

    texts = X['text'].apply(_as_line)
    export = pd.DataFrame({'text': texts, 'label': y})

    # Create the target directory up front so the write does not fail on a
    # fresh checkout or machine.
    os.makedirs(output_dir, exist_ok=True)
    export.to_csv(f'{output_dir}/{dataname}.csv', index=False)

In [33]:
# Export every split in `ds`: build (X, y), clean the text column, write the CSV.
# remove_stopwords returns the frame it was given, so its result is passed
# straight on to file_write.
for dataname in ds.keys():
    X, y = get_Xy(ds, dataname)
    file_write(remove_stopwords(X), y, dataname)
    