# <font color = 'dodgerblue'>**Tokenization approaches spacy - Real Dataset**

# <font color = 'dodgerblue'>**Install/Import Libraries**

In [None]:
# Install/upgrade spaCy and install swifter, but only when running inside Google Colab
# (on other environments nothing is installed by this cell).
# NOTE(review): the spaCy upgrade is unpinned; the recorded output below shows it
# moving to spaCy 3.5.0, which pip reports as incompatible with the preinstalled
# en-core-web-sm 3.4.1.
if 'google.colab' in str(get_ipython()):
    %pip install -U spacy -qq
    %pip install swifter -qq

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.8/6.7 MB[0m [31m82.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.7/6.7 MB[0m [31m123.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.4.1 requires spacy<3.5.0,>=3.4.0, but you have spacy 3.5.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m830.9/830.9 KB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━

In [None]:
# Import the Path module from the pathlib library
from pathlib import Path

# Import the tarfile module for working with tar files
import tarfile

# Import the pandas library for working with data frames
import pandas as pd

# Import the spacy library for natural language processing
import spacy

# Import the List type from the typing module to use in function annotations
from typing import List

# Import the swifter package to speed up data processing tasks on pandas DataFrame and Series objects
import swifter



In [None]:
# check spacy version
spacy.__version__

'3.5.0'

# <font color = 'dodgerblue'>**Specify Data Folders**

In [None]:
# Use for normal projects
# Resolve the project folders for either Google Colab (Drive-backed) or a local machine.
# Only the base folder differs between the two environments. `subject` and the
# data/archive/output folders are defined outside the branch so that both
# environments end up with the same names (the local branch must not depend on
# a variable that is only assigned on Colab).
from pathlib import Path

subject = 'nlp'

if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')
  base_folder = Path('/content/drive/MyDrive/colab_notebooks/')
else:
  # NOTE(review): machine-specific absolute path - adjust it to your own setup
  base_folder = Path('C:/Users/Abdul Rauf Maroof/OneDrive/Documents/MSBA')

# folders derived from the base folder: extracted data, downloaded archives, results
data = base_folder/subject/'data/'
archive = base_folder/subject/'archive/'
output = base_folder/subject/'output'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# <font color = 'dodgerblue'>**Download Data**

## <font color = 'dodgerblue'>**Step1: use wget to download data files from URL**

In [None]:
# complete data link: http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

url='https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
file = archive/'aclImdb_v1.tar.gz'
if not file.exists(): # check if file already exists
    !wget {url} -P {archive_folder} -O {file}

## <font color = 'dodgerblue'>**Step2: check content of folder where data was downloaded**

In [None]:
# show the name of every entry in the archive folder whose name contains 'tar'
tar_names = (entry.name for entry in archive.iterdir() if 'tar' in entry.name)
for tar_name in tar_names:
  print(tar_name)

review_polarity (1).tar.gz
scale_whole_review (1).tar.gz
aclImdb_v1.tar.gz
scale_whole_review.tar.gz
review_polarity.tar.gz


## <font color = 'dodgerblue'>**Step3: Check content of zipped/tar folder**

In [None]:
# create a pathlib object for the file we want to untar
file = archive/'aclImdb_v1.tar.gz'

In [None]:
# List the member names of the tar archive using the tarfile library
# (only the names are collected here; nothing is extracted)
# you can skip running this cell
# NOTE(review): the next cell displays tar_file_names, so it must be skipped as well
with  tarfile.open(file, 'r') as tar:
  tar_file_names = tar.getnames()

In [None]:
tar_file_names[0:10]

['aclImdb',
 'aclImdb/test',
 'aclImdb/train',
 'aclImdb/test/neg',
 'aclImdb/test/pos',
 'aclImdb/train/neg',
 'aclImdb/train/pos',
 'aclImdb/train/unsup',
 'aclImdb/imdbEr.txt',
 'aclImdb/imdb.vocab']

In [None]:
# Collect the archive members via getmembers() (member objects rather than the
# plain names returned by getnames() in the earlier cell)
# NOTE(review): tar_file_members is not used in the cells shown after this one
with  tarfile.open(file, 'r') as tar:
  tar_file_members = tar.getmembers()

## <font color = 'dodgerblue'>**Step 4: unzip/untar files**

In [None]:
# file = archive/'aclImdb_v1.tar.gz'
# with tarfile.open(file, 'r') as tar:
#     tar.extractall(path = data)

In [None]:
# Unpack the archive into the `data` folder, skipping members that already exist there
# this cell can take time to run if you are running this for first time
file = archive/'aclImdb_v1.tar.gz'
with tarfile.open(file, 'r') as tar:
    # Get the list of names of members in the tar file
    member_names = tar.getnames()
    # Loop over each member name
    for member_name in member_names:
        # Get the path of the current member
        member_path = data / member_name
        # Extract the current member only if it does not already exist
        # NOTE(review): only existence is checked, so a file left incomplete by an
        # interrupted earlier run would be skipped rather than extracted again
        if not member_path.exists():
            tar.extract(member_name, path=data)

Here is an explanation of the code:

- `with tarfile.open(file, 'r') as tar:`: This line opens the tar archive file specified by file in read mode, and creates a TarFile object, which is stored in the variable tar. The with statement is used to ensure that the tar file is properly closed when the code inside the block is finished executing.

- `member_names = tar.getnames()`: This line retrieves a list of names of the members in the tar archive, and stores it in the variable member_names.

- `for member_name in member_names:` : This line starts a for loop that iterates over each member name in the list member_names.

- `member_path = data / member_name`: This line creates a Path object that represents the path of the current member in the loop, using the `data` variable and the current `member_name` variable.

- `if not member_path.exists():`: This line checks if the path represented by member_path exists.

- `tar.extract(member_name, path=data)`: If the path does not exist, this line extracts the current member from the tar archive and saves it under the `data` path.

## <font color = 'dodgerblue'>**Step 5: Understand the structure of unzipped folder**

In [None]:
# Print the folder structure of the extracted data set.
# we will use rglob which will help us to specify the pattern to search 
# ** - Recursively matches zero or more directories that fall under the current directory.
# NOTE(review): in the recorded output below only directories are listed, not the review files
for entries in (data/'aclImdb').rglob('**'):
  print(entries)

/content/drive/MyDrive/colab_notebooks/nlp/data/aclImdb
/content/drive/MyDrive/colab_notebooks/nlp/data/aclImdb/test
/content/drive/MyDrive/colab_notebooks/nlp/data/aclImdb/test/neg
/content/drive/MyDrive/colab_notebooks/nlp/data/aclImdb/test/pos
/content/drive/MyDrive/colab_notebooks/nlp/data/aclImdb/train
/content/drive/MyDrive/colab_notebooks/nlp/data/aclImdb/train/neg
/content/drive/MyDrive/colab_notebooks/nlp/data/aclImdb/train/pos
/content/drive/MyDrive/colab_notebooks/nlp/data/aclImdb/train/unsup


Explanation of the code:
- The `rglob` method is used to perform a recursive search for files and directories, and
- the `'**'` argument is used to match all subdirectories.

## <font color = 'dodgerblue'>**Step 6a: combine all text files and create dataframe**

In [None]:
# Function to combine reviews from multiple text files
# the concepts were covered in first lecture

def get_reviews(path: Path) -> List[str]:
    """
    Read every '.txt' file located directly inside a directory.

    Parameters:
    - path (Path): The directory to search for '.txt' files (sub-directories are not searched)

    Returns:
    - List[str]: The contents of each '.txt' file, one string per file, ordered by
      file path so that repeated runs return the reviews in the same order
    """
    # iterdir() yields paths that already include `path`, so they are used as they are
    # (joining them onto `path` again would break for relative directories);
    # sorting makes the result independent of the file system's listing order
    txt_files = sorted(
        entry for entry in path.iterdir()
        if entry.is_file() and entry.suffix == '.txt'
    )

    # read as UTF-8 explicitly instead of relying on the platform's default encoding
    # (assumes the review files are UTF-8 encoded - TODO confirm)
    return [txt_file.read_text(encoding='utf-8') for txt_file in txt_files]

In [None]:
# Function to create dataframe from extracted list of files

def make_dataframe(folder: Path) -> pd.DataFrame:
    """
    Build a labelled review DataFrame from a folder with 'pos' and 'neg' subdirectories.

    The reviews read from 'pos' come first and are labelled 1; the reviews read
    from 'neg' follow and are labelled 0.

    Parameters:
    - folder (Path): The directory path containing the 'pos' and 'neg' subdirectories

    Returns:
    - pd.DataFrame: A DataFrame with the columns 'Reviews' (review text) and
      'Labels' (int32, 1 = positive, 0 = negative)
    """
    # Read the review texts of each class
    pos_texts = get_reviews(folder / 'pos')
    neg_texts = get_reviews(folder / 'neg')

    # One label per review, in the same order as the combined texts
    labels = [1] * len(pos_texts) + [0] * len(neg_texts)

    frame = pd.DataFrame({'Reviews': pos_texts + neg_texts, 'Labels': labels})

    # Store the labels as 32-bit integers
    return frame.astype({'Labels': 'int32'})

In [None]:
# this cell can take 15 mins to run
# create a train data set
train_data = make_dataframe(data/'aclImdb/train')

In [None]:
# create a test data set
test_data = make_dataframe(data/'aclImdb/test')

### <font color = 'dodgerblue'>**Save dataframe to csv file**

In [None]:
# write the training DataFrame to train.csv inside the extracted aclImdb folder
train_data.to_csv(data/'aclImdb'/'train.csv')

In [None]:
# write the test DataFrame to test.csv inside the extracted aclImdb folder
test_data.to_csv(data/'aclImdb'/'test.csv')

In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Reviews  25000 non-null  object
 1   Labels   25000 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 293.1+ KB


# <font color = 'dodgerblue'>**Load csv file**

In [None]:
# reload the training data from the saved csv file;
# index_col=0 uses the first csv column as the DataFrame index
train_data = pd.read_csv(data/ 'aclImdb'/'train.csv', index_col=0)

In [None]:
# Printing shape of dataframe
train_data.shape

(25000, 2)

In [None]:
# display first five rows
train_data.head()

Unnamed: 0,Reviews,Labels
0,Zentropa has much in common with The Third Man...,1
1,Zentropa is the most original movie I've seen ...,1
2,Lars Von Trier is never backward in trying out...,1
3,*Contains spoilers due to me having to describ...,1
4,That was the first thing that sprang to mind a...,1


# <font color = 'dodgerblue'>**Import Spacy Model**

In [None]:
# download the small English spaCy model (en_core_web_sm); -qq keeps the installer output short
!python -m spacy download en_core_web_sm -qq

2023-03-01 18:32:48.899900: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-01 18:32:48.900011: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-01 18:32:50.528347: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download

# <font color = 'dodgerblue'>**Compare tokenization approaches**

In [None]:
# We will load the model -en_core_web_sm
nlp = spacy.load('en_core_web_sm')

## <font color = 'dodgerblue'>**Method1 : Typical approach using spacy**

In [None]:
def tokenize(text: str) -> List[str]:
    """Split a text into tokens with the notebook-level spaCy pipeline ``nlp``.

    Args:
    text: The input text to be tokenized.

    Returns:
    The text of every token of the processed document, in document order.
    """
    # Run the pipeline and keep only the text of each token
    return [tok.text for tok in nlp(text)]

In [None]:
# DO NOT RUN this cell in the class
# it is only for demonstration purposes and it can take a long time:
# - it took around 8 minutes on a machine with 128 GB RAM
# - it took 21 minutes on colab
# Applies tokenize() to each review (through swifter) and stores the token lists in a new column
train_data['tokens_method1'] = train_data['Reviews'].swifter.apply(tokenize)

Pandas Apply:   0%|          | 0/25000 [00:00<?, ?it/s]

In [None]:
train_data.head()

Unnamed: 0,Reviews,Labels,tokens_method1
0,Zentropa has much in common with The Third Man...,1,"[Zentropa, has, much, in, common, with, The, T..."
1,Zentropa is the most original movie I've seen ...,1,"[Zentropa, is, the, most, original, movie, I, ..."
2,Lars Von Trier is never backward in trying out...,1,"[Lars, Von, Trier, is, never, backward, in, tr..."
3,*Contains spoilers due to me having to describ...,1,"[*, Contains, spoilers, due, to, me, having, t..."
4,That was the first thing that sprang to mind a...,1,"[That, was, the, first, thing, that, sprang, t..."


## <font color = 'dodgerblue'>**Method 2: Using nlp.pipe from Spacy**

In [None]:
import os
os.cpu_count()

4

In [None]:
## DO NOT Run this cell in the class

# Tokenize all reviews with nlp.pipe, keeping the full pipeline enabled.
# spaCy includes built-in support for multiprocessing with nlp.pipe
# this can speed up the processing
# it took 1 min 42 secs on a 128 gb RAM machine with 16 cores
# it took 10 mins on colab (colab free has 2 cores)

# initialize an empty list to store tokens
tokens_method2 = []

# process multiple documents in parallel using the spaCy NLP library
# (documents are handed over in batches of 1000 and spread over 3 worker processes;
# NOTE(review): n_process=3 is hard-coded - check it against os.cpu_count() on your machine)
for doc in nlp.pipe(train_data.Reviews.values, batch_size=1000, n_process=3):
    # extract text of each token in the document and create a list of tokens
    tokens = [token.text for token in doc] 
    # add the list of tokens to the tokens_method2
    tokens_method2.append(tokens)

# add the tokens_method2 to the train_data dataframe as a new column 'tokens_method2'
train_data['tokens_method2'] = tokens_method2

This code performs tokenization on the `train_data.Reviews.values` by using the spaCy NLP library (`nlp`).

- The **`nlp.pipe` method is used to process multiple documents in parallel**, where `batch_size=1000` and `n_process=3` specify the batch size and the number of CPU processes to use, respectively.

- For each document in the batch, the code creates a list of tokens, represented by the text of the spaCy token objects, using a list comprehension `[token.text for token in doc]`.

- The resulting list of tokens is then appended to `tokens_method2`. Finally, the `tokens_method2` list is added as a new column `'tokens_method2'` to the `train_data` dataframe.






In [None]:
train_data.head()

Unnamed: 0,Reviews,Labels,tokens_method1,tokens_method2
0,Zentropa has much in common with The Third Man...,1,"[Zentropa, has, much, in, common, with, The, T...","[Zentropa, has, much, in, common, with, The, T..."
1,Zentropa is the most original movie I've seen ...,1,"[Zentropa, is, the, most, original, movie, I, ...","[Zentropa, is, the, most, original, movie, I, ..."
2,Lars Von Trier is never backward in trying out...,1,"[Lars, Von, Trier, is, never, backward, in, tr...","[Lars, Von, Trier, is, never, backward, in, tr..."
3,*Contains spoilers due to me having to describ...,1,"[*, Contains, spoilers, due, to, me, having, t...","[*, Contains, spoilers, due, to, me, having, t..."
4,That was the first thing that sprang to mind a...,1,"[That, was, the, first, thing, that, sprang, t...","[That, was, the, first, thing, that, sprang, t..."


## <font color = 'dodgerblue'>**Method 3: Using nlp.pipe and disable not required components**

In [None]:
# in addition to multiprocessing with nlp.pipe
# we can get significant speed improvements if we disable the components that we do not need
# it took around 3 minutes
# it took 26 secs on a 128 gb RAM machine with 16 cores
# 50 secs on colab

# initialize an empty list to store tokens
token_list_method3 = []

# pipeline components that are switched off while tokenizing
unused_pipes = ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

# temporarily disable the named pipes of spaCy NLP processing pipeline;
# the with-block restores them when it ends, even if processing raises an error,
# so `nlp` is never left with components disabled
with nlp.select_pipes(disable=unused_pipes):
    # process multiple documents in parallel using the spaCy NLP library
    for doc in nlp.pipe(train_data.Reviews.values, batch_size=1000, n_process=3):
        # extract text of each token in the document and create a list of tokens
        tokens = [token.text for token in doc]
        # add the list of tokens to the token_list_method3
        token_list_method3.append(tokens)

# add the token_list_method3 to the train_data dataframe as a new column 'tokens_method3'
train_data['tokens_method3'] = token_list_method3

In [None]:
train_data.head()

Unnamed: 0,Reviews,Labels,tokens_method1,tokens_method2,tokens_method3
0,Zentropa has much in common with The Third Man...,1,"[Zentropa, has, much, in, common, with, The, T...","[Zentropa, has, much, in, common, with, The, T...","[Zentropa, has, much, in, common, with, The, T..."
1,Zentropa is the most original movie I've seen ...,1,"[Zentropa, is, the, most, original, movie, I, ...","[Zentropa, is, the, most, original, movie, I, ...","[Zentropa, is, the, most, original, movie, I, ..."
2,Lars Von Trier is never backward in trying out...,1,"[Lars, Von, Trier, is, never, backward, in, tr...","[Lars, Von, Trier, is, never, backward, in, tr...","[Lars, Von, Trier, is, never, backward, in, tr..."
3,*Contains spoilers due to me having to describ...,1,"[*, Contains, spoilers, due, to, me, having, t...","[*, Contains, spoilers, due, to, me, having, t...","[*, Contains, spoilers, due, to, me, having, t..."
4,That was the first thing that sprang to mind a...,1,"[That, was, the, first, thing, that, sprang, t...","[That, was, the, first, thing, that, sprang, t...","[That, was, the, first, thing, that, sprang, t..."
