In [8]:
# Download and install the small English spaCy pipeline (en_core_web_sm).
# NOTE(review): `!python` runs whichever `python` the shell resolves, which may
# not be the interpreter backing this kernel - confirm they are the same environment.
!python -m spacy download en_core_web_sm

# spaCy is a modern and efficient library suited for industrial NLP applications
# NLTK is a comprehensive toolkit that offers more flexibility and is often used for educational and research purposes

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB 1.7 MB/s eta 0:00:08
     - -------------------------------------- 0.5/12.8 MB 1.7 MB/s eta 0:00:09
     -- ------------------------------------- 0.7/12.8 MB 1.5 MB/s eta 0:00:08
     -- ------------------------------------- 0.8/12.8 MB 1.4 MB/s eta 0:00:09
     -- ------------------------------------- 0.9/12.8 MB 1.4 MB/s eta 0:00:09
     --- ------------------------------------ 1.1/12.8 MB 1.4 MB/s eta 0:00:09
     --- ------------------------------------ 1.2/12.8 MB 1.4 MB/s eta 0:00:09
     ---- ----------------------------------- 1.4/12.8 MB 1.4 MB/s eta 0:00:09
     ---- ----------------------------------- 1.6/12.8 MB 1.3 MB/s eta 0:00:09
     ----- ------------------------------

en: This indicates that the model is for the English language.

core: This suggests that the model contains core linguistic components, such as tokenization, part-of-speech tagging, dependency parsing, named entity recognition, and more. These components are essential for various NLP tasks.

web: This signifies that the model was trained on web data, which typically includes a broad range of text genres and styles commonly found on the internet.

sm: This stands for "small". It indicates that the model is one of the smaller-sized models provided by spaCy. Small models are lighter in size and require less memory compared to larger models, making them more suitable for applications with limited computational resources or deployment in resource-constrained environments.

## **Methods:**

### **1. Modifying vocab.is_stop attribute:**

#### Importing necessary libraries

In [9]:
import spacy
from spacy.lang.en import STOP_WORDS

# Load the small English pipeline; `nlp` is the pipeline object the following cells call.
nlp = spacy.load("en_core_web_sm")

#### Adding custom words to the list of stopwords

In [10]:
# Define custom stopwords
custom_stopwords = {'NIL', 'JUNK'}

# Add custom stopwords to the spaCy language model.
# `is_stop` is a flag on an individual vocabulary entry (one exact string), so
# flagging only 'NIL' would leave 'nil' and 'Nil' untouched. Flag the common
# case variants as well, so this method is case-insensitive like the
# lower-casing comparisons used by the other two methods in this notebook.
for word in custom_stopwords:
    for variant in {word, word.lower(), word.upper(), word.title()}:
        nlp.vocab[variant].is_stop = True

#### Python code for removing custom stopwords after tokenisation

In [11]:
# Function to remove stopwords from text
def remove_stopwords(text):
    """Tokenise ``text`` with the global ``nlp`` pipeline and drop stop words.

    Returns a list holding the text of every token whose ``is_stop`` flag is
    false, in document order.
    """
    kept = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        kept.append(tok.text)
    return kept

In [12]:
# Sample sentence that contains the custom stopwords NIL and JUNK
text = "This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords."
# remove_stopwords (defined in the previous cell) returns the surviving tokens as a list
filtered_tokens = remove_stopwords(text)

# Print the input and the filtered token list for comparison
print("Original text:", text, "\n")
print("Text after stop word removal:", filtered_tokens)

Original text: This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords. 

Text after stop word removal: ['example', 'sentence', 'demonstrating', 'stop', 'word', 'removal', '.', 'custom', 'stopwords', '.']


#### Python code for removing custom stopwords and returning the result as a single string

In [13]:
# Function to remove stopwords from text
def remove_stopwords(text):
    """Return ``text`` with stop words removed, as one space-separated string.

    The text is still tokenised by the global ``nlp`` pipeline; the texts of
    the tokens whose ``is_stop`` flag is false are joined with single spaces.
    """
    return ' '.join(tok.text for tok in nlp(text) if not tok.is_stop)

In [14]:
# Same sample sentence; this time remove_stopwords returns a single joined string
text = "This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords."
filtered_text = remove_stopwords(text)
# Print the input and the filtered string for comparison
print("Original text:", text, '\n')
print("Text after stop word removal:", filtered_text)

Original text: This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords. 

Text after stop word removal: example sentence demonstrating stop word removal . custom stopwords .


### **2. Modifying Defaults.stop_words:**

#### Importing necessary models and adding custom stopwords

In [15]:
# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Define custom stopwords (lower case, because the filter lower-cases each token)
custom_stopwords = ['nil', 'junk']

# Register every custom stopword on the pipeline defaults in a single call
nlp.Defaults.stop_words.update(custom_stopwords)

#### Python code for removing custom stopwords after tokenisation

In [16]:
# Function to remove stopwords from text
def remove_stopwords(text):
    """Tokenise ``text`` and drop tokens listed in ``nlp.Defaults.stop_words``.

    Each token's text is lower-cased before the membership check, so the
    comparison is case-insensitive for lower-case stop-word entries.
    Returns the surviving token texts as a list, in document order.
    """
    kept = []
    for tok in nlp(text):
        if tok.text.lower() in nlp.Defaults.stop_words:
            continue
        kept.append(tok.text)
    return kept

In [17]:
# Sample sentence that contains the custom stopwords NIL and JUNK
text = "This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords."
# NOTE: despite its name, filtered_text holds the list of tokens returned by remove_stopwords
filtered_text = remove_stopwords(text)
# Print the input and the filtered token list for comparison
print("Original text:", text, '\n')
print("Text after stop word removal:", filtered_text)

Original text: This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords. 

Text after stop word removal: ['example', 'sentence', 'demonstrating', 'stop', 'word', 'removal', '.', 'custom', 'stopwords', '.']


### **3. Override Language.Defaults.stop_words:**

#### Adding custom stopwords

In [18]:
# Define custom stopwords
new_stopwords = {'NIL', 'JUNK'}

# Get default stopwords from spaCy and convert to a set
# (set(...) builds a new set object, so the union below does not touch the defaults)
default_stopwords = set(nlp.Defaults.stop_words)

# Add custom stopwords to the default stopwords.
# The filter in the next cell compares token.text.lower() against this set, so
# the custom words must be stored in lower case; upper-case entries such as
# 'NIL' could never match a lower-cased token.
updated_stopwords = default_stopwords.union(word.lower() for word in new_stopwords)

#### Python code for removing custom stopwords after tokenisation

In [19]:
# Function to remove stopwords from text
def remove_stopwords(text):
    """Tokenise ``text`` and drop tokens found in the global ``updated_stopwords``.

    Each token's text is lower-cased before the membership check. Returns the
    surviving token texts as a list, in document order.
    """
    def is_kept(token_text):
        return token_text.lower() not in updated_stopwords

    return [tok.text for tok in nlp(text) if is_kept(tok.text)]

In [20]:
# Sample sentence that contains the custom stopwords NIL and JUNK
text = "This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords."
# NOTE: despite its name, filtered_text holds the list of tokens returned by remove_stopwords
filtered_text = remove_stopwords(text)
print("Original text:", text, '\n')
# Join the token list with spaces so the result prints as a sentence-like string
print("Text after stop word removal:", ' '.join(filtered_text))

Original text: This is an example sentence demonstrating stop word removal. NIL and JUNK are custom stopwords. 

Text after stop word removal: example sentence demonstrating stop word removal . custom stopwords .
