# 1. Text Operations

In [1]:
# Install libraries
!pip install nltk
!pip install spacy

Collecting spacy
  Downloading spacy-3.7.4-cp39-cp39-macosx_10_9_x86_64.whl (6.9 MB)
[K     |████████████████████████████████| 6.9 MB 1.7 MB/s eta 0:00:01
[?25hCollecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.5-py3-none-any.whl (22 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Downloading murmurhash-1.0.10-cp39-cp39-macosx_10_9_x86_64.whl (26 kB)
Collecting pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4
  Downloading pydantic-2.7.3-py3-none-any.whl (409 kB)
[K     |████████████████████████████████| 409 kB 8.3 MB/s eta 0:00:01
[?25hCollecting langcodes<4.0.0,>=3.2.0
  Downloading langcodes-3.4.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 2.5 MB/s eta 0:00:01
[?25hCollecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.8-cp39-cp39-macosx_10_9_x86_64.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.7 MB/s eta 0:00:01
[?25hCollecting srsly<3.0.0,>=2.4.3
  Downloading srsly-2.4.8-cp39-cp39-macosx_10_9_x86_64.whl (493 kB)
[K     |

In [4]:
# Import the library
import nltk

In [10]:
# Sample sentence; the tokenization cell below turns it into `tokens`
text = "Ben relocated to Paris last year to pursue his passion. His currently enrolles in a comprehensive course on Natural Language Processing"
text

'Ben relocated to Paris last year to pursue his passion. His currently enrolles in a comprehensive course on Natural Language Processing'

In [6]:
# Download the 'punkt' tokenizer package (tokenizers/punkt.zip) ahead of the
# word_tokenize call in the next cell
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/deepshah/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Tokenization = split the sentence into word and punctuation tokens
from nltk.tokenize import word_tokenize
tokens = word_tokenize(text)
print(tokens)

['Ben', 'relocated', 'to', 'Paris', 'last', 'year', 'to', 'pursue', 'his', 'passion', '.', 'His', 'currently', 'enrolles', 'in', 'a', 'comprehensive', 'course', 'on', 'Natural', 'Language', 'Processing']


In [12]:
# Stemming: collapse every token to a crude root form. As the output shows,
# stems come back lowercased and need not be real words ('Paris' -> 'pari').
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens))
print(stemmed_words)

['ben', 'reloc', 'to', 'pari', 'last', 'year', 'to', 'pursu', 'hi', 'passion', '.', 'hi', 'current', 'enrol', 'in', 'a', 'comprehens', 'cours', 'on', 'natur', 'languag', 'process']


In [14]:
# Download the tagger model (taggers/averaged_perceptron_tagger.zip) ahead of
# the part-of-speech tagging cell below
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/deepshah/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [15]:
# Part-of-speech tagging: pair every token with a grammatical tag,
# producing (token, tag) tuples such as ('Ben', 'NNP')
from nltk import pos_tag
text_pos_tag = pos_tag(tokens)
print(text_pos_tag)

[('Ben', 'NNP'), ('relocated', 'VBD'), ('to', 'TO'), ('Paris', 'NNP'), ('last', 'JJ'), ('year', 'NN'), ('to', 'TO'), ('pursue', 'VB'), ('his', 'PRP$'), ('passion', 'NN'), ('.', '.'), ('His', 'PRP$'), ('currently', 'RB'), ('enrolles', 'VBZ'), ('in', 'IN'), ('a', 'DT'), ('comprehensive', 'JJ'), ('course', 'NN'), ('on', 'IN'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'NNP')]


In [17]:
# Look up the description and example words for a tag (here "NNP").
# The 'tagsets' package (help/tagsets.zip) is downloaded first.
nltk.download('tagsets')
nltk.help.upenn_tagset("NNP")

[nltk_data] Downloading package tagsets to
[nltk_data]     /Users/deepshah/nltk_data...


NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


[nltk_data]   Unzipping help/tagsets.zip.


In [21]:
# Named Entity Recognition: chunk the POS-tagged tokens into labelled entities
# (PERSON, GPE and ORGANIZATION appear in the output below)
nltk.download('maxent_ne_chunker')
nltk.download('words')
entities = nltk.ne_chunk(text_pos_tag)
print(entities)

(S
  (PERSON Ben/NNP)
  relocated/VBD
  to/TO
  (GPE Paris/NNP)
  last/JJ
  year/NN
  to/TO
  pursue/VB
  his/PRP$
  passion/NN
  ./.
  His/PRP$
  currently/RB
  enrolles/VBZ
  in/IN
  a/DT
  comprehensive/JJ
  course/NN
  on/IN
  (ORGANIZATION Natural/NNP Language/NNP)
  Processing/NNP)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/deepshah/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/deepshah/nltk_data...
[nltk_data]   Package words is already up-to-date!


# 2. Word Embeddings

In [22]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 18 kB/s  eta 0:00:012     |██████████████████████▎         | 409.7 MB 4.1 MB/s eta 0:00:44     |████████████████████████▎       | 446.0 MB 11.3 MB/s eta 0:00:13
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [24]:
import spacy

# Load the English model
nlp = spacy.load('en_core_web_lg')

# Run each word through the model
word1, word2, word3 = (nlp(word) for word in ("king", "queen", "apple"))

# Compare "king" with "queen", then "king" with "apple"
similarity1, similarity2 = (word1.similarity(other) for other in (word2, word3))

# Display the similarities, one per line
print(similarity1, similarity2, sep="\n")

0.6108841234425123
0.19521993793686707


# 3. NLP Pipeline

## Data Acquisition

Sourcing various texts from digital sources such as digital libraries, websites, and transcripts.

Public data, Web Scraping, APIs

Data Augmentation - Enhancing our dataset by making slight variations to existing texts (Synonyms, Rephrasing, increasing volume).

In [31]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Load the data: two science newsgroups (subset='all'), with headers,
# footers and quoted text removed from every post
posts = fetch_20newsgroups(
    subset='all',
    categories=['sci.electronics', 'sci.space'],
    remove=('headers', 'footers', 'quotes'),
)

In [32]:
# See the available keys in the dataset
print(posts.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [33]:
# Display one post (index 1) as raw text
print(posts.data[1])

AL>>        Question:   Is there a certain device out there that I can
AL>>                    use to find out the number to the line?
AL>>        Thanks for any response.
AL>>                                                    Al

AL>There is a number you can call which will return a synthesized
AL>voice telling you the number of the line.  Unfortunately, for the
AL>life of me I can't remember what it is. The telephone technicians
AL>use it all the time.  We used to play around with this in our
AL>dorm rooms since there were multiple phone lines running between
AL>rooms.

It probably wouldn't help for you to post the number, since it appears
to be different in each area.  For what it's worth, in the New Orleans
area the number is 998-877-6655 (easy to remember, what?)


 * SLMR 2.1 * Ask me anything: if I don't know, I'll make up something.
                                          


In [34]:
# Create a DataFrame with the raw post text and its newsgroup name
df = pd.DataFrame(
    dict(
        text=posts.data,
        # translate each integer target into the matching category name
        label=[posts.target_names[idx] for idx in posts.target],
    )
)

print(df.head())
df.shape

                                                text            label
0  \n   >\tIf the  new  Kuiper belt object *is*  ...        sci.space
1  AL>>        Question:   Is there a certain dev...  sci.electronics
2  \nIt's not quite what you were asking, but a f...        sci.space
3  \n\n\nNo, the sky does not, at this time, belo...        sci.space
4   \nDigi-Key also sells Quad Line Receivers, pa...  sci.electronics


(1971, 2)

## Data Cleaning and Preprocessing

We ensure that data is validated and filtered to guarantee accuracy, relevance and the absence of errors or duplicates.

Step 1: Clean and Standardize the Data 

Addressing issues like Spelling, grammar, typographical errors, and whitespaces

Standardizing capitalization, date formats, encoding

Discarding irrelevant elements like metadata or non-textual content

Step 2: Deconstruct Natural Language

Tokenization -> Remove stop words (like "and") -> Stemming (faster but less precise) or Lemmatization (more accurate but needs more resources) to reduce words to their base forms -> POS tagging (categorizing each token according to its grammatical role, such as noun or verb) and Named Entity Recognition (identifies and classifies key information)

In [38]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Corpora for the clean_text function defined below: the stop-word list and
# WordNet (omw-1.4 is presumably needed by the WordNet lemmatizer - confirm)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deepshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/deepshah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/deepshah/nltk_data...


True

In [45]:
# Define function to clean and pre-process text

# The stop-word set and the lemmatizer are created once, on first use, and then
# shared by every call; rebuilding them for each row of a DataFrame is wasted work.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANING_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEANING_RESOURCES['stop_words'], _CLEANING_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Steps, in order: tokenize with ``word_tokenize``; drop tokens that are not
    purely alphabetic; lowercase; drop English stop words; lemmatize with
    ``WordNetLemmatizer``.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving lemmas joined by single spaces (``''`` if none survive).
    """
    stop_words, lemmatizer = _get_cleaning_resources()

    # Keep alphabetic tokens only, lowercased before the stop-word lookup so
    # the comparison is made on lowercase forms.
    words = (token.lower() for token in word_tokenize(text) if token.isalpha())

    # No part of speech is passed, so the lemmatizer's default applies.
    lemmas = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Returned directly: the original stored this in a local named like the function.
    return ' '.join(lemmas)

In [46]:
# Apply the function to the data
# (adds a 'clean_text' column to df in place; the raw 'text' column is kept)
df['clean_text'] = df['text'].apply(clean_text)
print(df.head())

                                                text            label  \
0  \n   >\tIf the  new  Kuiper belt object *is*  ...        sci.space   
1  AL>>        Question:   Is there a certain dev...  sci.electronics   
2  \nIt's not quite what you were asking, but a f...        sci.space   
3  \n\n\nNo, the sky does not, at this time, belo...        sci.space   
4   \nDigi-Key also sells Quad Line Receivers, pa...  sci.electronics   

                                          clean_text  
0  new kuiper belt object called next one called ...  
1  al question certain device al use find number ...  
2  quite asking year ago helped ee remote sensing...  
3  sky time belong anyone ownership necessary def...  
4  also sell quad line receiver part quad line dr...  


In [48]:
# Keep only the clean text column and the label (the raw 'text' column is dropped)
clean_data = df[['clean_text', 'label']]
print(clean_data.head())

                                          clean_text            label
0  new kuiper belt object called next one called ...        sci.space
1  al question certain device al use find number ...  sci.electronics
2  quite asking year ago helped ee remote sensing...        sci.space
3  sky time belong anyone ownership necessary def...        sci.space
4  also sell quad line receiver part quad line dr...  sci.electronics


## Feature Extraction

Convert preprocessed texts into a numerical format for NLP algorithms.

One-hot encoding: Each word is encoded as a unique vector as long as the entire vocabulary, with a one in that word's position and zeroes everywhere else. This creates a sparse matrix, which is computationally inefficient.

The Bag-of-Words model reduces this sparsity by representing each text as a vector of word-frequency counts, without considering grammar or word order. It is useful for classifying texts that share similar words.

The more frequent the word, the more important it might be to the text. Drawbacks - Ignores word order.

Bag-of-n-grams Model extends this by considering a sequence of n words together capturing more context. Increasing n provides richer detail but can lead to feature explosion risking model overfitting.

TF-IDF (Term Frequency–Inverse Document Frequency) helps identify the most significant words in a document. TF counts how often a word appears in a given document, while IDF measures how rare a term is across the corpus. Mathematically, TF-IDF is their product, $\text{tfidf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$, assigning a score that reflects the word's importance in a document relative to a collection of documents. A higher TF-IDF score suggests greater relevance; a higher IDF indicates a rarer term.

Word embeddings are another type of representation in NLP, where words from a vocabulary are mapped to vectors of real numbers in a predefined space. Each word is represented by a dense vector that captures its semantic meaning based on the context in which it appears. These embeddings are learned from large corpora of text data. Popular model: Word2Vec.

In [50]:
from sklearn.model_selection import train_test_split

# Features are the cleaned posts; targets are the newsgroup names
X = df['clean_text']
y = df['label']

# Perform the train-test split (default proportions; random_state fixed so the
# split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1478,) (1478,)
(493,) (493,)


In [58]:
# Vectorize the data

# Bag-of-Words
# CountVectorizer counts how many times each vocabulary word appears in each document
from sklearn.feature_extraction.text import CountVectorizer
# min_df=10 keeps only terms that occur in at least 10 training documents.
# Optional: pass ngram_range=(2, 2) to count 2-word combinations instead; the
# tuple sets the lower and upper bound of n for the word n-grams extracted.
count_vect = CountVectorizer(min_df=10)

# Fit and transform the training data: learn the vocabulary from the training
# set, then turn the training texts into a matrix of token counts
X_train_counts = count_vect.fit_transform(X_train)

# Transform the test data: transform only, do not fit. The test texts are
# counted against the vocabulary learned from the training data, which
# prevents data leakage
X_test_counts = count_vect.transform(X_test)

# Display feature names
counts_df = pd.DataFrame(X_train_counts.toarray(), columns = count_vect.get_feature_names_out())
print(counts_df.head())

   ability  able  absolutely  ac  acceleration  accept  accepted  access  \
0        0     0           0   0             0       0         0       0   
1        0     0           0   0             0       0         0       0   
2        0     0           0   0             0       0         0       0   
3        0     0           0   0             0       0         0       0   
4        0     0           0   0             0       0         0       0   

   accomplish  according  ...  wrong  wrote  yeah  year  yellow  yes  \
0           0          0  ...      0      0     0     0       0    0   
1           0          0  ...      0      0     0     0       0    0   
2           0          0  ...      0      0     0     0       0    0   
3           0          0  ...      0      0     0     0       0    0   
4           0          0  ...      0      0     0     0       0    0   

   yesterday  yet  york  zero  
0          0    0     0     0  
1          0    0     0     0  
2          0  

In [55]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
# Restrict the document frequency: floats are proportions of documents, so terms
# found in more than 70% or in fewer than 1% of the training posts are dropped
tfidf_vect = TfidfVectorizer(max_df=0.7, min_df=0.01)

# Fit and transform the training data (learn vocabulary and IDF weights from it)
tfidf_train = tfidf_vect.fit_transform(X_train) 

# Transform the test data with the vocabulary and weights learned above (no refit)
tfidf_test = tfidf_vect.transform(X_test)

# Display feature names
tfidf_df = pd.DataFrame(tfidf_train.toarray(), columns = tfidf_vect.get_feature_names_out())
# Dense view for inspection; tfidf_train / tfidf_test are what a classifier would consume
print(tfidf_df.head())

   able   ac  acceleration  access  according  across  act  action  active  \
0   0.0  0.0           0.0     0.0        0.0     0.0  0.0     0.0     0.0   
1   0.0  0.0           0.0     0.0        0.0     0.0  0.0     0.0     0.0   
2   0.0  0.0           0.0     0.0        0.0     0.0  0.0     0.0     0.0   
3   0.0  0.0           0.0     0.0        0.0     0.0  0.0     0.0     0.0   
4   0.0  0.0           0.0     0.0        0.0     0.0  0.0     0.0     0.0   

   activity  ...     would  write  writing  written  wrong  wrote  year  yes  \
0       0.0  ...  0.070156    0.0      0.0      0.0    0.0    0.0   0.0  0.0   
1       0.0  ...  0.000000    0.0      0.0      0.0    0.0    0.0   0.0  0.0   
2       0.0  ...  0.000000    0.0      0.0      0.0    0.0    0.0   0.0  0.0   
3       0.0  ...  0.000000    0.0      0.0      0.0    0.0    0.0   0.0  0.0   
4       0.0  ...  0.000000    0.0      0.0      0.0    0.0    0.0   0.0  0.0   

   yet  zero  
0  0.0   0.0  
1  0.0   0.0  
2  0.

## Data Modeling and Evaluation

Models:
    
    Text Classification (Classifying Emails as Spam or Organizing Articles) using Naive Bayes, Support Vector Machines (SVM) and Neural Networks
    
    Sentiment Analysis (Discerns emotions or sentiments with text for Brand Monitoring and Analysing customer feedback) using Logistic Regression, Long Short Term Memory Networks (LSTMs)
    
    Named Entity Recognition is about identifying and categorizing key elements in text such as names of people, places etc. using Conditional Random Fields and transformer based models like BERT
    
    Speech Recognition is the conversion of spoken language into text, traditionally using Hidden Markov Models and now employing deep neural networks such as LSTMs and RNNs
    
    Text generation with LSTMs and GPT
    
    Machine Translation using Statistical Machine translation to more advanced models such as Seq2Seq, GPT


Evaluate our Model's Performance:
    
    Use various metrics including accuracy (the proportion of correct predictions), precision (the proportion of true positive predictions among all positive predictions), recall (the proportion of actual positives that are correctly identified), and F-score (which combines precision and recall)
    
    Adjusting parameters, choosing different algos or revising data cleaning methods

In [59]:
# Use Naive Bayes classifier since it is particularly suited for count vectorized data as it expects integer inputs.
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# fit() returns the estimator itself, so construction and training are chained
nb = MultinomialNB().fit(X_train_counts, y_train)
y_pred = nb.predict(X_test_counts)

# Share of test posts whose predicted newsgroup matches the true one
metrics.accuracy_score(y_test, y_pred)

0.9006085192697769

In [60]:
labels = ['sci.electronics', 'sci.space']

# Rows are the true newsgroups and columns the predicted ones, both in `labels` order
cm = metrics.confusion_matrix(y_test, y_pred, labels=labels)

# Wrap the raw matrix in a DataFrame so rows and columns carry the class names
cm_df = pd.DataFrame(cm, index=labels, columns=labels)

# Print the heading and the table on separate lines
print("Confusion Matrix:", cm_df, sep="\n")

Confusion Matrix:
                 sci.electronics  sci.space
sci.electronics              227          7
sci.space                     42        217
