In [2]:
# NLP TECHNIQUES :
  # STEMMING
  # LEMMATIZATION
  # ONE HOT ENCODING
  # BAG OF WORDS
  # TF-IDF (Term Frequency-Inverse Document Frequency)
  # WORD EMBEDDIND :- WORD2VEC :- # CBOW (CONTUNUES BAG OF WORDS)
                                  # SKIP GRAM

In [3]:
 # STEMMING :

# Stemming is a process in Natural Language Processing (NLP) that reduces words to their base or root form, typically by stripping suffixes.
#The goal is to simplify text by unifying different forms of a word (e.g., "running," "ran," "runner") into a common stem (e.g., "run").

#Types of Stemming Algorithms

# 1)Porter Stemmer:

# One of the most popular stemming algorithms, developed by Martin Porter in 1980.
# It uses a set of rules to iteratively reduce words to their root form.


from nltk.stem import PorterStemmer
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem("running"))  # Output: run
print(porter_stemmer.stem("connected"))  # Output: connect

run
connect


In [4]:
# 2)Lancaster Stemmer:

# A more aggressive stemming algorithm compared to the Porter Stemmer, which sometimes results in very short stems.

from nltk.stem import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
print(lancaster_stemmer.stem("running"))  # Output: run
print(lancaster_stemmer.stem("connected"))  # Output: connect
print(lancaster_stemmer.stem("responsiveness"))  # Output: respond

run
connect
respond


In [5]:
# 3)Snowball Stemmer:

# Also known as the "Porter2" stemmer, it is an improvement over the original Porter Stemmer.
# It supports multiple languages and is considered more consistent and efficient.


from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")
print(snowball_stemmer.stem("running"))  # Output: run
print(snowball_stemmer.stem("connected"))  # Output: connect

run
connect


In [6]:

# 4) Regex-based Stemmer:

# A simple approach where regular expressions are used to manually define stemming rules.

import re
def simple_stemmer(word):
    return re.sub(r'(ing|ed|s)$', '', word)

print(simple_stemmer("running"))  # Output: runn
print(simple_stemmer("connected"))  # Output: connect

runn
connect


In [7]:
# 5)Customized Stemmer:

# In some cases, a custom stemmer may be developed to handle specific stemming needs, especially when working with domain-specific texts.


from nltk.stem import PorterStemmer

class CustomStemmer(PorterStemmer):
    def stem(self, word):
        if word.endswith('ness'):
            return word[:-4]
        return super().stem(word)

custom_stemmer = CustomStemmer()
print(custom_stemmer.stem("happiness"))  # Output: happy
print(custom_stemmer.stem("running"))  # Output: run

happi
run


In [8]:
# LEMMATIZATION :Lemmatization is a process in Natural Language Processing (NLP) that reduces words to their base or dictionary form, known as the "lemma."

# Lemmatization is more accurate than stemming because it considers the context of the word and returns a proper word that exists in the language.
# For example, while stemming would reduce "better" to "bet," lemmatization correctly identifies that the lemma for "better" is "good."

# Key Differences Between Lemmatization and Stemming:

  #Stemming: Reduces words to their root by removing suffixes, often leading to non-existent or partial words.

# Example: "running" → "run", "better" → "bet"

# Lemmatization: Reduces words to their dictionary form by considering the word’s meaning and context.

# Example: "running" → "run", "better" → "good"

In [9]:
# Types of Lemmatization with Examples
 
# WordNet Lemmatizer (in NLTK):
 # One of the most commonly used lemmatizers, based on the WordNet lexical database.

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running", pos="v"))  # Output: run
print(lemmatizer.lemmatize("better", pos="a"))  # Output: good
print(lemmatizer.lemmatize("geese", pos="n"))  # Output: goose

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - 'C:\\Users\\Chandra Shekhar/nltk_data'
    - 'C:\\ProgramData\\anaconda3\\nltk_data'
    - 'C:\\ProgramData\\anaconda3\\share\\nltk_data'
    - 'C:\\ProgramData\\anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\Chandra Shekhar\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [10]:
# Custom Lemmatizer:

#Sometimes, a custom lemmatizer is created for specific applications, using predefined rules and dictionaries.


def custom_lemmatizer(word, pos):
    lemma_dict = {
        'children': 'child',
        'mice': 'mouse',
        'feet': 'foot'
    }
    return lemma_dict.get(word, word)

print(custom_lemmatizer("children", pos="n"))  # Output: child
print(custom_lemmatizer("mice", pos="n"))  # Output: mouse

child
mouse


In [11]:
 # Bag of Words (BoW);

#The Bag of Words (BoW) model is a fundamental method used in Natural Language Processing (NLP) to convert text data into numerical representations.
#This approach disregards grammar and word order but considers the frequency or presence of words in a document.
#It is particularly useful for text classification, sentiment analysis, and information retrieval tasks.

#Implementation

#Step 1: Tokenization

import nltk
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = [
    "Cats are beautiful animals.",
    "Dogs are loyal and friendly animals.",
    "Cats and dogs are popular pets."
]

# Tokenize the documents (you may add more preprocessing steps like stemming or stop word removal)
nltk.download('punkt')
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in documents]

# Flatten the list of lists and remove duplicates to create a vocabulary
vocabulary = sorted(set([word for doc in tokenized_docs for word in doc]))

print("Vocabulary:", vocabulary)

[nltk_data] Downloading package punkt to C:\Users\Chandra
[nltk_data]     Shekhar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


Vocabulary: ['.', 'and', 'animals', 'are', 'beautiful', 'cats', 'dogs', 'friendly', 'loyal', 'pets', 'popular']


In [12]:
#Step 2: Building the Vocabulary and Vectorization

from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the documents to create the bag of words model
X = vectorizer.fit_transform(documents)

# Convert the result to an array for better readability
bow_array = X.toarray()

print("Bag of Words Model:\n", bow_array)
print("Feature Names:\n", vectorizer.get_feature_names_out())

Bag of Words Model:
 [[0 1 1 1 1 0 0 0 0 0]
 [1 1 1 0 0 1 1 1 0 0]
 [1 0 1 0 1 1 0 0 1 1]]
Feature Names:
 ['and' 'animals' 'are' 'beautiful' 'cats' 'dogs' 'friendly' 'loyal' 'pets'
 'popular']


In [13]:

# TF-IDF (Term Frequency-Inverse Document Frequency)

# TF-IDF (Term Frequency-Inverse Document Frequency) is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus).
# It is often used in information retrieval and text mining to identify relevant terms in documents.


# Implementation of TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "Brown foxes are quick and they can jump high"
]

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Transform the documents into TF-IDF vectors
tfidf_matrix = vectorizer.fit_transform(documents)

# Get feature names
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense representation
dense_tfidf_matrix = tfidf_matrix.todense()

# Display the TF-IDF matrix
import pandas as pd
df = pd.DataFrame(dense_tfidf_matrix, columns=feature_names)
print(df)

        and       are     brown       can       dog       fox     foxes  \
0  0.000000  0.000000  0.283321  0.000000  0.283321  0.372533  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.343851  0.000000  0.000000   
2  0.359554  0.359554  0.273450  0.359554  0.000000  0.000000  0.359554   

       high      jump     jumps      lazy     never      over     quick  \
0  0.000000  0.000000  0.372533  0.283321  0.000000  0.283321  0.283321   
1  0.000000  0.343851  0.000000  0.343851  0.452123  0.343851  0.000000   
2  0.359554  0.273450  0.000000  0.000000  0.000000  0.000000  0.273450   

    quickly       the      they  
0  0.000000  0.566642  0.000000  
1  0.452123  0.343851  0.000000  
2  0.000000  0.000000  0.359554  


In [14]:
# Word embedding :

# Word embedding is a technique used in Natural Language Processing (NLP) to represent words in a continuous vector space where words with similar meanings are mapped closer together.
# Unlike traditional methods like Bag of Words (BoW) or TF-IDF, word embeddings capture semantic relationships between words.
# These embeddings are usually learned from large corpora of text using neural network models.

# Common Word Embedding Techniques :

# Word2Vec: Developed by Google, it uses two main architectures:

# Continuous Bag of Words (CBOW): Predicts a word based on its context.
# Skip-gram: Predicts the context given a word.


In [16]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


In [18]:
# Implementation in Python

import gensim
from gensim.models import Word2Vec
import nltk

# Sample documents
documents = [
    "Cats are beautiful animals.",
    "Dogs are loyal and friendly animals.",
    "Cats and dogs are popular pets.",
    "I love my dog.",
    "My cat is very playful."
]

# Preprocess the documents: tokenize and lower case
nltk.download('punkt')
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in documents]

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, workers=4)

# Access the vector for a specific word
cat_vector = model.wv['cat']
print("Vector representation for 'cat':\n", cat_vector)

# Find the most similar words to 'cat'
similar_words = model.wv.most_similar('cat')
print("Most similar words to 'cat':\n", similar_words)

ImportError: cannot import name 'triu' from 'scipy.linalg' (C:\ProgramData\anaconda3\Lib\site-packages\scipy\linalg\__init__.py)

In [19]:
# The Word2Vec model can be trained using two different algorithms: Continuous Bag of Words (CBOW) and Skip-gram. Both methods are used to predict context in a different manner.

# CBOW (Continuous Bag of Words): Predicts the target word (center word) from the context words (surrounding words).
# Skip-gram: Predicts the context words from the target word

import nltk
nltk.download('punkt')

# Sample documents
documents = [
    "Cats are beautiful animals.",
    "Dogs are loyal and friendly animals.",
    "Cats and dogs are popular pets.",
    "I love my dog.",
    "My cat is very playful."
]

# Tokenize the documents
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in documents]

[nltk_data] Downloading package punkt to C:\Users\Chandra
[nltk_data]     Shekhar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
#CBOW Implementation
#In gensim, setting the sg parameter to 0 will use the CBOW model.

from gensim.models import Word2Vec

# Train CBOW model
cbow_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, workers=4, sg=0)

# Access the vector for a specific word
cat_vector_cbow = cbow_model.wv['cat']
print("CBOW Vector for 'cat':\n", cat_vector_cbow)

# Find the most similar words to 'cat'
similar_words_cbow = cbow_model.wv.most_similar('cat')
print("Most similar words to 'cat' (CBOW):\n", similar_words_cbow)

ImportError: cannot import name 'triu' from 'scipy.linalg' (C:\ProgramData\anaconda3\Lib\site-packages\scipy\linalg\__init__.py)

In [22]:
#Skip-gram Implementation
#In gensim, setting the sg parameter to 1 will use the Skip-gram model.

# Train Skip-gram model
skipgram_model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Access the vector for a specific word
cat_vector_skipgram = skipgram_model.wv['cat']
print("Skip-gram Vector for 'cat':\n", cat_vector_skipgram)

# Find the most similar words to 'cat'
similar_words_skipgram = skipgram_model.wv.most_similar('cat')
print("Most similar words to 'cat' (Skip-gram):\n", similar_words_skipgram)

NameError: name 'Word2Vec' is not defined