# 1. Vectorization and Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Two short example sentences used to build the vocabulary.
texts = ["I love deep learning", "Deep learning is amazing"]

# Build the tokenizer and let it learn a word -> integer index from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
print("Word Index:", tokenizer.word_index)

# Replace every word of every sentence with its integer id.
sequences = tokenizer.texts_to_sequences(texts)
print("Tokenized & Vectorized:", sequences)


Word Index: {'deep': 1, 'learning': 2, 'i': 3, 'love': 4, 'is': 5, 'amazing': 6}
Tokenized & Vectorized: [[3, 4, 1, 2], [1, 2, 5, 6]]


In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Example corpus
texts = ["I love deep learning", "Deep learning is amazing"]

# Learn the vocabulary from the corpus and show the resulting word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
print("Word Index:", tokenizer.word_index)

# Map each sentence to its list of word ids.
sequences = tokenizer.texts_to_sequences(texts)
print("Tokenized & Vectorized:", sequences)

# Bring all sequences to one common length so they form a uniform input shape.
# Both example sentences already contain four tokens, so the recorded output
# shows no padding values.
padded = pad_sequences(sequences)
print("Padded Sequences:\n", padded)


Word Index: {'deep': 1, 'learning': 2, 'i': 3, 'love': 4, 'is': 5, 'amazing': 6}
Tokenized & Vectorized: [[3, 4, 1, 2], [1, 2, 5, 6]]
Padded Sequences:
 [[3 4 1 2]
 [1 2 5 6]]


In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer, one_hot

# Example corpus
texts = ["I love deep learning", "Deep learning is amazing"]

# --- Variant 1: keras' one_hot helper -------------------------------------
# Each word is turned into an integer id; vocab_size should be larger than
# the number of unique words. Distinct words can still share an id: in the
# recorded output below, "love" and "is" are both encoded as 32.
vocab_size = 50
encoded_docs = list(map(lambda doc: one_hot(doc, vocab_size), texts))
print("One Hot Encoded Docs:", encoded_docs)

# --- Variant 2: Tokenizer --------------------------------------------------
# The tokenizer learns its own word index, giving every word a distinct id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
print("Word Index (Tokenizer):", tokenizer.word_index)

sequences = tokenizer.texts_to_sequences(texts)
print("Tokenized & Vectorized (Tokenizer):", sequences)


One Hot Encoded Docs: [[1, 32, 19, 41], [19, 41, 32, 40]]
Word Index (Tokenizer): {'deep': 1, 'learning': 2, 'i': 3, 'love': 4, 'is': 5, 'amazing': 6}
Tokenized & Vectorized (Tokenizer): [[3, 4, 1, 2], [1, 2, 5, 6]]


# 2. Sentiment analysis

In [None]:
from textblob import TextBlob

# Sample sentences to score
text = [
    "I like action movies",
    "I like action movies very much",
    "I love this product, it's amazing!",
    " dont known why I feel uncomforatable",
    "I watched a movie last night when I was sleepy mood. ",
    "I feel headache and getting angry soon. ",
    "I feel irritating, leave me alone",
    "Leave me alone else I wil shout on you",
    "Leave me alone else I wil shout on you due to angry.",
]

for sentence in text:
    # Polarity ranges from -1 (negative) to 1 (positive).
    polarity = TextBlob(sentence).sentiment.polarity
    print("Sentiment score : ", polarity, end="")

    # Pick the label from the sign of the polarity, then finish the line.
    if polarity > 0:
        label = " ; Positive sentiment : "
    elif polarity < 0:
        label = " ; Negative sentiment : "
    else:
        label = " ; Neutral sentiment : "
    print(label, sentence)


Sentiment score :  0.1 ; Positive sentiment :  I like action movies
Sentiment score :  0.18 ; Positive sentiment :  I like action movies very much
Sentiment score :  0.625 ; Positive sentiment :  I love this product, it's amazing!
Sentiment score :  0.0 ; Neutral sentiment :   dont known why I feel uncomforatable
Sentiment score :  0.0 ; Neutral sentiment :  I watched a movie last night when I was sleepy mood. 
Sentiment score :  -0.5 ; Negative sentiment :  I feel headache and getting angry soon. 
Sentiment score :  -0.4 ; Negative sentiment :  I feel irritating, leave me alone
Sentiment score :  0.0 ; Neutral sentiment :  Leave me alone else I wil shout on you
Sentiment score :  -0.3125 ; Negative sentiment :  Leave me alone else I wil shout on you due to angry.


In [3]:
"""from pattern.en import sentiment, lexicon

# See number of words in the lexicon
print(len(lexicon))  # Output: ~2900 words

# Print a few words and their polarity
for word in list(lexicon.keys())[:10]:
    print(word, sentiment(word))
"""

'from pattern.en import sentiment, lexicon\n\n# See number of words in the lexicon\nprint(len(lexicon))  # Output: ~2900 words\n\n# Print a few words and their polarity\nfor word in list(lexicon.keys())[:10]:\n    print(word, sentiment(word))\n'

# 3. Text classification

In [4]:
"""
After performing Sentiment Analysis, if you use the labeled data (like positive, negative, neutral) to train a 
machine learning model that can predict the sentiment of new, unseen text, you're performing: ✅ Text Classification.
"""

"\nAfter performing Sentiment Analysis, if you use the labeled data (like positive, negative, neutral) to train a \nmachine learning model that can predict the sentiment of new, unseen text, you're performing: ✅ Text Classification.\n"

In [None]:
from textblob import TextBlob

# `data` maps each sample sentence to the sentiment label derived from its
# TextBlob polarity; the classifier cell that follows reads it.
data = {}
text = [
    "I like action movies",
    "I love this product, it's amazing!",
    "I watched a movie last night when I was sleepy mood. ",
    "I feel headache and getting angry soon. ",
    "Leave me alone else I wil shout on you",
    "Leave me alone else I wil shout on you due to angry.",
]
for sentence in text:
    # Polarity ranges from -1 (negative) to 1 (positive).
    polarity = TextBlob(sentence).sentiment.polarity
    print("Sentiment score : ", polarity, end="")

    # The sign of the polarity decides the label.
    if polarity > 0:
        label = "Positive sentiment"
    elif polarity < 0:
        label = "Negative sentiment"
    else:
        label = "Neutral sentiment"

    print(f" ; {label} : ", sentence)
    data[sentence] = label
data

Sentiment score :  0.1 ; Positive sentiment :  I like action movies
Sentiment score :  0.625 ; Positive sentiment :  I love this product, it's amazing!
Sentiment score :  0.0 ; Neutral sentiment :  I watched a movie last night when I was sleepy mood. 
Sentiment score :  -0.5 ; Negative sentiment :  I feel headache and getting angry soon. 
Sentiment score :  0.0 ; Neutral sentiment :  Leave me alone else I wil shout on you
Sentiment score :  -0.3125 ; Negative sentiment :  Leave me alone else I wil shout on you due to angry.


{'I like action movies': 'Positive sentiment',
 "I love this product, it's amazing!": 'Positive sentiment',
 'I watched a movie last night when I was sleepy mood. ': 'Neutral sentiment',
 'I feel headache and getting angry soon. ': 'Negative sentiment',
 'Leave me alone else I wil shout on you': 'Neutral sentiment',
 'Leave me alone else I wil shout on you due to angry.': 'Negative sentiment'}

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Training set: the sentences stored in `data` (built by the labelling cell)
# and the sentiment label attached to each of them.
texts, labels = list(data), list(data.values())

# Turn the sentences into word-count vectors.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)

# Create the classifier and train it on the counts.
model = MultinomialNB()
model.fit(X, labels)

# Classify unseen sentences, vectorised with the vocabulary fitted above.
# NOTE(review): neither "terrible" nor "service" occurs in the training
# sentences, so the second prediction is not backed by any word evidence.
test = ["It was amazing", "Terrible service"]
X_test = vectorizer.transform(test)
predictions = model.predict(X_test)
for sentence, predicted in zip(test, predictions):
    print(sentence, " : ", predicted)


It was amazing  :  Positive sentiment
Terrible service  :  Negative sentiment


# 4. Machine Translation

In [7]:
from transformers import MarianMTModel, MarianTokenizer
import warnings
# Silences all warnings for the rest of the session (kept from the original cell).
warnings.filterwarnings("ignore")

# English to Hindi model
model_name = 'Helsinki-NLP/opus-mt-en-hi'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Translate
text = "How are you? Are u fine ?"
# Call the tokenizer directly instead of the deprecated
# `prepare_seq2seq_batch` helper; `padding=True` pads the batch to its
# longest entry so the tensors are rectangular.
tokens = tokenizer([text], return_tensors="pt", padding=True)
translation = model.generate(**tokens)
# Decode the first (and only) generated sequence, dropping special tokens.
translated_text = tokenizer.decode(translation[0], skip_special_tokens=True)

print(translated_text)  # Recorded run printed: आप कैसे हैं?


आप कैसे हैं?


# 6. Named Entity Recognition (NER)

In [None]:
import spacy

# English pipeline used for entity recognition
nlp = spacy.load("en_core_web_sm")

text = "Google was founded by Larry Page and Sergey Brin at Stanford University in 1998."

# Run the pipeline over the sentence.
doc = nlp(text)

# One line per detected entity: its surface text and its label.
for entity in doc.ents:
    print(entity.text, "=>", entity.label_)


Google => ORG
Larry Page => PERSON
Sergey Brin => PERSON
Stanford University => ORG
1998 => DATE


# 7. Text Summarization

In [None]:
from transformers import pipeline

# Build a summarization pipeline. Only the task name is given, so the
# library decides which checkpoint to load.
# NOTE(review): consider naming a model explicitly for reproducible output.
summarizer = pipeline("summarization")

# Passage to condense
text = """Python is a high-level, interpreted programming language known for its simplicity and readability. 
It supports multiple programming paradigms and has a large standard library. 
Python is widely used in data science, machine learning, web development, and automation."""

# Summarise with sampling disabled and the length limits passed below.
summary = summarizer(text, max_length=50, min_length=25, do_sample=False)
first_result = summary[0]
print(first_result['summary_text'])


In [None]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

# Passage to condense
text = """Python is a high-level, interpreted programming language known for its simplicity and readability. 
It supports multiple programming paradigms and has a large standard library. 
Python is widely used in data science, machine learning, web development, and automation."""

# Parse the raw string into a sumy document using the English tokenizer.
parser = PlaintextParser.from_string(text, Tokenizer("english"))

# Ask the LSA summarizer for two sentences.
summarizer = LsaSummarizer()
summary = summarizer(parser.document, 2)

# Print the selected sentences one per line.
for sentence in summary:
    print(sentence)


Python is a high-level, interpreted programming language known for its simplicity and readability.
Python is widely used in data science, machine learning, web development, and automation.


# 8. Speech Recognition

In [3]:
import speech_recognition as sr

recognizer = sr.Recognizer()

# Record one utterance from the microphone, then hand the audio to
# Google's web recognizer while the microphone context is still open.
with sr.Microphone() as source:
    print("🎤 Speak something...")
    audio = recognizer.listen(source)

    try:
        text = recognizer.recognize_google(audio)
    except sr.UnknownValueError:
        # The audio could not be turned into text.
        print("❌ Could not understand audio")
    except sr.RequestError:
        # The recognition request itself failed.
        print("⚠️ Could not request results; check your internet")
    else:
        print("📝 You said:", text)


🎤 Speak something...
📝 You said: yaar Aruna is there


# 9. Autocorrect and Spell Check

In [9]:
from textblob import TextBlob

# Sentences containing deliberate spelling mistakes
texts = [
    "Ths is smple txt for autocorect",
    "I hav a dreem",
    "Pythn is a grate progrmming langauge"
]

# Correct each sentence and show it next to the original. The recorded
# output shows the correction is imperfect (e.g. "smple" -> "smile",
# "Pythn" -> "Myth").
for sentence in texts:
    corrected = TextBlob(sentence).correct()

    print("Before:", sentence)
    print("After :", corrected)
    print()


Before: Ths is smple txt for autocorect
After : The is smile txt for autocorect

Before: I hav a dreem
After : I had a dream

Before: Pythn is a grate progrmming langauge
After : Myth is a grate programming language



In [11]:
from autocorrect import Speller

# English spell checker; calling it on a string returns the corrected string.
spell = Speller(lang='en')

# Sentences containing deliberate spelling mistakes
texts = [
    "Ths is smple txt for autocorect",
    "I hav a dreem",
    "Pythn is a grate progrmming langauge"
]

# Pair every sentence with its corrected form and print both.
for original, fixed in zip(texts, map(spell, texts)):
    print("Before:", original)
    print("After :", fixed)
    print()


Before: Ths is smple txt for autocorect
After : The is simple txt for autocorect

Before: I hav a dreem
After : I hav a dream

Before: Pythn is a grate progrmming langauge
After : Python is a rate programming language



# 10. Information Retrieval (Search Engines)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Small document collection to search
docs = [
    "Python is great for data science",
    "Machine learning is a part of artificial intelligence",
    "Data science uses Python and R",
    "Cooking recipes with lots of ingredients"
]

# What the user is looking for
query = "Which language is best for data science?"

# Vectorise the documents and the query together so they share one
# vocabulary; the query ends up as the last row of the matrix.
documents = [*docs, query]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Similarity of the query row against every document row.
query_row, doc_rows = tfidf_matrix[-1], tfidf_matrix[:-1]
cosine_sim = cosine_similarity(query_row, doc_rows)

# Print the documents from most to least similar.
scores = cosine_sim[0]
for idx in scores.argsort()[::-1]:
    print(f"Score: {scores[idx]:.2f} | Doc: {docs[idx]}")


Score: 0.47 | Doc: Python is great for data science
Score: 0.21 | Doc: Data science uses Python and R
Score: 0.08 | Doc: Machine learning is a part of artificial intelligence
Score: 0.00 | Doc: Cooking recipes with lots of ingredients


In [16]:
from googlesearch import search

# Search phrase
query = "Best Python libraries for data science"

# Request the top five results. The num/stop/pause keywords are the ones
# accepted by the `googlesearch` module installed for this notebook.
results = search(query, num=5, stop=5, pause=2)

# Print each returned link on its own line.
for link in results:
    print(link)


https://www.datacamp.com/blog/top-python-libraries-for-data-science
https://www.datacamp.com/blog/top-python-libraries-for-data-science#introduction-pytho
https://www.datacamp.com/blog/top-python-libraries-for-data-science#6.-scikit-learn-thete
https://www.datacamp.com/blog/top-python-libraries-for-data-science#11.-rapids.ai-cudf-and-cuml-thera
https://www.datacamp.com/blog/top-python-libraries-for-data-science#15.-tpot-tpoti


In [1]:
from googlesearch import search

# Shorter variant of the search: only `stop` is passed, everything else
# is left at the library's defaults.
query = "Best Python libraries for data science"
results = search(query, stop=5)

for url in results:
    print(url)

https://www.datacamp.com/blog/top-python-libraries-for-data-science
https://www.datacamp.com/blog/top-python-libraries-for-data-science#introduction-pytho
https://www.datacamp.com/blog/top-python-libraries-for-data-science#6.-scikit-learn-thete
https://www.datacamp.com/blog/top-python-libraries-for-data-science#11.-rapids.ai-cudf-and-cuml-thera
https://www.datacamp.com/blog/top-python-libraries-for-data-science#15.-tpot-tpoti


# 11. Language Detection

In [4]:
from langdetect import DetectorFactory, detect, detect_langs

# langdetect's algorithm is non-deterministic by default, so short or
# ambiguous texts can get different answers between runs. Fixing the seed
# before the first detection makes the output reproducible.
DetectorFactory.seed = 0

# Sample texts, one per language
texts = [
    "This is an English sentence.",
    "C'est une phrase française.",
    "यह एक हिंदी वाक्य है।",
    "Dies ist ein deutscher Satz."
]

# Print each text together with the language code detected for it.
for text in texts:
    lang = detect(text)
    print(f"Text: {text}\nDetected Language: {lang}\n")


Text: This is an English sentence.
Detected Language: en

Text: C'est une phrase française.
Detected Language: fr

Text: यह एक हिंदी वाक्य है।
Detected Language: hi

Text: Dies ist ein deutscher Satz.
Detected Language: de



# 12. Keyword Extraction

In [2]:
"""from rake_nltk import Rake

# Initialize RAKE
rake = Rake()

# Text input
text = "Natural Language Processing helps computers understand human language. Keyword extraction is useful for summarizing long documents."

# Extract keywords
rake.extract_keywords_from_text(text)

# Get top keywords
keywords = rake.get_ranked_phrases()
print("Extracted Keywords:")
for kw in keywords:
    print("-", kw)
"""

'from rake_nltk import Rake\n\n# Initialize RAKE\nrake = Rake()\n\n# Text input\ntext = "Natural Language Processing helps computers understand human language. Keyword extraction is useful for summarizing long documents."\n\n# Extract keywords\nrake.extract_keywords_from_text(text)\n\n# Get top keywords\nkeywords = rake.get_ranked_phrases()\nprint("Extracted Keywords:")\nfor kw in keywords:\n    print("-", kw)\n'

In [3]:
from rake_nltk import Rake

# Text to mine for keywords
text = "Natural Language Processing helps computers understand human language. Keyword extraction is useful for summarizing long documents."

# Run RAKE over the text, then fetch the ranked phrases it found.
rake = Rake()
rake.extract_keywords_from_text(text)
keywords = rake.get_ranked_phrases()

# List the phrases in the order RAKE returned them.
print("Extracted Keywords:")
for phrase in keywords:
    print("-", phrase)


Extracted Keywords:
- natural language processing helps computers understand human language
- summarizing long documents
- keyword extraction
- useful


# 13. Topic Modeling

# 14. Text-to-Speech (TTS)

In [None]:
import pyttsx3

# Text-to-speech engine
engine = pyttsx3.init()

# Apply the speech settings: speaking rate and volume (0.0 to 1.0).
for prop, value in (('rate', 150), ('volume', 1)):
    engine.setProperty(prop, value)

# Queue the sentence, then block until it has been spoken.
engine.say("Hello, I am a text to speech system.")
engine.runAndWait()


In [None]:
import speech_recognition as sr
import pyttsx3

# Initialize recognizer and TTS engine
recognizer = sr.Recognizer()
engine = pyttsx3.init()

# Capture speech from microphone; the microphone is released as soon as
# the recording is done, before the recognition request is made.
with sr.Microphone() as source:
    print("Listening...")
    audio = recognizer.listen(source)

# Recognize speech and decide what to say back. Every outcome sets `reply`,
# which is spoken once after the try block.
try:
    text = recognizer.recognize_google(audio)
    print("You said: ", text)
    reply = f"You said: {text}"
except sr.UnknownValueError:
    # The audio could not be turned into text.
    reply = "Sorry, I couldn't understand."
    print(reply)
except sr.RequestError:
    # The recognition request itself failed. This was previously unhandled
    # and crashed the cell, unlike the earlier speech-recognition cell.
    reply = "Could not request results; check your internet"
    print(reply)

# Respond with TTS
engine.say(reply)
engine.runAndWait()


In [None]:
# Relies on `engine` created by one of the pyttsx3 cells above.
voices = engine.getProperty('voices')
for voice in voices:
    print(f"ID: {voice.id}, Name: {voice.name}")

# Switch to the second installed voice when there is one. Which voice that
# is (and whether it is female) depends on the platform, so check the list
# printed above. Guarding the index avoids an IndexError on systems that
# expose fewer than two voices.
if len(voices) > 1:
    engine.setProperty('voice', voices[1].id)
else:
    print("Fewer than two voices installed; keeping the default voice.")


# 15. Question Answering

In [11]:
"""from transformers import pipeline

# Load QA pipeline
qa_pipeline = pipeline("question-answering")

# Input text and question
context = "Natural Language Processing enables machines to understand and interpret human language."
question = "What does NLP enable machines to do?"

# Get answer
result = qa_pipeline(question=question, context=context)
print(result)
"""

'from transformers import pipeline\n\n# Load QA pipeline\nqa_pipeline = pipeline("question-answering")\n\n# Input text and question\ncontext = "Natural Language Processing enables machines to understand and interpret human language."\nquestion = "What does NLP enable machines to do?"\n\n# Get answer\nresult = qa_pipeline(question=question, context=context)\nprint(result)\n'

# 16. Text Similarity / Duplicate Detection

In [5]:
import numpy
def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of two strings' word sets.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: First text.
        str2: Second text.

    Returns:
        A float between 0.0 (no shared words) and 1.0 (identical word
        sets). Two inputs without any words (empty or whitespace-only)
        are treated as identical and score 1.0 instead of raising
        ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union = a | b
    if not union:
        # Both inputs are empty: avoid dividing by zero.
        return 1.0
    return len(a & b) / len(union)


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Two phrasings of the same idea that share only the word "is".
texts = ["NLP is fun", "Natural Language Processing is enjoyable"]

# One TF-IDF row per text.
vec = TfidfVectorizer().fit_transform(texts)

# Compare the first row with the second; the slices keep both operands 2-D.
first_row, second_row = vec[0:1], vec[1:2]
similarity = cosine_similarity(first_row, second_row)
print(similarity)


[[0.15064018]]


In [10]:
"""from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')
emb1 = model.encode("NLP is fun", convert_to_tensor=True)
emb2 = model.encode("Natural Language Processing is enjoyable", convert_to_tensor=True)

similarity = util.pytorch_cos_sim(emb1, emb2)
print(similarity)
"""

'from sentence_transformers import SentenceTransformer, util\n\nmodel = SentenceTransformer(\'all-MiniLM-L6-v2\')\nemb1 = model.encode("NLP is fun", convert_to_tensor=True)\nemb2 = model.encode("Natural Language Processing is enjoyable", convert_to_tensor=True)\n\nsimilarity = util.pytorch_cos_sim(emb1, emb2)\nprint(similarity)\n'

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Questions to check for duplicates; the first and last are identical.
df = pd.DataFrame({
    'text': ["What is NLP?", "Define Natural Language Processing", "Explain AI", "What is NLP?"]
})

# Pairwise cosine similarity between the TF-IDF vectors of all rows.
tfidf = TfidfVectorizer().fit_transform(df['text'])
cos_sim = cosine_similarity(tfidf)

# Visit each unordered pair once (j > i), which also skips the diagonal,
# and report the pairs whose similarity exceeds 0.8.
n_rows = len(df)
for i in range(n_rows):
    for j in range(i + 1, n_rows):
        if not cos_sim[i][j] > 0.8:
            continue
        print(f"Duplicate Pair:\n - {df['text'][i]}\n - {df['text'][j]}\n")


Duplicate Pair:
 - What is NLP?
 - What is NLP?



# 17. Tokenization

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize

text = "Natural Language Processing is fun. Let's explore it!"

# Split the text into word-level tokens and into sentences.
words = word_tokenize(text)
sentences = sent_tokenize(text)

print("Words:", words)
print("Sentences:", sentences)


Words: ['Natural', 'Language', 'Processing', 'is', 'fun', '.', 'Let', "'s", 'explore', 'it', '!']
Sentences: ['Natural Language Processing is fun.', "Let's explore it!"]


# 18. Stemming

In [2]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Recorded stems: "running" -> run, "flies" -> fli
for word in ("running", "flies"):
    print(stemmer.stem(word))


run
fli


# 19. Lemmatization

In [3]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Each word is lemmatized with an explicit part of speech ('v' verb,
# 'a' adjective). Recorded lemmas: "running" -> run, "better" -> good
for word, pos in (("running", 'v'), ("better", 'a')):
    print(lemmatizer.lemmatize(word, pos=pos))


run
good


# 20. Segmentation

In [4]:
from nltk.tokenize import sent_tokenize

# Three sentences ending in "!", "?" and "."
text = "Hello there! How are you doing? NLP is interesting."

# Split the text into its sentences and show the resulting list.
sentences = sent_tokenize(text)
print(sentences)


['Hello there!', 'How are you doing?', 'NLP is interesting.']
