In [None]:
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:
# Sample passage used by the cleaning, tokenization and stop-word cells that follow.
text = """Books have always been my favorite escape, offering a window into worlds far beyond my own. Whether it's the gripping plot twists of a mystery
novel or the rich, immersive landscapes of fantasy, each book feels like a new adventure waiting to unfold. I especially enjoy how stories can evoke such
deep emotions—joy, sadness, excitement—just through the power of words. Reading also fuels my curiosity and teaches me new perspectives, allowing me to 
understand cultures, ideas, and people I may never meet in real life. There’s something deeply comforting about the quiet companionship of a good book. 
It's like having a universe tucked between two covers, ready to come alive whenever I turn the page."""

# Convert to lowercase
text_lower = text.lower()

# En/em dashes join words without surrounding spaces ("emotions—joy"), so simply
# deleting them would fuse the neighbours into one bogus token ("emotionsjoy").
# Turn them into spaces first, then strip the remaining punctuation.
text_clean = re.sub(r'[\u2013\u2014]', ' ', text_lower)
text_clean = re.sub(r'[^\w\s]', '', text_clean)

print(text_clean)

In [None]:
# Sentence splitting runs on the raw text (it still has its punctuation);
# word tokenization runs on the lowercased, punctuation-free text.
sentences = sent_tokenize(text)
words = word_tokenize(text_clean)

print("Sentences:", sentences)
print("Words:", words)

In [None]:
# Tokenize the cleaned text two ways so the results can be compared:
# plain whitespace splitting vs. NLTK's word_tokenize()
split_words = text_clean.split()
tokenized_words = word_tokenize(text_clean)

print("Split words:", split_words)
print("Tokenized words:", tokenized_words)

In [None]:
# English stop-word list, held as a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Keep only the NLTK tokens that are not stop words
filtered_words = [token for token in tokenized_words if token not in stop_words]
print("Filtered Words:", filtered_words)

In [None]:
# Count how often each remaining (non stop-word) token occurs
fdist = FreqDist(filtered_words)

# Plot the distribution, limited to 20 samples
fdist.plot(20, title="Word Frequency (without Stopwords)")

In [None]:
text_lower = text.lower()

# Extract whole words made up solely of ASCII letters from the lowercased raw text
alphabetic_pattern = re.compile(r'\b[a-zA-Z]+\b')
alphabetic_words = alphabetic_pattern.findall(text_lower)
print("Alphabetic Words:", alphabetic_words)

In [None]:
# English stop-word list, held as a set for fast membership checks
stop_words = set(stopwords.words('english'))

# Keep only the alphabetic words that are not stop words
# (this rebinds filtered_words, now derived from alphabetic_words)
filtered_words = [token for token in alphabetic_words if token not in stop_words]
print("Filtered Words:", filtered_words)

In [None]:
porter = PorterStemmer()

# Apply the Porter stemmer to every filtered word
stemmed_words = list(map(porter.stem, filtered_words))
print("Stemmed Words:", stemmed_words)

In [None]:
lemmatizer = WordNetLemmatizer()

# Apply the WordNet lemmatizer (default part of speech) to every filtered word
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words))
print("Lemmatized Words:", lemmatized_words)

In [None]:
# Three short sample documents; they form the corpus that the CountVectorizer
# and TfidfVectorizer cells are fitted on.
texts = [
    "AI Technology Set to Revolutionize Healthcare Diagnostics by 2026.",
    "The new wireless earbuds have amazing sound quality and battery life, but the touch controls can be a bit too sensitive.",
    "Just tried the caramel cold brew from the new café downtown—absolute game changer!"
]

In [None]:
# Fit a bag-of-words model on the corpus and encode every document with it
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(texts)

# Terms learned from the corpus
vocabulary = count_vectorizer.get_feature_names_out()
print("Vocabulary:", vocabulary)

# Dense view of the count matrix
bag_of_words = count_matrix.toarray()
print("Bag of Words Matrix:\n", bag_of_words)

In [None]:
# Fit a TF-IDF model on the corpus and encode every document with it
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)

# Term names, kept for the keyword lookup in the next cell
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense view of the TF-IDF weights
tfidf_dense = tfidf_matrix.toarray()
print("TF-IDF Matrix:\n", tfidf_dense)

In [None]:
# Report the three highest-weighted TF-IDF terms of every document
for doc_index, doc_text in enumerate(texts):
    print(f"\nText {doc_index+1}: {doc_text}")

    # This document's TF-IDF weights as a flat array
    weights = tfidf_matrix[doc_index].toarray()[0]

    # argsort is ascending: take the last three positions, then reverse so
    # the strongest term is listed first
    best_first = weights.argsort()[-3:][::-1]

    for term_index in best_first:
        print(f"   {feature_names[term_index]} (Score: {weights[term_index]:.3f})")

In [None]:
# Two sample passages on different topics; they are preprocessed and then
# compared with Jaccard similarity in the cells that follow.
text1 = """Artificial Intelligence refers to the simulation of human intelligence by machines, especially computer systems. It powers applications like
voice assistants, recommendation engines, and autonomous vehicles. AI can analyze massive datasets quickly and make decisions or predictions. Its 
learning capability allows it to improve over time with exposure to more data."""

text2 = """Blockchain is a decentralized and secure technology that stores data in blocks connected in a chain. It enables transparent, tamper-proof 
transactions without the need for intermediaries. Blockchain powers cryptocurrencies like Bitcoin and supports smart contracts."""

def preprocess(text):
    """Lowercase ``text``, strip punctuation, tokenize it and drop English stop words.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Lowercased word tokens in their original order, with
        NLTK's English stop words removed.
    """
    # Load the stop-word list once and keep it as a set, so each membership
    # test is O(1) instead of reloading and scanning the list for every token.
    english_stop_words = set(stopwords.words('english'))

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    tokens = word_tokenize(text)
    return [word for word in tokens if word not in english_stop_words]

# Run both passages through the same preprocessing pipeline
tokens1, tokens2 = (preprocess(document) for document in (text1, text2))

print("Tokens 1:", tokens1)
print("Tokens 2:", tokens2)

In [None]:
# Compare the two passages by their sets of distinct tokens
set1 = set(tokens1)
set2 = set(tokens2)

# Jaccard similarity = |intersection| / |union|
shared_tokens = set1.intersection(set2)
all_tokens = set1.union(set2)
jaccard_sim = len(shared_tokens) / len(all_tokens)
print("Jaccard Similarity:", round(jaccard_sim, 3))

In [None]:
review = """I recently tried the Kindle Paperwhite, and it's easily one of the best e-readers I've used. The display is crisp and glare-free, making 
it perfect for reading even in direct sunlight. Its battery life is impressive—I only need to charge it once every few weeks. I also love the 
adjustable warm light feature, which makes nighttime reading much more comfortable. Overall, it's a lightweight, user-friendly device that’s perfect 
for book lovers on the go."""

# TextBlob's sentiment is a (polarity, subjectivity) pair:
#   polarity     runs from -1 (negative) to 1 (positive)
#   subjectivity runs from 0 (objective) to 1 (subjective)
blob = TextBlob(review)
polarity, subjectivity = blob.sentiment

print(f"Polarity: {polarity}")
print(f"Subjectivity: {subjectivity}")

In [None]:
# Label the review by the sign of its polarity score
sentiment = "Positive" if polarity > 0 else "Negative" if polarity < 0 else "Neutral"

print("Sentiment:", sentiment)

In [None]:
positive_reviews = """I recently tried out the Ember Temperature Control Smart Mug, and it’s honestly a small luxury that makes a big difference."""

# Build the word-cloud image from the review text
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(positive_reviews)

# Show the image on its own, without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Training corpus for the next-word model built in the following cells.
# NOTE(review): the first line of the string ("Sure! Here's a paragraph ...") reads
# like chat-assistant preamble rather than corpus text, and it is tokenized along
# with the rest — confirm whether it should be part of the training data.
paragraph = """Sure! Here's a paragraph of around 100 words that could be used as training data:

Technology has transformed the way we live, work, and communicate. From smartphones that keep us connected around the clock to AI-powered tools that
automate complex tasks, innovation is constantly reshaping our world. One of the most exciting developments is the rise of smart homes, where 
everything from lights to thermostats can be controlled with a voice command. These advancements not only offer convenience but also improve energy 
efficiency and security. As technology continues to evolve, it challenges us to adapt, learn, and imagine new possibilities for the future—making it a 
fascinating area to explore and understand."""

In [None]:
# Learn the vocabulary from the training paragraph
tokenizer = Tokenizer()
tokenizer.fit_on_texts([paragraph])

# One slot per known word plus one extra for index 0
total_words = len(tokenizer.word_index) + 1

# For every period-delimited chunk, emit each prefix of length >= 2:
# the last token of a prefix is the word to predict from the ones before it
sequences = []
for line in paragraph.split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

# Left-pad every prefix to the length of the longest one
max_sequence_len = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')

# Features are all but the last column; the label is the last column
X = sequences[:, :-1]
y = np.array(sequences[:, -1])

In [None]:
# Next-word model: word embedding -> LSTM -> softmax over the whole vocabulary
model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

# Labels in y are integer word indices, hence the sparse categorical loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit on the padded prefixes (X) and their next-word labels (y). Without this
# step the model keeps its random initial weights, so the text generated by the
# prediction cell would be meaningless.
EPOCHS = 100
model.fit(X, y, epochs=EPOCHS, verbose=0)

In [None]:
seed_text = "Artificial intelligence"
next_words = 5

# NOTE(review): neither seed word occurs in `paragraph`, and the tokenizer is
# created without an oov_token, so the seed presumably encodes to an empty
# sequence — confirm this is the intended starting context.

# Invert the vocabulary once so each prediction is a dictionary lookup rather
# than a scan over every (word, index) pair.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

for _ in range(next_words):
    # Encode the text generated so far and left-pad it to the model's input length
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)

    # Greedy decoding: take the most probable index of the single batch row
    predicted_word_index = int(np.argmax(predicted, axis=-1)[0])

    output_word = index_to_word.get(predicted_word_index)
    if output_word is None:
        # No word maps to this index (e.g. 0). Appending nothing would only add
        # stray spaces and leave the encoded context unchanged, so stop here.
        break
    seed_text += " " + output_word

print(seed_text)