Document 1: "Cats like milk."
Document 2: "Dogs like bones."
Document 3: "Cats and dogs are pets."

In [1]:
import re
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
import numpy as np
import nltk

# Download necessary NLTK resources if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')

# Imports: We import libraries for text processing (re), data handling, and matrix operations.
# Downloads: We download NLTKâ€™s stopwords and wordnet for text processing.

[nltk_data] Downloading package stopwords to C:\Users\FAIZ
[nltk_data]     SIDDIQUI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\FAIZ
[nltk_data]     SIDDIQUI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Step 2: Define rmv_emails_websites Function
# python
# Copy code

def rmv_emails_websites(string, remove_special_chars=False):
    new_str = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", '', string)
    new_str = re.sub(r"https?://\S+|www\.\S+", '', new_str)
    new_str = re.sub(r"\b\S+\.(com|org|net|edu|gov|co|info|biz|io)\b", '', new_str)
    new_str = re.sub(r"[0-9]+", '', new_str)
    
    if remove_special_chars:
        new_str = re.sub(r"[^A-Za-z\s]", '', new_str)
    
    return new_str

# This function removes email addresses, URLs, and numbers from text. In our small example corpus,
# there are no such elements, so this step would simply return the original text.

In [3]:
# Step 3: Define Preprocessing Function

lemmatizer = WordNetLemmatizer()
stopwords_set = set(stopwords.words('english'))

def preprocess(text):
    text = rmv_emails_websites(text)  # Remove unwanted elements
    text = text.lower()               # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords_set]
    return words

# Lowercase Conversion: "Cats like milk." becomes "cats like milk."
# Remove Punctuation: "cats like milk."
# Remove Stopwords: Removes common words like "and," "are."
# Lemmatization: Reduces words to their base form (e.g., "dogs" to "dog").
# Example Output for each document:

# Document 1: ['cat', 'like', 'milk']
# Document 2: ['dog', 'like', 'bone']
# Document 3: ['cat', 'dog', 'pet']

In [4]:
# Step 4: Create Document-Term Matrix (DTM)
corpus = ["Cats like milk.", "Dogs like bones.", "Cats and dogs are pets."]  # Sample corpus
preprocessed_corpus = [preprocess(doc) for doc in corpus]

# After preprocessing, preprocessed_corpus becomes:
# [['cat', 'like', 'milk'], ['dog', 'like', 'bone'], ['cat', 'dog', 'pet']]


In [5]:
# Step 5: Build Vocabulary and DTM



vocab = sorted(set(word for doc in preprocessed_corpus for word in doc))
vocab_to_index = {word: idx for idx, word in enumerate(vocab)}

# Build the DTM
dtm = [[0] * len(vocab) for _ in range(len(preprocessed_corpus))]
for doc_idx, doc in enumerate(preprocessed_corpus):
    word_counts = Counter(doc)
    for word, count in word_counts.items():
        dtm[doc_idx][vocab_to_index[word]] = count
        
        
# Vocabulary: vocab = ['bone', 'cat', 'dog', 'like', 'milk', 'pet']
# Vocabulary Index Mapping:
# vocab_to_index = {'bone': 0, 'cat': 1, 'dog': 2, 'like': 3, 'milk': 4, 'pet': 5}  
# Document-Term Matrix (DTM):
      
#       dtm = [
#     [0, 1, 0, 1, 1, 0],  # Document 1: "cat like milk"
#     [1, 0, 1, 1, 0, 0],  # Document 2: "dog like bone"
#     [0, 1, 1, 0, 0, 1]   # Document 3: "cat dog pet"
# ]


In [6]:
# Step 6: Perform SVD on DTM
def svd_from_scratch(matrix, num_components):
    M = np.dot(matrix, matrix.T)
    N = np.dot(matrix.T, matrix)
    eigvals_u, U = np.linalg.eigh(M)
    eigvals_v, V = np.linalg.eigh(N)
    
    idx_u = np.argsort(eigvals_u)[::-1][:num_components]
    idx_v = np.argsort(eigvals_v)[::-1][:num_components]
    
    U = U[:, idx_u]
    V = V[:, idx_v]
    singular_values = np.sqrt(eigvals_u[idx_u])
    
    return U, singular_values, V.T

num_components = 2
U, S, Vt = svd_from_scratch(dtm, num_components)



# SVD Calculation: Decomposes the DTM to identify U, S, and Vt, with num_components = 2 as the number of latent topics.
# Example Output:
# U: Document-topic relationships.
# S: Singular values indicating the importance of each topic.
# Vt: Term-topic relationships.


AttributeError: 'list' object has no attribute 'T'

In [None]:
# Step 7: Normalize Document-Topic and Topic-Word Matrices

document_topic_matrix = U * S[:num_components]
topic_word_matrix = Vt[:num_components, :]

document_topic_probs = document_topic_matrix / document_topic_matrix.sum(axis=1, keepdims=True)
topic_word_probs = topic_word_matrix / topic_word_matrix.sum(axis=1, keepdims=True)

print("Document-Topic Probabilities:\n", document_topic_probs)
print("Topic-Word Probabilities:\n", topic_word_probs)



# Normalize: Converts document_topic_matrix and topic_word_matrix into probabilities, 
# making it easier to interpret the relevance of each document to each topic
# and each term to each topic.