In [None]:
#step 1
import nltk
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

# Fetch the NLTK data this notebook relies on: the Punkt sentence-tokenizer
# models loaded by word_tokenize() and the stop-word lists.
# Recent NLTK releases (3.9 and later) load Punkt from the 'punkt_tab' package
# instead of the older pickled 'punkt' package, so both are requested; a
# package that is unknown to the installed version is reported, not raised.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

def tokenize(text):
    """Convert raw text into a list of normalised word stems.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Tokens
    that are not purely alphanumeric, or that appear in NLTK's English
    stop-word list, are discarded; each remaining token is reduced to its
    Porter stem. Note that stop-word removal and stemming are part of this
    function even though its name only mentions tokenizing.

    Args:
        text: Document contents as a single string.

    Returns:
        list[str]: Stemmed tokens in document order, duplicates preserved.
    """
    raw_tokens = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stems = []
    for candidate in raw_tokens:
        # Skip punctuation / mixed-character tokens and stop words.
        if not candidate.isalnum() or candidate in english_stops:
            continue
        stems.append(stemmer.stem(candidate))
    return stems

data_dir = 'Data'

# Iterate over the ten corpus files Data/a1.txt ... Data/a10.txt and report
# token statistics for each one.
for i in range(10):
    pathRel = os.path.join(data_dir, f'a{i+1}.txt')

    # Decode explicitly instead of relying on the platform's locale default,
    # which differs between operating systems and can raise or produce
    # mojibake on non-ASCII text.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(pathRel, 'r', encoding='utf-8') as file:
        txtFile = file.read()

    document_tokens = tokenize(txtFile)
    totalTk = len(document_tokens)     # every token, duplicates included
    tks = len(set(document_tokens))    # distinct tokens only

    print(f"total tokens in doc {i+1}: {totalTk}")
    print(f"(unique tokens) in doc {i+1}: {tks}")
    # Three blank lines separate consecutive documents in the output.
    print()
    print()
    print()


In [None]:
#step 2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

def tokenize_and_remove_stopwords(text):
    """Tokenize text and drop stop words, leaving the words unstemmed.

    The text is lower-cased and split with NLTK's ``word_tokenize``; tokens
    that are not purely alphanumeric or that appear in NLTK's English
    stop-word list are removed.

    Stemming is intentionally not applied: this function's name and the
    "after stop-word removal" statistics it feeds describe stop-word removal
    only, and stemming the tokens here would make those counts identical to
    the separate post-stemming counts.

    Args:
        text: Document contents as a single string.

    Returns:
        list[str]: Lower-cased, alphanumeric, non-stop-word tokens in
        document order, duplicates preserved.
    """
    stop_words = set(stopwords.words('english'))
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

data_dir = 'Data'

# Report per-document token statistics for Data/a1.txt ... Data/a10.txt.
for i in range(10):
    pathRel = os.path.join(data_dir, f'a{i+1}.txt')

    # Decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(pathRel, 'r', encoding='utf-8') as file:
        txtFile = file.read()

    document_tokens = tokenize_and_remove_stopwords(txtFile)
    totalTk = len(document_tokens)     # every remaining token, duplicates included
    tks = len(set(document_tokens))    # distinct remaining tokens

    # "removd" in the first message was a typo for "removal".
    print(f"Total tokens in doc {i+1} after stop-word removal: {totalTk}")
    print(f"# of types unique tokens in doc {i+1} after stop-word removal: {tks}")
    print()


In [None]:
#step 3
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

# Download the WordNet corpus.
# NOTE(review): WordNetLemmatizer is imported above but not used anywhere in
# the visible code (tknFun stems with PorterStemmer), so this download looks
# unnecessary -- confirm before removing.
nltk.download('wordnet')

def tknFun(text):
    """Return the Porter stems of the content words in ``text``.

    Processing steps, in order: lower-case the text, split it with NLTK's
    ``word_tokenize``, keep only purely alphanumeric tokens that are not in
    NLTK's English stop-word list, and stem each kept token.

    Args:
        text: Document contents as a single string.

    Returns:
        list[str]: Stemmed tokens in document order, duplicates preserved.
    """
    words = word_tokenize(text.lower())
    ignored = set(stopwords.words('english'))
    porter = PorterStemmer()
    return [porter.stem(w) for w in words if w.isalnum() and w not in ignored]
data_dir = 'Data'

# Union of the stemmed terms seen in any document (the corpus vocabulary).
corpDict = set()

for i in range(10):
    pathRel = os.path.join(data_dir, f'a{i+1}.txt')

    # Decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(pathRel, 'r', encoding='utf-8') as file:
        txtFile = file.read()

    document_tokens = tknFun(txtFile)

    # Fold this document's terms into the corpus-wide vocabulary.
    corpDict.update(document_tokens)
    num_terms = len(set(document_tokens))   # distinct terms in this document

    print(f"Number of terms in document {i+1} after stemming: {num_terms}")
    print()

total_vocabulary_size = len(corpDict)
print(f"Total vocabulary size for the entire corpus after stemming: {total_vocabulary_size}")


In [None]:
#step 4
from sklearn.feature_extraction.text import TfidfVectorizer
data_dir = 'Data'

# Raw text of Data/a1.txt ... Data/a10.txt, one string per document.
corpus = []
for i in range(10):
    pathRel = os.path.join(data_dir, f'a{i+1}.txt')

    # Decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(pathRel, 'r', encoding='utf-8') as file:
        txtFile = file.read()

    corpus.append(txtFile)

# TfidfVectorizer is used with its default settings on the raw text; the
# custom tokenizing/stemming functions defined earlier are not applied here.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

feature_names = vectorizer.get_feature_names_out()
# Label rows from the documents actually loaded so the index always matches
# the number of rows in the matrix.
document_names = [f'Document {n}' for n in range(1, len(corpus) + 1)]

import pandas as pd
# Dense documents-by-terms table: one row per document, one column per term.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names, index=document_names)

print("TF-IDF features for each document:")
print(tfidf_df)


In [None]:
#step 5
from sklearn.metrics.pairwise import cosine_similarity

data_dir = 'Data'

# Raw text of Data/a1.txt ... Data/a10.txt, one string per document.
corpus = []
for i in range(10):
    pathRel = os.path.join(data_dir, f'a{i+1}.txt')

    # Decode explicitly instead of relying on the platform's locale default.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(pathRel, 'r', encoding='utf-8') as file:
        txtFile = file.read()

    corpus.append(txtFile)

# Refit TF-IDF on the freshly loaded corpus (default vectorizer settings).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Pairwise document similarity: entry [r, c] compares document r+1 with
# document c+1.
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

print("Cosine Matrix:")
print(cosine_sim_matrix)
