In [None]:
import os
import numpy as np
import math
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import re  # Import regular expressions library
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from google.colab import drive
drive.mount('/content/drive')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Shared NLP resources, created once and reused by the text-cleaning functions.
# The stopword list is held in a set so membership tests are cheap.
STOPWORDS = set(stopwords.words('english'))
LEMMATIZER = WordNetLemmatizer()

In [None]:
def load_text_files(folder_path):
    """Read every ``.txt`` file found directly inside a folder.

    Files are visited in sorted filename order so that document indices are
    the same on every run and machine (``os.listdir`` itself returns entries
    in arbitrary order).

    Args:
        folder_path: Directory to scan (sub-directories are not traversed).

    Returns:
        A ``(filenames, data)`` pair of parallel lists: ``data[i]`` is the
        complete UTF-8 text of the file named ``filenames[i]``.
    """
    filenames = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )

    data = []
    for filename in filenames:
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            data.append(file.read())

    return filenames, data

In [None]:
def clean_text(text):
    """Normalise raw text into a list of lemmatized, stopword-free tokens.

    Steps, in order: lowercase, drop every character that is not an ASCII
    letter, digit or whitespace, drop digits, tokenize, discard tokens found in
    STOPWORDS, and lemmatize what remains.

    Punctuation is stripped before tokenization, so contractions lose their
    apostrophe (e.g. "doesn't" becomes "doesnt").
    """
    lowered = text.lower()

    # First pass removes punctuation/symbols, second pass removes the digits
    # that the first pass deliberately kept.
    without_symbols = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    letters_only = re.sub(r"\d+", "", without_symbols)

    lemmas = []
    for token in word_tokenize(letters_only):
        if token in STOPWORDS:
            continue
        lemmas.append(LEMMATIZER.lemmatize(token))

    return lemmas


In [None]:
def clean_text_queries(text):
    """Normalise a query string into lemmatized tokens, keeping stopwords.

    Steps, in order: lowercase, drop every character that is not an ASCII
    letter, digit or whitespace, drop digits, tokenize, and lemmatize every
    token. No stopword filtering is applied.
    """
    lowered = text.lower()

    # Strip punctuation/symbols first, then the digits that survived that pass.
    without_symbols = re.sub(r"[^a-zA-Z0-9\s]", "", lowered)
    letters_only = re.sub(r"\d+", "", without_symbols)

    raw_tokens = word_tokenize(letters_only)

    # Every token is kept; only its form is reduced to the lemma.
    return list(map(LEMMATIZER.lemmatize, raw_tokens))


In [None]:
# Folder on the mounted Google Drive that holds the .txt documents.
# NOTE(review): 'dataste' may be a typo for 'dataset' — confirm the folder name on Drive.
folder_path = '/content/drive/MyDrive/dataste'

# Load the raw documents, then turn each one into a list of cleaned tokens.
filenames, documents = load_text_files(folder_path)
tokenized_docs = list(map(clean_text, documents))
print('tokenized_docs', tokenized_docs)

tokenized_docs [['sonysupport', 'sony', 'bravia', 'tv', 'keep', 'shutting', 'minute', 'use', 'power', 'indicator', 'flash', 'red', 'ive', 'tried', 'resetting', 'still', 'doesnt', 'work', 'doesnt', 'sound', 'right', 'please', 'dm', 'u', 'tv', 'model', 'number', 'well', 'help', 'troubleshoot', 'issue', 'see', 'repair', 'needed', 'np', 'httpstcotvsupport'], ['samsungsupport', 'galaxy', 'keep', 'restarting', 'randomly', 'throughout', 'day', 'even', 'im', 'anything', 'intensive', 'already', 'tried', 'factory', 'reset', 'didnt', 'help', 'definitely', 'doesnt', 'sound', 'normal', 'please', 'dm', 'u', 'phone', 'model', 'number', 'software', 'version', 'recent', 'change', 'made', 'device', 'well', 'assist', 'troubleshooting', 'ck', 'httpstcodevicehelp'], ['deltahelp', 'arrived', 'destination', 'checked', 'luggage', 'didnt', 'make', 'airline', 'tracker', 'say', 'still', 'departure', 'airport', 'need', 'whats', 'next', 'step', 'really', 'sorry', 'hear', 'luggage', 'issue', 'please', 'dm', 'u', 'b

In [None]:
# Vocabulary: every distinct token that occurs in the tokenized documents,
# sorted alphabetically so that vector positions are stable between runs.
vocab = sorted({word for doc in tokenized_docs for word in doc})
print("Vocabulary:", vocab)


Vocabulary: ['account', 'additional', 'address', 'advice', 'airline', 'airport', 'already', 'amazonhelp', 'anything', 'apologize', 'app', 'applesupport', 'arrived', 'assist', 'assistance', 'available', 'away', 'bag', 'barely', 'battery', 'booking', 'bravia', 'cant', 'case', 'change', 'charged', 'check', 'checked', 'ck', 'claim', 'cleaning', 'code', 'connection', 'console', 'controller', 'crash', 'currently', 'customer', 'day', 'definitely', 'deltahelp', 'departure', 'description', 'destination', 'detail', 'device', 'didnt', 'disconnect', 'dispute', 'dm', 'doesnt', 'else', 'email', 'even', 'every', 'experiencing', 'factory', 'fee', 'fg', 'figure', 'file', 'fix', 'fixed', 'flash', 'flight', 'frustrating', 'galaxy', 'game', 'get', 'getting', 'gmail', 'going', 'googlehelp', 'guide', 'happening', 'happens', 'hear', 'help', 'hour', 'httpstcobatteryissues', 'httpstcocontrollerhelp', 'httpstcodevicehelp', 'httpstcodisputefee', 'httpstcogmailsupport', 'httpstcohelpcenter', 'httpstcointernetsupp

In [None]:
# Plain whitespace tokenizer: no punctuation stripping, stopword removal or
# lemmatization is performed here.
def tokenize(text):
    """Return the lowercase, whitespace-separated tokens of ``text``."""
    lowered = text.lower()
    return lowered.split()

# `queries` is assigned here, ahead of its first use, so that this cell runs on
# a fresh kernel (Restart & Run All) instead of raising NameError. A later cell
# assigns the same list again; that re-assignment is harmless.
queries = [
    "Why is my internet speed slow despite having an unlimited plan on Verizon?",
    "What should I do if I receive the wrong item in my Amazon order, and the return option isn’t available?",
    "How do I fix my iPhone 12 battery draining quickly after the iOS update?",
    "Why does the Spotify app crash when playing songs on my playlist?",
]

# Tokenize with the same cleaning pipeline that produced `vocab`. A plain
# lower().split() keeps punctuation attached and skips lemmatization
# ("verizon?" vs "verizon", "keeps" vs "keep"), so such tokens would never
# match a vocabulary term and would silently get a TF of zero.
tokenized_docs = [clean_text(doc) for doc in documents]
# Stopwords left in a query are not vocabulary terms; they only scale the
# query vector uniformly, which does not change cosine similarity.
tokenized_queries = [clean_text_queries(query) for query in queries]

In [None]:
# Function to calculate term frequency (TF)
def term_frequency(term, document):
  """Return the relative frequency of ``term`` within ``document``.

  Args:
    term: Token to count.
    document: Sequence of tokens (e.g. a list of strings).

  Returns:
    ``document.count(term) / len(document)``; ``0.0`` for an empty document
    (for example one whose tokens were all removed during cleaning), rather
    than raising ``ZeroDivisionError``.
  """
  total_terms = len(document)
  if total_terms == 0:
    return 0.0
  return document.count(term) / total_terms

In [None]:
# Function to calculate inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
  """Return ``ln(N / (1 + df))`` for ``term`` over ``all_documents``.

  ``N`` is the number of documents and ``df`` the number of documents that
  contain ``term``. With this smoothing the value is 0 when the term occurs in
  all but one document and negative when it occurs in every document.

  Args:
    term: Token whose rarity is measured.
    all_documents: Sequence of tokenized documents (each a sequence of tokens).

  Returns:
    The IDF as a float; ``0.0`` for an empty collection, where the formula
    would otherwise evaluate ``log(0)`` and raise ``ValueError``.
  """
  num_docs = len(all_documents)
  if num_docs == 0:
    return 0.0
  num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
  return math.log(num_docs / (1 + num_docs_containing_term))

In [None]:
# Build the TF-IDF vector of one tokenized document
def compute_tfidf(document, all_documents, vocab):
  """Return the TF-IDF weights of ``document`` as a NumPy array.

  The array has one entry per term of ``vocab``, in ``vocab`` order. Each entry
  is the term's frequency in ``document`` multiplied by its inverse document
  frequency over ``all_documents``.
  """
  weights = [
      term_frequency(word, document) * inverse_document_frequency(word, all_documents)
      for word in vocab
  ]
  return np.array(weights)

In [None]:
# Compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
  """Return the cosine of the angle between ``vec1`` and ``vec2``.

  Args:
    vec1: First vector (array-like of numbers).
    vec2: Second vector, same length as ``vec1``.

  Returns:
    ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has zero
    norm (e.g. a query that shares no term with the vocabulary) the result is
    ``0.0`` instead of ``nan``, so the scores stay sortable and printable.
  """
  norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
  if norm_product == 0:
    # 0/0 would yield nan plus a RuntimeWarning; a vector with no weight is
    # treated as not similar to anything.
    return 0.0
  return np.dot(vec1, vec2) / norm_product

In [None]:
# Free-text customer-support questions used as the search queries.
# NOTE(review): the previous header said "logical operators", but none of these
# strings contains boolean operators — they are plain natural-language questions.
queries = [
    "Why is my internet speed slow despite having an unlimited plan on Verizon?",
    "What should I do if I receive the wrong item in my Amazon order, and the return option isn’t available?",
    "How do I fix my iPhone 12 battery draining quickly after the iOS update?",
    "Why does the Spotify app crash when playing songs on my playlist?",
]

In [None]:
# Turn every tokenized document and every tokenized query into a TF-IDF vector
# over `vocab`. The IDF part is always computed against the document
# collection, for queries as well.
doc_tfidf_vectors = []
for doc_tokens in tokenized_docs:
    doc_tfidf_vectors.append(compute_tfidf(doc_tokens, tokenized_docs, vocab))

query_tfidf_vectors = []
for query_tokens in tokenized_queries:
    query_tfidf_vectors.append(compute_tfidf(query_tokens, tokenized_docs, vocab))

In [None]:
# Score every document against every query: one list of similarities per
# query, in document order.
cosine_similarities = [
    [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
    for query_vector in query_tfidf_vectors
]

# For each query, list the documents from most to least similar.
for i in range(len(queries)):
    query = queries[i]
    scores = cosine_similarities[i]
    print(f"\nCosine similarities for query '{query}':")

    # Rank document positions by score; ties keep their original order.
    ranked_positions = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)

    for doc_idx in ranked_positions:
        similarity = scores[doc_idx]
        # Documents are reported 1-based.
        print(f"Document {doc_idx + 1}: {similarity:.4f}")




Cosine similarities for query 'Why is my internet speed slow despite having an unlimited plan on Verizon?':
Document 6: 0.3957
Document 5: 0.0540
Document 1: 0.0000
Document 2: 0.0000
Document 3: 0.0000
Document 4: 0.0000
Document 7: 0.0000
Document 8: 0.0000
Document 9: 0.0000
Document 10: 0.0000

Cosine similarities for query 'What should I do if I receive the wrong item in my Amazon order, and the return option isn’t available?':
Document 9: 0.4219
Document 1: 0.0000
Document 2: 0.0000
Document 3: 0.0000
Document 4: 0.0000
Document 5: 0.0000
Document 6: 0.0000
Document 7: 0.0000
Document 8: 0.0000
Document 10: 0.0000

Cosine similarities for query 'How do I fix my iPhone 12 battery draining quickly after the iOS update?':
Document 8: 0.3617
Document 9: 0.1218
Document 5: 0.1142
Document 1: 0.0000
Document 2: 0.0000
Document 3: 0.0000
Document 4: 0.0000
Document 6: 0.0000
Document 7: 0.0000
Document 10: 0.0000

Cosine similarities for query 'Why does the Spotify app crash when playi