In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
# Fetch the NLTK resources used below: the WordNet corpus backs
# WordNetLemmatizer and the punkt models back word_tokenize.
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\deniz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\deniz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the raw complaints, discard rows that have no complaint text, and
# renumber the surviving rows from 0.
df = (
    pd.read_csv("comcast_consumeraffairs_complaints.csv")
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

In [3]:
def filter_dates(date_string):
    """Tell whether a date string ends in a two-digit year of 09 or later.

    Only the last two characters (after stripping surrounding whitespace)
    are inspected, so the century is ignored: a string ending in "99" also
    passes.

    Parameters
    ----------
    date_string : str
        Date text whose final two characters are the year digits.

    Returns
    -------
    bool
        True when the trailing two characters parse to an integer >= 9.
        False for missing values (NaN/None), non-string input, and strings
        whose tail is not numeric, so one bad row cannot abort the filter.
    """
    # Only the "text" column is null-filtered upstream, so a missing date
    # may arrive here as a float NaN or None.
    if not isinstance(date_string, str):
        return False
    try:
        # Strip first: trailing whitespace would otherwise shift the slice
        # ("2016 "[-2:] is "6 ").
        year = int(date_string.strip()[-2:])
    except ValueError:
        return False
    return year >= 9

# Keep only the rows whose posting date passes filter_dates, then renumber.
posted_recently = df['posted_on'].apply(filter_dates)
df = df.loc[posted_recently].reset_index(drop=True)

In [4]:
file_path = "NLTK's list of english stopwords"

# The file holds one stopword per line. The encoding is given explicitly so
# the result does not depend on the platform default, and blank lines are
# skipped so the list never contains an empty string.
with open(file_path, 'r', encoding='utf-8') as file:
    stopwords = [word for word in (line.strip() for line in file) if word]

In [5]:
# Matrix initialization part

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Set membership is O(1); testing against the list re-scans it for every token.
stopword_set = set(stopwords)


def clean_tokens(text):
    """Tokenize one complaint and return its filtered, normalized tokens.

    A token is kept when its lowercase form is not a stopword, it contains
    no digit, and it is purely alphanumeric (which drops punctuation).
    Kept tokens are lemmatized and then stemmed, in that order.
    """
    kept = [
        token for token in word_tokenize(text)
        if token.lower() not in stopword_set
        and not re.search(r'\d', token)
        and token.isalnum()
    ]
    return [stemmer.stem(lemmatizer.lemmatize(token)) for token in kept]


# One pass per document instead of six separate passes over the column.
df['tokens'] = df['text'].apply(clean_tokens)
df['clean_text'] = df['tokens'].apply(' '.join)

# Vectorize tokenized and cleaned text
vectorizer = CountVectorizer()
term_doc_matrix = vectorizer.fit_transform(df['clean_text'])
term_doc_df = pd.DataFrame(term_doc_matrix.toarray(), columns=vectorizer.get_feature_names_out())
# Transpose so rows are terms and columns are documents.
A_hat = term_doc_df.values.T

# Scale every document column to unit length. A document left with no
# tokens has norm 0; dividing by 1 instead keeps it an all-zero column
# rather than turning it into NaNs that would poison the later SVD.
column_norms = np.linalg.norm(A_hat, axis=0)
column_norms[column_norms == 0] = 1.0
A = A_hat / column_norms # Normalize
print(A.shape)

(8732, 5058)


In [6]:
# Display the count table: one row per document, one column per vocabulary term.
term_doc_df

Unnamed: 0,aaron,aback,abandon,abc,abhor,abhorr,abid,abil,abl,abnorm,...,zenith,zero,zillion,zion,zip,zipcod,zogbi,zombi,zone,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5053,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5054,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5055,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5056,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# SVD IMPLEMENTATION
# Note: A full SVD of the roughly 5200x9000 matrix is too expensive to compute,
# so only the leading k = min(t,d) / 10 singular triplets are approximated directly.
# This took at most 527 seconds on my computer.

import time

def power_iteration(B, num_simulations, rng=None):
    """Estimate the dominant eigenpair of a square matrix by power iteration.

    Parameters
    ----------
    B : ndarray of shape (n, n)
        Matrix that is repeatedly applied to the iterate.
    num_simulations : int
        Number of multiply-and-normalize steps; must be at least 1.
    rng : numpy.random.Generator, optional
        Source of the random start vector. When omitted, the global NumPy
        random state is used, exactly as before this parameter existed.

    Returns
    -------
    b_k : ndarray of shape (n,)
        Unit-norm estimate of the dominant eigenvector.
    eigenvalue : float
        Norm of ``B @ b`` in the last step, i.e. an estimate of the
        magnitude of the dominant eigenvalue. It is 0.0 when B maps the
        iterate to the zero vector.

    Raises
    ------
    ValueError
        If ``num_simulations`` is smaller than 1 (no estimate can be made).
    """
    if num_simulations < 1:
        raise ValueError("num_simulations must be at least 1")

    draw = np.random.normal if rng is None else rng.normal
    b_k = draw(0, 1, B.shape[1])
    # Start from a unit vector so that even the first norm is a meaningful
    # eigenvalue estimate and the early-exit below returns a unit vector.
    b_k = b_k / np.linalg.norm(b_k)

    for _ in range(num_simulations):
        # Iterate with given iteration until vector and value converges
        b_k1 = np.dot(B, b_k)
        eigenvalue = np.linalg.norm(b_k1)
        if eigenvalue == 0:
            # B annihilates the iterate (e.g. a fully deflated matrix);
            # normalizing would be 0/0 and yield NaNs.
            return b_k, 0.0
        b_k = b_k1 / eigenvalue

    return b_k, eigenvalue

def svd_from_scratch(A, num_values, num_iterations):
    """Approximate the leading singular triplets of A.

    Runs power iteration on ``A.T @ A`` and, after each eigenpair is found,
    subtracts its rank-one contribution (deflation) so the next run finds
    the following eigenpair.

    Parameters
    ----------
    A : array_like of shape (m, n)
        Matrix to factor; it is converted to a float array.
    num_values : int
        Number of singular triplets to compute.
    num_iterations : int
        Power-iteration steps spent on each triplet.

    Returns
    -------
    U : ndarray of shape (m, num_values)
        Left singular vectors as columns. A column whose singular value is
        not positive is left as zeros instead of inf/NaN.
    singular_values : ndarray of shape (num_values,)
        Square roots of the eigenvalue estimates, in the order found.
    Vt : ndarray of shape (num_values, n)
        Right singular vectors as rows.
    """
    # Work in floating point: the in-place deflation below cannot write
    # float values into an integer Gram matrix.
    A = np.asarray(A, dtype=float)
    ATA = np.dot(A.T, A)
    vectors = []
    values = []

    for _ in range(num_values):
        eigenvector, eigenvalue = power_iteration(ATA, num_iterations)
        values.append(eigenvalue)
        vectors.append(eigenvector)

        # Subtract to find next eigenvalues and vectors given with number of num_values
        ATA -= eigenvalue * np.outer(eigenvector, eigenvector)

    # Compute the singular values
    singular_values = np.sqrt(np.asarray(values, dtype=float))
    # The explicit reshape keeps V two-dimensional even when num_values is 0.
    V = np.asarray(vectors, dtype=float).reshape(len(vectors), A.shape[1]).T

    # Compute U = A V / sigma column by column, skipping sigma == 0 so that
    # rank-deficient input does not produce inf/NaN columns.
    AV = np.dot(A, V)
    U = np.divide(AV, singular_values, out=np.zeros_like(AV), where=singular_values > 0)

    return U, singular_values, V.T

# Seed the global NumPy random state so the random start vectors drawn
# during power iteration, and therefore the factors below, are reproducible.
np.random.seed(0)

# perf_counter is a monotonic clock intended for measuring intervals.
start_time = time.perf_counter()  # Start timing

# k value = min(t,d) / 10
k = min(A.shape) // 10

# Compute the k leading singular triplets:
# U has k columns, s has k values, Vt has k rows.

U, s, Vt = svd_from_scratch(A, num_values=k, num_iterations=50)
S = np.diag(s)

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print("Singular values:")
print(S.shape)
print("Left singular vectors (U matrix):")
print(U.shape)
print("Right singular vectors (V^T matrix):")
print(Vt.shape)

print("Elapsed time: {:.2f} seconds".format(elapsed_time))

Singular values:
(505, 505)
Left singular vectors (U matrix):
(8732, 505)
Right singular vectors (V^T matrix):
(505, 5058)
Elapsed time: 251.12 seconds


# Error

In [8]:
# Calculate MSE
# Candidate ranks: 10, 30, ... up to k (= min(t,d) / 10)
step_size = 20
my_list = list(range(10, k + 1, step_size))

least_error = float('inf')   # Error variable initialized
k_value = 0    # k variable initialized
# The loop variable is `rank`, not `k`: reusing `k` would overwrite the
# SVD rank that `my_list` is derived from.
for rank in my_list:
    # Rank-`rank` approximation of A from the leading singular triplets
    U_k = U[:, :rank]
    S_k = S[:rank, :rank]
    Vt_k = Vt[:rank, :]
    A_hat = np.dot(np.dot(U_k, S_k), Vt_k)

    # MSE for this rank. Take the minimum error.
    # NOTE(review): for an exact SVD the truncation error cannot grow with
    # the rank, so the largest candidate is expected to win.
    mse = np.mean((A - A_hat) ** 2)
    if mse < least_error:
        k_value = rank
        least_error = mse
print(f"k value = {k_value} , MSE = {least_error}")

k value = 490 , MSE = 1.4292843825709602e-05


In [9]:
# Calculate Frobenius Norm
frobenius_norm = float('inf')
k_value = 0
# The loop variable is `rank`, not `k`, so the SVD rank `k` is not overwritten.
for rank in my_list:
    U_k = U[:, :rank]
    S_k = S[:rank, :rank]
    Vt_k = Vt[:rank, :]
    A_hat = np.dot(np.dot(U_k, S_k), Vt_k)
    # Frobenius norm of the residual: square root of the sum of squared entries
    result = np.linalg.norm(A - A_hat, ord='fro')
    if result < frobenius_norm:
        k_value = rank
        frobenius_norm = result
print(f"k value = {k_value}, Frobenius Norm = {frobenius_norm}")

k value = 490, Frobenius Norm = 25.12497279487233


In [10]:
# Rebuild the low-rank approximation from the first k_value singular triplets.
leading = slice(None, k_value)
Vt_k = Vt[leading, :]
S_k = S[leading, leading]
U_k = U[:, leading]
A_hat = (U_k @ S_k) @ Vt_k

In [11]:
# Invert the truncated singular-value matrix and show the shape of the result;
# np.linalg.inv raises LinAlgError if S_k is singular.
np.linalg.inv(S_k).shape

(490, 490)

In [12]:
# Shape of the truncated left factor U_k.
U_k.shape

(8732, 490)

# Queries

In [13]:
# Queries
query1 = {'ignorant', 'overwhelming'}
query2 = {'xfinity', 'frustrate', 'adapter', 'verizon', 'router'}
query3 = {'terminate', 'rent', 'promotion', 'joke', 'liar', 'internet', 'horrible'}
query4 = {'kindergarten', 'ridiculous', 'internet', 'clerk', 'terrible' }

queries = [query1, query2, query3, query4]
returned_ids = []

# These do not depend on the query, so build them once.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
column_names = list(term_doc_df.columns)
doc_norms = np.linalg.norm(A_hat, axis=0)  # one norm per document column

for index, query in enumerate(queries):
    # Lemmatize then stem each query word since it is done for terms too.
    query_terms = {stemmer.stem(lemmatizer.lemmatize(word)) for word in query}

    # Query vector: 1 for every vocabulary term that occurs in the query
    q = np.array([1 if col in query_terms else 0 for col in column_names])
    q_norm = np.linalg.norm(q)
    if q_norm == 0:
        # No query term is in the vocabulary: cosine similarity is undefined,
        # so report it instead of reusing a result from an earlier query.
        print(f"For query{index + 1}:")
        print("No query term occurs in the vocabulary; no document returned.\n")
        continue

    # Cosine similarity against every document column at once. Columns
    # with zero norm get similarity 0 instead of NaN.
    denominators = q_norm * doc_norms
    sims = np.divide(
        q @ A_hat,
        denominators,
        out=np.zeros(A_hat.shape[1]),
        where=denominators > 0,
    )
    # argmax returns the first maximum, matching the strict `>` scan it replaces.
    max_id = int(np.argmax(sims))
    max_sim = sims[max_id]
    returned_ids.append(max_id)
    print(f"For query{index + 1}:")
    print(f"Related document id: {max_id}, Similarity = {max_sim:.4f}\n")

For query1:
Related document id: 931, Similarity = 0.0634

For query2:
Related document id: 279, Similarity = 0.2470

For query3:
Related document id: 1047, Similarity = 0.2935

For query4:
Related document id: 1557, Similarity = 0.3052



In [14]:
# Show the original complaint text of each retrieved document.
complaint_texts = df["text"]
for doc_id in returned_ids:
    print(complaint_texts.loc[doc_id], "\n")

They ignored first amendment rights and ignored dispute letter for over-charging and making false charges for services not provided. Internet is for $29.99 or $10; cable, for $65.54 or $75.  However, Comcast wants $300 a month for services that were not given or started. This is fraud on the account. 

Xfinity has taken over my wifi without my consent... I would NEVER subscribe to xfinity because of this... Leave me alone and get off my server!!! 

Since May, it's the second time I have had to go to their office to get a modem.  I was without phone service since I have my Vonage connected to the Comcast Internet service. I am stuck since I had several problems with service and money issues with Verizon that cost me a lot of money.  So my only choice is Comcast. As far as phone service is concerned, Comcast works very well with my Vonage service.  It's a relief that I don't have to call the Internet service provider and Vonage like I always had to with Verizon. I am annoyed that my plan