In [1]:
import numpy as np
import pandas as pd
from models import Quote

df = pd.read_csv('../data/reduced_quotes.csv')
reduced_df = df.sample(frac=0.1, random_state=42)  # Keep 10% of the data
reduced_df.to_csv('../data/reduced_reduced_quotes.csv', index=False)
reduced_df = reduced_df.dropna()
quotes = []

reduced_df['processed_category'] = reduced_df['category'].apply(lambda x: ' '.join(str(x).split(', ')))

print(reduced_df.head(5))

for _, row in reduced_df.iterrows():
    #quotes.append(row['quote'] row['author'], row['processed_category']))
    quotes.append({'quote': row['quote'], 'author': row['author'], 'category': row['processed_category']})

print()
print(len(quotes))
print(quotes[0])


                                                   quote  \
3835   Despite all my rageI am still just a rat in th...   
3578   The most important experiences a man can have ...   
17679  I can feel myself gently slipping away into a ...   
31388  It is tempting to write the history of technol...   
12801  He said he'd never remarry, because he'd never...   

                                   author                        category  \
3835                         Billy Corgan  anger, cages, life, rage, rats   
3578         Paulo Coelho, Eleven Minutes                         courage   
17679  Rak Razam, Aya: A Shamanic Odyssey    spirituality, transformation   
31388                      the microscope                    the airplane   
12801        Alexandra Bracken, Passenger                       true-love   

                processed_category  
3835    anger cages life rage rats  
3578                       courage  
17679  spirituality transformation  
31388                 the ai

In [4]:

from sklearn.model_selection import train_test_split

print(len(quotes))

training, test = train_test_split(quotes, test_size=0.33, random_state=42, shuffle=True)

print(training[0])

print(len(training))

x_train = [q['quote'] for q in training]
y_train = [q['author'] for q in training]

x_train = ["" if pd.isna(text) else str(text) for text in x_train]


x_test = [q['quote'] for q in test]
y_test = [q['author'] for q in test]

print(x_test[0])
print(y_test[0])


3240
{'quote': 'There is nothing meritorious but virtue and friendship.', 'author': 'Alexander Pope', 'category': 'friendship'}
2170
I own my past, it hasn't been grande' But it's had some pretty great moments. I own my movements of now, it isn't what I've dreamt, but I'm closer than I was before.I own my future, it is going to test me, But I trust I have the strength to pull through. Life isn't what happens to us, but what we choose to become.
Nikki Rowe


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
train_x_vectors = tfidf_vectorizer.fit_transform(x_train)

In [6]:
print(train_x_vectors.shape)
print(train_x_vectors[0].toarray())
print(tfidf_vectorizer.get_feature_names_out())


(2170, 9574)
[[0. 0. 0. ... 0. 0. 0.]]
['000' '10' '100' ... 'zone' 'zones' 'zoophilism']


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(train_x_vectors, dense_output=False)

print(similarity_matrix.shape)


(2170, 2170)


In [8]:
import pickle

# Assuming similarity_matrix is your computed matrix
with open("similarity_matrix.pkl", "wb") as f:
    pickle.dump(similarity_matrix, f)


In [12]:
import pickle

# Save the quotes list to a file
with open("quotes.pkl", "wb") as f:
    pickle.dump(quotes, f)
    print(quotes[0]) 


{'quote': 'Despite all my rageI am still just a rat in the cage.', 'author': 'Billy Corgan', 'category': 'anger cages life rage rats'}
{'quote': 'The most important experiences a man can have are those that take him to the very limit; that is the only way we learn, because it requires all our courage.', 'author': 'Paulo Coelho, Eleven Minutes', 'category': 'courage'}


In [21]:
## Recommending Similar Quotes

#def recommend_similar_quotes(quote_index):
#    print("Quote Index:", quote_index)
#
#    similarity_row = similarity_matrix[quote_index].toarray().flatten()
#    similar_indices = np.argsort(similarity_row)[::-1] ## indexing: [start:stop:step]
#
#    similar_quote = [quotes[i].quote for i in similar_indices if i != quote_index]
#    similar_quote_author = [quotes[i].author for i in similar_indices if i != quote_index]
#    returned_object = {'quote': similar_quote[0], 'author': similar_quote_author[0]}
#    return returned_object

In [20]:
import pickle5 as pickle

#liked_quote_index = 8

#with open("result.pkl", "wb") as f:
#    pickle.dump(recommend_similar_quotes(liked_quote_index), f)

#with open("result.pkl", "rb") as f:
#    loaded_result = pickle.load(f)
#    print(loaded_result)  # Will print similar quote

