In [None]:
# Load the stored data from the CSV file and flatten all of its columns into a single Series
import pandas as pd
# Read the exported posts/comments table from disk.
text_data = pd.read_csv("data/posts_and_comments.csv")

# Stack every column end-to-end, discard the missing entries, renumber the
# rows from zero, and expose the result as a one-column frame named 'Text'.
combined_data = (
    pd.concat([text_data[column] for column in text_data.columns])
    .dropna()
    .reset_index(drop=True)
    .to_frame(name='Text')
)

In [87]:
import ollama
# embed_model = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'    Can't take large enough data
# Run `ollama serve` in a terminal to start the Ollama server before executing this cell.
embed_model = 'hf.co/bartowski/granite-embedding-30m-english-GGUF'
lang_model = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

# Pull both models through Ollama before they are used below.
ollama.pull(embed_model)
ollama.pull(lang_model)


def _embed_text(text):
    """Embed a single text with `embed_model`.

    Parameters
    ----------
    text : str or missing value
        The text to embed. Missing values (as judged by `pd.isna`) are
        returned unchanged instead of being sent to the model.

    Returns
    -------
    The 'embeddings' entry of the Ollama response, unwrapped to its single
    inner list when it holds exactly one element so that each row does not
    store a list of a list unnecessarily; otherwise the entry as returned.
    """
    if pd.isna(text):
        # Leave missing rows untouched. Unwrapping must not be attempted here:
        # calling len() on a NaN float raises TypeError.
        return text
    vectors = ollama.embed(model=embed_model, input=text)['embeddings']
    return vectors[0] if len(vectors) == 1 else vectors


# One embedding per row of 'Text', already flattened by _embed_text.
combined_data['embeddings'] = combined_data['Text'].apply(_embed_text)
combined_data

Unnamed: 0,Text,embeddings
0,Over the past month on this subreddit there ha...,"[0.0076961033, 0.02189807, 0.049328394, -0.000..."
1,I am an incoming freshman at Haas next year an...,"[0.012069779, -0.005787543, 0.026605263, 0.026..."
2,Consulting clubs making us look bad smh,"[-0.05232234, -0.017782124, 0.07643678, 0.0361..."
3,titles kinda self explanatory lol. didn’t even...,"[-0.040933266, 0.05489012, 0.06248989, 0.03643..."
4,My friend and I were enjoying dinner in the so...,"[-0.038904518, 0.03499681, -0.016001709, 0.003..."
...,...,...
599,"Hey there, just went through the other side of...","[-0.033589683, 0.017472168, 0.03562603, -0.011..."
600,Hi! I went through all the interview rounds bu...,"[-0.07933153, -0.01163375, 0.08825164, 0.01141..."
601,Start your own if you really want to be one of...,"[-0.028648727, -0.045113333, 0.015668534, 0.01..."
602,Tailor it to the club as much as possible. We ...,"[-0.051620197, 0.029057426, 0.02242967, 0.0428..."


In [88]:
# Creating weights for each embedding based on score values, plugging them into a tuned sigmoid function
# 0-200 has factor ~1, 200-500 has factor ~1.3, 500-700 has factor ~1.6, 700+ has factor ~1.8
# Doesn't function well, need to include comment score scaling as well if implemented

# consulting_club_posts['Score Factor'] = consulting_club_posts['Score'].apply(lambda x: 1 + 1/(1+20*pow(2, -x/100)))

# consulting_club_posts['Weighted embedding'] = consulting_club_posts['embedding'].combine(consulting_club_posts['Score Factor'], lambda lst, weight: [weight * x for x in lst])
# print(consulting_club_posts['Weighted embedding'], consulting_club_posts['Score'])


In [89]:
def cosine_similarity(a, b):
    """Return the cosine similarity between two numeric sequences.

    The dot product is taken over paired elements (pairing stops at the
    shorter sequence), while each magnitude is computed over the whole of its
    own sequence. When the product of the magnitudes is zero, that zero
    product is returned instead of dividing.
    """
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x ** 2 for x in a) ** 0.5
    norm_b = sum(y ** 2 for y in b) ** 0.5
    denominator = norm_a * norm_b
    if not denominator:
        # Zero-length vector on either side: avoid a division by zero.
        return denominator
    return dot / denominator   # add norm_b to dot for weightage

In [90]:
def retrieve_data(query, n=5):
    """Rank the stored texts by similarity to `query`.

    The query is embedded with `embed_model`, compared against every entry of
    combined_data['embeddings'] via `cosine_similarity`, and the `n` largest
    scores are returned as a Series (indexed like `combined_data`).
    """
    response = ollama.embed(model=embed_model, input=query)
    query_vectors = response['embeddings']

    def score(stored_vector):
        # The first element of the response's 'embeddings' list is the query vector.
        return cosine_similarity(query_vectors[0], stored_vector)

    # Change to 'weighted embedding' for weightage
    ranked = combined_data['embeddings'].apply(score)
    return ranked.nlargest(n)

In [None]:
# Ask the user for a question and look up the most similar stored texts.
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve_data(input_query)

# Build the bullet list before the f-string. Putting a backslash ('\n') and a
# nested same-quote f-string inside an f-string expression is only valid on
# Python 3.12+; building it here keeps the cell runnable on older kernels.
context_block = '\n'.join(
    f" - {combined_data.loc[i, 'Text']}" for i in retrieved_knowledge.index
)

# System prompt: fixed instructions followed by the retrieved context lines.
instruction_prompt = f'''You are a helpful chatbot aimed to help UC Berkeley students learn about and choose clubs to join.
You do not know anything about the student asking you questions except from what they have told you in previous queries.
Use only the following pieces of context to answer the question. 
These pieces of context are not related to the student asking you questions.
They are simply provided as information for you to base your answer off of.
Don't make up any new information about any clubs or about the student you are talking to.

Context:
{context_block}
'''

print(instruction_prompt)

You are a helpful chatbot aimed to help UC Berkeley students learn about and choose clubs to join.
You do not know anything about the student asking you questions.
Use only the following pieces of context to answer the question. 
These pieces of context are not related to the student asking you questions.
They are simply provided as information for you to base your answer off of.
Don't make up any new information about any clubs or about the student you are talking to.

Context:
 - Hey guys! I am super excited to come to Berkeley in the fall and I heard that joining clubs is a great way to get involved. I applied here as Econ and I’m really curious about consulting clubs, but there are an overwhelming amount of options. The more I look at Reddit the more confused I get haha (why is it so deep??). I’ve seen some “ranking” posts that put some clubs over others, but I wanted some more information from current students about what these clubs actually are and how they are different. Any ins

In [106]:
# Conversation sent to the language model: the system prompt carrying the
# retrieved context, followed by the user's question.
chat_messages = [
    {'role': 'system', 'content': instruction_prompt},
    {'role': 'user', 'content': input_query},
]
stream = ollama.chat(model=lang_model, messages=chat_messages, stream=True)

# Echo each streamed piece as soon as it arrives, without adding newlines.
print('Chatbot response:')
for chunk in stream:
    piece = chunk['message']['content']
    print(piece, end='', flush=True)


Chatbot response:
Since you're interested in machine learning, I'll ask another question to get some more context.

However, I don't know much about the student asking me questions or their interests outside of machine learning. Could you please tell me what sparked your interest in machine learning as a computer science major? This will help me provide more tailored advice on potential clubs and resources for you to learn more about machine learning at UC Berkeley.