In [14]:
# Load the stored posts/comments and the CalLink club descriptions from CSV,
# then stack every column end-to-end into a one-column DataFrame named 'Text'.
import pandas as pd

text_data = pd.read_csv("data/posts_and_comments.csv")

# Only these two CalLink columns are read.
cols = ['short_description', 'full_content']
callink_data = pd.read_csv("data/callink_complete.csv", usecols=cols)

# Place both tables side by side, then unroll the columns into a single Series.
combined = pd.concat([text_data, callink_data], axis=1)
combined_data = pd.concat([combined[col] for col in combined])

# Drop missing entries, renumber the rows from 0 and wrap the Series in a frame.
combined_data = (
    combined_data
    .dropna()
    .reset_index(drop=True)
    .to_frame(name='Text')
)

In [15]:
import ollama

# Use the command `ollama serve` in the terminal to start the ollama server.
# Previously tried embed_model = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf',
# but it can't take large enough data.
embed_model = 'hf.co/bartowski/granite-embedding-30m-english-GGUF'
lang_model = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

for _model_name in (embed_model, lang_model):
    ollama.pull(_model_name)


def _embed_text(text):
    """Embed one 'Text' entry with `embed_model`.

    Missing values are passed through without calling ollama. The 'embeddings'
    field of the response is a list of vectors; when it holds exactly one
    vector that vector is returned directly, so each element is not a list of
    a list unnecessarily. Any other length is returned unchanged.
    """
    if pd.isna(text):
        vectors = text
    else:
        vectors = ollama.embed(model=embed_model, input=text)['embeddings']
    if len(vectors) == 1:
        return vectors[0]
    return vectors


combined_data['embeddings'] = combined_data['Text'].apply(_embed_text)
combined_data

Unnamed: 0,Text,embeddings
0,"Hello, \n\nI was wondering if anyone else woul...","[-0.029673584, -0.020410215, 0.02847843, 0.029..."
1,I applied to 9 tech clubs (i’m an eecs major) ...,"[-0.039149694, -0.02291596, 0.076579526, 0.019..."
2,[Berkeley club decisions today got me like:](h...,"[-0.04979075, 0.0072788103, -0.0024116817, -0...."
3,Title. So exhausted of this kinda stuff. Heard...,"[0.017073292, 0.0561406, -0.018326228, 0.03431..."
4,Over the past month on this subreddit there ha...,"[0.0076961033, 0.02189807, 0.049328394, -0.000..."
...,...,...
3722,This application requires JavaScript to be ena...,"[-0.031965565, 0.032731917, 0.07554597, 0.0294..."
3723,This application requires JavaScript to be ena...,"[-0.031965565, 0.032731917, 0.07554597, 0.0294..."
3724,This application requires JavaScript to be ena...,"[-0.031965565, 0.032731917, 0.07554597, 0.0294..."
3725,This application requires JavaScript to be ena...,"[-0.031965565, 0.032731917, 0.07554597, 0.0294..."


In [3]:
# Creating weights for each embedding based on score values, plugging them into a tuned sigmoid function
# 0-200 has factor ~1, 200-500 has factor ~1.3, 500-700 has factor ~1.6, 700+ has factor ~1.8
# Doesn't work well yet; if this is implemented, comment scores need to be scaled as well

# consulting_club_posts['Score Factor'] = consulting_club_posts['Score'].apply(lambda x: 1 + 1/(1+20*pow(2, -x/100)))

# consulting_club_posts['Weighted embedding'] = consulting_club_posts['embedding'].combine(consulting_club_posts['Score Factor'], lambda lst, weight: [weight * x for x in lst])
# print(consulting_club_posts['Weighted embedding'], consulting_club_posts['Score'])


In [16]:
def cosine_similarity(a, b):
    """Return the cosine similarity between vectors `a` and `b`.

    The dot product is taken over pairs produced by zip(a, b), so only the
    leading components both vectors share contribute to it. Each magnitude is
    computed over the whole of its own vector.

    Returns:
        dot(a, b) / (|a| * |b|), or the (zero) magnitude product itself when
        that product is falsy, which avoids dividing by zero.
    """
    dot_prod = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x ** 2 for x in a) ** 0.5
    norm_b = sum(y ** 2 for y in b) ** 0.5
    norm_product = norm_a * norm_b
    if not norm_product:
        return norm_product
    # For weightage, add norm_b to dot_prod here.
    return dot_prod / norm_product

In [17]:
def retrieve_data(query, n=5):
    """Return the similarity scores of the `n` stored texts closest to `query`.

    The query is embedded with `embed_model` through ollama and compared, via
    cosine_similarity, with every vector in combined_data['embeddings'].

    Args:
        query: Text to search for.
        n: Number of top matches to return (default 5).

    Returns:
        A pandas Series of the `n` largest similarity scores, indexed by the
        corresponding rows of combined_data. The Series is empty when ollama
        returns no embedding for the query.
    """
    query_embed = ollama.embed(model=embed_model, input=query)['embeddings']
    if not query_embed:
        # No vector came back, so indexing [0] would raise IndexError.
        # NOTE(review): presumably this happens for an empty query - confirm.
        return pd.Series(dtype=float, name='embeddings')

    # Look the query vector up once instead of on every row.
    query_vector = query_embed[0]
    similarities = combined_data['embeddings'].apply(
        lambda x: cosine_similarity(query_vector, x))     # Change to 'weighted embedding' for weightage
    return similarities.nlargest(n)

In [18]:
# Ask the user for a question and fetch the most similar stored texts.
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve_data(input_query)

# Build the bullet list outside the f-string. A backslash or a reused quote
# character inside an f-string replacement field is a SyntaxError before
# Python 3.12, so the join is kept in plain code.
context_block = '\n'.join(
    [f" - {combined_data.loc[i, 'Text']}" for i in retrieved_knowledge.index]
)

# System prompt: fixed instructions followed by the retrieved context.
instruction_prompt = f'''You are a helpful chatbot aimed to help UC Berkeley students learn about and choose clubs to join.
You do not know anything about the student asking you questions except from what they have told you in previous queries.
Use only the following pieces of context to answer the question. 
These pieces of context are not related to the student asking you questions.
They are simply provided as information for you to base your answer off of.
Don't make up any new information about any clubs or about the student you are talking to.

Context:
{context_block}
'''

print(instruction_prompt)

You are a helpful chatbot aimed to help UC Berkeley students learn about and choose clubs to join.
You do not know anything about the student asking you questions except from what they have told you in previous queries.
Use only the following pieces of context to answer the question. 
These pieces of context are not related to the student asking you questions.
They are simply provided as information for you to base your answer off of.
Don't make up any new information about any clubs or about the student you are talking to.

Context:
 - im going to be a junior this year. i've been trying to get into data science clubs like dss, saas, datagood, etc. every semester since freshmen year. i completed data 8 and data 100. i do want to get into one this semester because i want to work on some cool projects but i just can't seem to get into the clubs no matter what. is there any point in even trying as a junior? one note is for tiered clubs like saas for example, i always applied to the tier m

In [19]:
# Send the system prompt and the user's question to the language model and
# ask for the reply as a stream of chunks.
chat_messages = [
    {'role': 'system', 'content': instruction_prompt},
    {'role': 'user', 'content': input_query},
]
stream = ollama.chat(model=lang_model, messages=chat_messages, stream=True)

# Print the response from the chatbot in real time, one chunk at a time,
# without inserting newlines between chunks.
print('Chatbot response:')
for chunk in stream:
    piece = chunk['message']['content']
    print(piece, end='', flush=True)


Chatbot response:
As a junior, I've heard that the Tier 1 clubs like SAAS, DS, DSS, DATAGOO (Data Science), and DCS are often popular among freshmen. However, since you mentioned earlier that you applied to many data science and tech clubs this semester and got rejected, I'll suggest some other options for you to consider.

Here are a few Tier 1 clubs that might be worth exploring:

*   **Data Science**: As you've already researched, Data Science is an exciting field with many opportunities. It's a great way to learn from others who have experience in the industry.
*   **Computer Vision Lab**: This club focuses on computer vision and image processing, which is a fascinating area of research. Members work on projects that involve image classification, object detection, and more.
*   **Machine Learning Lab**: Similar to Computer Vision Lab, this club explores machine learning techniques for various applications. Members often work on projects like predictive modeling, natural language pr