In [41]:
# Load every text source from its CSV file and stack all of their columns into
# one single-column DataFrame ('Text') with missing values removed.
import pandas as pd

# All columns of this file are loaded.
# NOTE(review): assumes every column here holds free text -- TODO confirm.
text_data = pd.read_csv("data/posts_and_comments.csv")

callink_cols = ['short_description', 'full_description']
callink_data = pd.read_csv("data/callink_complete.csv", usecols=callink_cols)

eecs_cols = ['description']
eecs_data = pd.read_csv("data/berkeley_eecs_orgs.csv", usecols=eecs_cols)

# Place the sources side by side, then stack the columns end to end.
# Columns are selected by position rather than by label: if two sources ever
# share a column name, a label lookup returns a DataFrame (and visits the name
# twice), which would duplicate rows and break the to_frame() call below.
combined = pd.concat([text_data, callink_data, eecs_data], axis=1)
stacked_text = pd.concat(
    [combined.iloc[:, position] for position in range(combined.shape[1])]
)

# Drop the NaN padding introduced by the side-by-side concat (and any genuinely
# empty cells), then renumber the rows from 0.
combined_data = stacked_text.dropna().reset_index(drop=True).to_frame(name='Text')
print(combined_data)

                                                   Text
0     Hello, \n\nI was wondering if anyone else woul...
1     I applied to 9 tech clubs (i’m an eecs major) ...
2     [Berkeley club decisions today got me like:](h...
3     Title. So exhausted of this kinda stuff. Heard...
4     Over the past month on this subreddit there ha...
...                                                 ...
3537  apps.cskickstart@gmail.com CS KickStart is a o...
3538  mentors@berkeley.edu Computer Science Mentors ...
3539  politburo@csua.berkeley.edu The CSUA is a soci...
3540  dss.berkeley@gmail.com Data Science Society at...
3541  calgamedesign@gmail.com The Game Design and De...

[3542 rows x 1 columns]


In [42]:
import ollama
# embed_model = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'    Can't take large enough data
# Run `ollama serve` in a terminal to start the Ollama server before executing this cell
embed_model = 'hf.co/bartowski/granite-embedding-30m-english-GGUF'
lang_model = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

# Fetch both models so the embed/chat calls below can use them.
ollama.pull(embed_model)
ollama.pull(lang_model)


def _embed_text(text):
    """Return the embedding for one text value.

    Missing values (NaN/None) are passed through unchanged instead of being
    sent to the model. The response's 'embeddings' field is a list of vectors;
    when it holds exactly one vector that vector is returned directly so each
    cell is a flat list rather than a list containing a list.
    """
    if pd.isna(text):
        # Previously this value reached a later len() call and raised TypeError.
        return text
    vectors = ollama.embed(model=embed_model, input=text)['embeddings']
    return vectors[0] if len(vectors) == 1 else vectors


# One request per row; adds the 'embeddings' column to combined_data in place.
combined_data['embeddings'] = combined_data['Text'].apply(_embed_text)
combined_data

Unnamed: 0,Text,embeddings
0,"Hello, \n\nI was wondering if anyone else woul...","[-0.029673584, -0.020410215, 0.02847843, 0.029..."
1,I applied to 9 tech clubs (i’m an eecs major) ...,"[-0.039149694, -0.02291596, 0.076579526, 0.019..."
2,[Berkeley club decisions today got me like:](h...,"[-0.04979075, 0.0072788103, -0.0024116817, -0...."
3,Title. So exhausted of this kinda stuff. Heard...,"[0.017073292, 0.0561406, -0.018326228, 0.03431..."
4,Over the past month on this subreddit there ha...,"[0.0076961033, 0.02189807, 0.049328394, -0.000..."
...,...,...
3537,apps.cskickstart@gmail.com CS KickStart is a o...,"[-0.0174135, -0.0011699701, -0.05148824, 0.018..."
3538,mentors@berkeley.edu Computer Science Mentors ...,"[0.018934086, 0.023859348, -0.0655467, -0.0518..."
3539,politburo@csua.berkeley.edu The CSUA is a soci...,"[0.0009029903, 0.02671817, -0.0026577099, 0.01..."
3540,dss.berkeley@gmail.com Data Science Society at...,"[0.035649933, 0.019879328, -0.0012036118, 0.01..."


In [23]:
# Creating weights for each embedding based on score values, plugging them into a tuned sigmoid function
# 0-200 has factor ~1, 200-500 has factor ~1.3, 500-700 has factor ~1.6, 700+ has factor ~1.8
# Doesn't work well yet; if this is implemented, comment scores need to be scaled as well

# consulting_club_posts['Score Factor'] = consulting_club_posts['Score'].apply(lambda x: 1 + 1/(1+20*pow(2, -x/100)))

# consulting_club_posts['Weighted embedding'] = consulting_club_posts['embedding'].combine(consulting_club_posts['Score Factor'], lambda lst, weight: [weight * x for x in lst])
# print(consulting_club_posts['Weighted embedding'], consulting_club_posts['Score'])


In [43]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Both arguments are sequences of numbers. Elements are paired with zip(),
    so the dot product only covers the length of the shorter sequence.
    When the product of the two magnitudes is zero (e.g. an all-zero or empty
    vector) that zero product is returned instead of dividing by it.
    """
    inner = sum(x * y for x, y in zip(a, b))
    norm_a = sum(x ** 2 for x in a) ** 0.5
    norm_b = sum(y ** 2 for y in b) ** 0.5
    scale = norm_a * norm_b
    if not scale:
        # Zero magnitude: similarity is undefined, report 0.0.
        return scale
    # To weight by the stored vector's magnitude, add norm_b to `inner` here.
    return inner / scale

In [44]:
def retrieve_data(query, n=5):
    """Return the ``n`` highest cosine similarities between ``query`` and the stored texts.

    The query is embedded with ``embed_model`` and compared against every entry
    of ``combined_data['embeddings']``. The result is a Series of similarity
    scores whose index labels are the matching rows of ``combined_data``.
    """
    response = ollama.embed(model=embed_model, input=query)
    query_embed = response['embeddings']

    def score(stored_vector):
        # Swap the column below for 'weighted embedding' to use weighted vectors.
        return cosine_similarity(query_embed[0], stored_vector)

    similarities = combined_data['embeddings'].apply(score)
    return similarities.nlargest(n)

In [54]:
# Ask the user for a question and look up the most similar stored texts.
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve_data(input_query)

# Build the bullet list outside the prompt f-string. Nesting same-type quotes
# and a backslash inside a replacement field is only valid on Python 3.12+;
# this form produces the same text on older interpreters as well.
context_lines = [f" - {combined_data.loc[i, 'Text']}" for i in retrieved_knowledge.index]
context_block = '\n'.join(context_lines)

# The prompt carries the instructions, the retrieved context and the user's
# question in a single message.
instruction_prompt = f'''You are a helpful chatbot aimed to help UC Berkeley students learn about and choose school clubs to join.
You do not know anything about the student asking you questions except from what they have told you in previous queries.
Use only the following pieces of context to answer the question. 
These pieces of context are not related to the student asking you questions.
They are simply provided as information for you to base your answer off of.
Don't make up any new information about any clubs or about the student you are talking to.

Context:
{context_block}


User Query:
{input_query}
'''

print(instruction_prompt)

You are a helpful chatbot aimed to help UC Berkeley students learn about and choose school clubs to join.
You do not know anything about the student asking you questions except from what they have told you in previous queries.
Use only the following pieces of context to answer the question. 
These pieces of context are not related to the student asking you questions.
They are simply provided as information for you to base your answer off of.
Don't make up any new information about any clubs or about the student you are talking to.

Context:
 - I'm an incoming frosh and I already found like +/- 10 clubs I want to join... I know it's not possible to be fully involved in all of them first semester, but how are you guys getting involved? Are you a committee member of 2-3 and then general member of 5? or is it just a solid commitment to 1-4? What's it like for premed students to get involved in these clubs? 

Also if you don't like a club, can you leave and try to join others. Do clubs take

In [53]:
# Send the assembled prompt as a single user message and request a streamed reply.
# The raw question is not sent as a separate message:
# {'role': 'user', 'content': input_query}
chat_messages = [{'role': 'user', 'content': instruction_prompt}]
stream = ollama.chat(model=lang_model, messages=chat_messages, stream=True)

# Echo each chunk as soon as it arrives so the answer appears in real time.
print('Chatbot response:')
for chunk in stream:
    piece = chunk['message']['content']
    print(piece, end='', flush=True)


Chatbot response:
It sounds like you're looking for something more social than competitive, but still want to be involved with the CS community. Based on your interest and previous statements about not joining any CS-related clubs (since they didn't seem to lead to a clear future), I'd recommend exploring other areas of interest within CS.

One potential area could be Data Science or Machine Learning, as these topics are in high demand by companies and have a good balance of practicality and social interaction. Many universities offer data science-focused courses and projects through their departments or clubs, which might align with your interests and provide opportunities for networking and collaboration.

Additionally, there's also the Data Systems (DS) area within CS, which focuses on systems design and development for data-intensive applications. This field has a more moderate level of competition compared to traditional CS disciplines, but still offers plenty of opportunities to 