In [None]:
# Load the text columns from the CSV files and stack them into a single-column DataFrame named 'Text'
import pandas as pd
# Read the three source tables. The posts/comments file is loaded whole; for the
# two club listings only the description/summary columns are requested.
text_data = pd.read_csv("data/posts_and_comments.csv")
callink_data = pd.read_csv(
    "data/callink_complete.csv",
    usecols=['short_description', 'full_description'],
)
cols = ['eecs_summary', 'full_content']
eecs_data = pd.read_csv("data/eecs_organizations_complete.csv", usecols=cols)

# Place the tables side by side, then stack every column end to end so that
# each non-missing cell becomes one row of a single 'Text' column.
combined = pd.concat([text_data, callink_data, eecs_data], axis=1)
stacked_columns = [combined[label] for label in combined.columns]
combined_data = (
    pd.concat(stacked_columns)
    .dropna()
    .reset_index(drop=True)
    .to_frame(name='Text')
)
print(combined_data)

                                                   Text
0     Hello, \n\nI was wondering if anyone else woul...
1     I applied to 9 tech clubs (i’m an eecs major) ...
2     [Berkeley club decisions today got me like:](h...
3     Title. So exhausted of this kinda stuff. Heard...
4     Over the past month on this subreddit there ha...
...                                                 ...
3537  apps.cskickstart@gmail.com CS KickStart is a o...
3538  mentors@berkeley.edu Computer Science Mentors ...
3539  politburo@csua.berkeley.edu The CSUA is a soci...
3540  dss.berkeley@gmail.com Data Science Society at...
3541  calgamedesign@gmail.com The Game Design and De...

[3542 rows x 1 columns]


In [86]:
import ollama
import pandas as pd

# Name of the Ollama model passed to ollama.embed to turn text into vectors.
embed_model = "nomic-embed-text"
# Name of the Ollama model passed to ollama.chat to generate the final answer.
lang_model = "deepseek-r1:8b"    # OK for chat, not for embeddings

# One-time setup, left disabled: uncomment to pull the models
# (presumably only needed if they are not already available locally).
# ollama.pull(embed_model)
# ollama.pull(lang_model)

def embed_text(t, preview_chars=200):
    """Return the embedding vector for one piece of text, or None if it cannot be embedded.

    Args:
        t: A value from the text column. Anything that is not a non-blank
            string (None, NaN, numbers, whitespace-only strings) is skipped.
        preview_chars: Maximum number of characters of the text echoed in the
            error message when the embedding call fails.

    Returns:
        The first vector in the "embeddings" field of the ollama.embed result,
        or None when the input is skipped or the embedding call raises.
    """
    # The isinstance check covers None/NaN and also keeps .strip() from being
    # called on non-string values (e.g. numeric CSV cells), which would raise
    # outside the try block and abort the whole .apply().
    if not isinstance(t, str) or not t.strip():
        return None
    try:
        result = ollama.embed(model=embed_model, input=t)
        return result["embeddings"][0]  # extract vector
    except Exception as e:
        # Best effort: report the failure and continue so one bad row does not
        # stop the run. Only a preview is printed; echoing entire posts floods
        # the cell output.
        preview = t if len(t) <= preview_chars else t[:preview_chars] + "..."
        print("Error embedding:", preview)
        print(e)
        return None

# Embed every row of the 'Text' column; rows that embed_text skips or fails on hold None.
text_column = combined_data['Text']
combined_data['embeddings'] = text_column.map(embed_text)
print(combined_data['embeddings'])

Error embedding: Miss out on a consulting club this semester? I recently graduated from Cal and was a member of a consulting club, but a lot of my closest friends were in non-business orgs. Looking back, I realized they often had a much healthier balance: friendlier communities, lower time commitments, and (surprisingly often) solid career outcomes too. So I wanted to share some honest thoughts on those clubs from what I’ve heard — the ones that still have applications, but are much easier to join and a lot less intense than consulting or finance orgs. 

**Most importantly, most of the clubs in this list still have applications open as of today!**

# Engineering and STEM

**Space Exploration @ Berkeley (SEB)** – Members are insanely talented and dedicated. Every semester they design and launch liquid-fueled rockets in the desert, with the long-term goal of reaching the Kármán Line (the edge of space). Has a bunch of departments covering engineering and non-engineering work (sponsors, p

In [87]:
import ollama

# Sanity check: embed a short string and show the vector and its length.
embed_model = "nomic-embed-text"

res = ollama.embed(model=embed_model, input="hello world")
hello_vector = res['embeddings'][0]
print(hello_vector)
print(len(hello_vector))

[-0.0067467727, -0.00133432, -0.17155784, 0.008378785, 0.0058282795, 0.06985107, -0.00021276812, -0.043072857, -0.014629322, -0.05409464, 0.00049852236, 0.03921064, 0.027733982, 0.080841236, 0.045365613, -0.06293847, 0.010274429, -0.029600356, -0.04279723, 0.029614873, -0.003703514, -0.09430282, -0.0076090745, 0.03807867, 0.09222674, -0.01426902, -0.014989232, 0.061616883, 0.006471232, -0.02197144, -0.0011848657, -0.010898966, -0.00022149562, 0.015662344, 0.039446663, 0.0027534093, 0.032555092, 0.017283518, 0.016348604, 0.005898644, -0.004709806, -0.014834746, 0.011998094, 0.01018382, 0.06594034, -0.0015392096, -0.004152813, 0.00032882535, 0.086824074, -0.0605711, -0.018226594, 0.0053619696, -0.0009732849, 0.06012123, 0.06726484, 0.035327177, 0.04966108, -0.061634053, 0.024222896, 0.0345504, 0.02174549, 0.04368797, 0.032916177, 0.06531373, -0.017444132, -0.03361823, -0.02522467, 0.035480324, -0.0027233004, 0.018134493, 0.073080435, 0.004239657, 0.0107999435, 0.014048353, 0.02455864, 0.

In [88]:
# Disabled experiment: weight each embedding by its post score using a tuned sigmoid (see the formula below).
# Approximate factors: score 0-200 -> ~1, 200-500 -> ~1.3, 500-700 -> ~1.6, 700+ -> ~1.8
# This does not work well yet; comment scores would also need to be scaled if this is implemented.

# consulting_club_posts['Score Factor'] = consulting_club_posts['Score'].apply(lambda x: 1 + 1/(1+20*pow(2, -x/100)))

# consulting_club_posts['Weighted embedding'] = consulting_club_posts['embedding'].combine(consulting_club_posts['Score Factor'], lambda lst, weight: [weight * x for x in lst])
# print(consulting_club_posts['Weighted embedding'], consulting_club_posts['Score'])


In [89]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two numeric sequences.

    The dot product runs over paired elements of ``a`` and ``b``; each
    magnitude is computed over its own full sequence. When the product of the
    magnitudes is zero, that zero product is returned instead of dividing.
    """
    dot_prod = 0
    for x, y in zip(a, b):
        dot_prod += x * y

    norm_a = sum(x ** 2 for x in a) ** 0.5
    norm_b = sum(y ** 2 for y in b) ** 0.5
    denominator = norm_a * norm_b

    # Guard against division by zero for all-zero (or empty) vectors.
    if not denominator:
        return denominator
    return dot_prod / denominator   # add mag_b to dot_prod for weightage

In [90]:
def retrieve_data(query, n=5):
    """Score every stored embedding against ``query`` and return the top ``n``.

    Args:
        query: Text to embed with ``embed_model`` and compare against the
            'embeddings' column of ``combined_data``.
        n: Number of highest-scoring rows to return.

    Returns:
        The ``n`` largest similarity scores, as produced by ``nlargest`` on the
        scored 'embeddings' column. Rows whose embedding is None score -1.
    """
    query_embed = ollama.embed(model=embed_model, input=query)['embeddings']

    def score(row_embedding):
        # Rows that could not be embedded get the sentinel score -1.
        if row_embedding is None:
            return -1
        return cosine_similarity(query_embed[0], row_embedding)

    # Change to 'weighted embedding' for weightage
    return combined_data['embeddings'].apply(score).nlargest(n)

In [94]:
# Ask the user for a question and fetch the most similar stored texts.
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve_data(input_query)

# Build the bullet list outside the prompt f-string. A backslash or a reused
# quote character inside an f-string replacement field is a SyntaxError before
# Python 3.12, so keeping the join here lets the cell parse on older kernels.
context_lines = '\n'.join(
    f" - {combined_data.loc[i, 'Text']}" for i in retrieved_knowledge.index
)

# Instructions plus retrieved context; sent to the chat model ahead of the question.
instruction_prompt = f'''You are a helpful chatbot aimed to help UC Berkeley students learn about and choose school clubs to join.
You do not know anything about the student asking you questions except from what they have told you in previous queries.
Use only the following pieces of context to answer the question. 
These pieces of context are not related to the student asking you questions.
They are simply provided as information for you to base your answer off of.
Don't make up any new information about any clubs or about the student you are talking to.

Context:
{context_lines}
'''

print(instruction_prompt)

You are a helpful chatbot aimed to help UC Berkeley students learn about and choose school clubs to join.
You do not know anything about the student asking you questions except from what they have told you in previous queries.
Use only the following pieces of context to answer the question. 
These pieces of context are not related to the student asking you questions.
They are simply provided as information for you to base your answer off of.
Don't make up any new information about any clubs or about the student you are talking to.

Context:
 - Which clubs would be good to apply to as a freshman with no business experience?
 - What are some fun clubs I can join? Idc about any consulting, cs or any other competitive clubs. Only want to socialize and have fun 

ty
 - So I know that especially for EECS and CS clubs tend to be competitive. But, other than for connecting, what are the big benefits of joining a club? Is it truly worth your time, all things considered? 
 - Hello everyone! I am

In [95]:
# Send the instruction prompt followed by the user's question, streaming the reply.
conversation = [
    {"role": "user", "content": instruction_prompt},
    {"role": "user", "content": input_query},
]
stream = ollama.chat(model=lang_model, messages=conversation, stream=True)

print("Chatbot response:")
full_text = []
for chunk in stream:
    # Some chunks (especially the last) may lack 'message' or carry no content; skip those.
    piece = chunk.get("message", {}).get("content")
    if not piece:
        continue
    # Echo each piece as it arrives and keep it for the assembled answer.
    print(piece, end="", flush=True)
    full_text.append(piece)

print()  # newline at the end
answer = "".join(full_text)


Chatbot response:
Based on the context, here are some clubs that might be a good fit for you as a freshman in EECS interested in CS and Machine Learning:

1.  **ML/AI Club:** This is likely the most direct fit for your interests. They probably host talks, workshops, and project sessions focused on Machine Learning and Artificial Intelligence.
2.  **Robotics Club:** This club often heavily involves programming, AI, and Machine Learning to control robots, providing practical application.
3.  **EECS Clubs:** Look for broader EECS clubs (like the official EECS Undergraduate Student Association or specific lab clubs). These often have sub-groups or events related to CS and ML. They provide a general EECS community and networking opportunities.
4.  **Hack Berkeley:** While not a traditional club, Hackathons are intense events where you build projects (often involving CS and ML) with others, which can be a great way to learn and meet people interested in these areas.

These clubs should provi