In [1]:
import os

import pandas as pd
import praw

# #URL to access the app needed to scrape the data off the Berkeley subreddit
# # https://www.reddit.com/prefs/apps


# #PLAN
# #Scrape the data off the Berkeley subreddit about consulting clubs
# #Organize that data using the pandas library
# #Create chatbot that utilizes natural language processing that will give users feedback
# #about each consulting club here at Berkeley. 

# Read-only Reddit client. The API credentials are taken from the environment
# rather than being written into the notebook, so they are never saved or
# shared along with it. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# running this cell; a missing variable raises KeyError naming the variable.
# NOTE(review): credentials that were previously committed in this notebook
# should be treated as exposed and regenerated at https://www.reddit.com/prefs/apps
reddit_read_only = praw.Reddit(client_id = os.environ["REDDIT_CLIENT_ID"],
                               client_secret = os.environ["REDDIT_CLIENT_SECRET"],
                               user_agent = "Berkeley_Consulting")

# Handle for the r/berkeley subreddit; used for the searches below.
subreddit = reddit_read_only.subreddit("berkeley")

In [2]:
# Search the subreddit for submissions matching the keyword "consulting".
consulting_posts = subreddit.search('consulting')

# Column-oriented accumulator: one (initially empty) list per output column.
posts_dict = {
    column: []
    for column in ('Title', 'Post Text', 'ID', 'Score', 'Total Comments', 'Post URL')
}

In [3]:
# Pairs of (output column, submission attribute) in the order they are read:
# title, body text, unique ID, score, comment count, and URL of each post.
column_sources = (
    ('Title', 'title'),
    ('Post Text', 'selftext'),
    ('ID', 'id'),
    ('Score', 'score'),
    ('Total Comments', 'num_comments'),
    ('Post URL', 'url'),
)

# Append one value per column for every search result.
for post in consulting_posts:
    for column, attribute in column_sources:
        posts_dict[column].append(getattr(post, attribute))

# Turn the column lists into a table and show it.
consulting_club_posts = pd.DataFrame(posts_dict)
print(consulting_club_posts)


                                                Title  \
0     What are consulting clubs actually looking for?   
1               Asian monoculture in consulting clubs   
2   Rejected from consulting club after having bee...   
3   Tech PM blocks all “.berkeley.edu” e-mails bc ...   
4   Why are some of these Business/Consulting Club...   
..                                                ...   
95  got into a consulting club that has nothing to...   
96          another bear rejected by consulting clubs   
97  There are 150-odd chapters of Chinese Students...   
98             NEW! - Stress Management Consultations   
99  Made it to a Consulting Club’s Final Round Int...   

                                            Post Text       ID  Score  \
0   I am an incoming freshman at Haas next year an...  1kvi9jt     43   
1   Over the past month on this subreddit there ha...  16o5u8z    267   
2   titles kinda self explanatory lol. didn’t even...  1na99k8     75   
3             Consultin

In [4]:
import ollama

# Model identifiers passed to ollama: one for embeddings, one for chat.
embed_model = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
lang_model = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'


def _embed_post_text(text):
    """Return the 'embeddings' field of the ollama.embed response for `text`."""
    return ollama.embed(model=embed_model, input=text)['embeddings']


# Manual check of the raw embed response for the first post:
# print(ollama.embed(model=embed_model, input=consulting_club_posts['Post Text'][0]))

# Store the embedding of every post body in a new 'embedding' column.
consulting_club_posts['embedding'] = consulting_club_posts['Post Text'].apply(_embed_post_text)

# Abandoned experiment: weight each embedding by a sigmoid-style coefficient
# derived from the post score. Author's note: it didn't work, can't multiply
# floats to pandas column.
# consulting_club_posts['embedding'].apply(lambda x: consulting_club_posts['Score'].apply(lambda x: 1 + 1/(1+20*pow(2, -x/100))))

# Show the stored embedding of the first post.
print(consulting_club_posts['embedding'][0])

[[-0.018994978, 0.0129991, -0.0036018894, -0.0031700125, 0.009426655, -0.0028496964, 0.060952432, 0.034682203, -0.0030728243, -0.02904805, 0.012524607, 0.0104840845, -0.01167893, 0.041888535, 0.08428942, 0.05346695, 0.0726193, 0.017478492, -0.030527465, -0.0063055926, 0.0050021466, 0.014926702, 0.0056062546, 0.0005452206, 0.04803895, 0.01061787, 0.044724368, -0.019061187, -0.011002971, 0.015662197, 0.04212373, -0.05429164, 0.01642771, -0.0007761418, -0.016213376, -0.027217044, 0.0342329, 0.010205899, 0.017316276, -0.022378622, -0.0563324, -0.015968863, 0.0006317522, 0.0006160649, -0.061257437, -0.0077076415, -0.0005407495, 0.05545468, -0.015679678, -0.029212829, -0.019981192, 0.03326462, -0.0142626185, 0.020240018, 0.008820874, 0.05056833, 0.009174076, -0.0282753, -0.024767978, -0.09308863, 0.03182865, -0.0014693203, 0.012075412, 0.03759287, 0.047718354, 0.05173737, 0.03644486, 0.022752156, -0.0530598, -0.003418872, -0.043388393, -0.013045021, 0.035303984, -0.0566192, -0.018162442, -0.

In [6]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two wrapped embedding vectors.

    Both arguments use the nested form stored in the 'embedding' column: a
    sequence whose first element is the actual vector. Only ``a[0]`` and
    ``b[0]`` are read.

    Args:
        a: Sequence whose first element is a sequence of numbers.
        b: Sequence whose first element is a sequence of numbers.

    Returns:
        The dot product of the two vectors divided by the product of their
        Euclidean norms. Returns 0.0 when either wrapper is empty, when the
        vectors differ in length, or when either vector has zero magnitude
        (the similarity is undefined there; 0.0 avoids a ZeroDivisionError).
    """
    if len(a) < 1 or len(b) < 1 or len(a[0]) != len(b[0]):
        return 0.0

    vec_a, vec_b = a[0], b[0]
    dot_prod = sum(x * y for x, y in zip(vec_a, vec_b))
    mag_a = sum(x * x for x in vec_a) ** 0.5
    mag_b = sum(y * y for y in vec_b) ** 0.5

    # Guard against all-zero (or empty) vectors, which would divide by zero.
    denominator = mag_a * mag_b
    if denominator == 0:
        return 0.0
    return dot_prod / denominator

# print(consulting_club_posts['embedding'][0][0] == [x for x in consulting_club_posts['embedding'][0]])
# print(cosine_similarity(consulting_club_posts['embedding'][0][0], consulting_club_posts['embedding'][2][0]))

In [7]:
def retrieve_data(query, n=3):
    """Score every stored post embedding against `query` and keep the top `n`.

    Args:
        query: Text to embed with `embed_model`.
        n: How many of the highest-scoring rows to return (default 3).

    Returns:
        The `n` largest cosine-similarity scores as a pandas Series, indexed
        by the corresponding rows of `consulting_club_posts`.
    """
    response = ollama.embed(model=embed_model, input=query)
    query_vector = response['embeddings']

    def score_against_query(post_embedding):
        return cosine_similarity(query_vector, post_embedding)

    scores = consulting_club_posts['embedding'].apply(score_against_query)
    return scores.nlargest(n)


In [8]:
# Ask the user for a question and look up the most similar posts.
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve_data(input_query)

# Build the bullet list of retrieved post texts outside the prompt f-string.
# Reusing the enclosing quote character and using a backslash inside an
# f-string replacement field is only valid from Python 3.12 onwards; building
# the text separately keeps this cell runnable on earlier versions while
# producing exactly the same prompt.
context_lines = '\n'.join(
    f" - {consulting_club_posts.loc[i, 'Post Text']}"
    for i in retrieved_knowledge.index
)

# System prompt: restricts the model to the retrieved context.
instruction_prompt = f'''You are a helpful chatbot.
Use only the following pieces of context to answer the question. Don't make up any new information:
{context_lines}
'''

print(instruction_prompt)

You are a helpful chatbot.
Use only the following pieces of context to answer the question. Don't make up any new information:
 - I'm a freshman EECS major at Cal, and didn't end up joining any clubs my first semester because I wanted to focus on generally getting used to Cal and getting an internship for the summer. Now, however, I kinda realize that my social circle/network isn't very big and I honestly want to meet more people involved with tech, and have a better overall social scene.

Fortunately, I was able to get a FAANG internship for the summer, so I'm not too worried about resume value etc, and I'm more interested in working on **genuinely** cool projects and having a good time.

I know a lot of the tech/consulting clubs here have a reputation for being prestige competitions, but I'm wondering if there are any that actually do interesting work(that don't require too much knowledge beforehand). Currently I'm thinking about Valley, Venture, and Voyager for consulting and Launch

In [9]:
# Conversation sent to the chat model: the context-bearing system prompt
# followed by the user's question.
chat_messages = [
    {'role': 'system', 'content': instruction_prompt},
    {'role': 'user', 'content': input_query},
]
stream = ollama.chat(model=lang_model, messages=chat_messages, stream=True)

# Write each streamed fragment as soon as it arrives, without added newlines.
print('Chatbot response:')
for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)


Chatbot response:
As an EECS freshman with a background in molecular environmental biology (MEB), you may want to consider joining the following clubs:

1. **Voyager**: This is one of the core tech clubs, which focuses on software development and technical projects.
2. **Launchpad**: While this club has gained attention for its prestigious competitions, it also offers a more relaxed atmosphere and various projects that involve machine learning, data science, and other relevant areas.

These two clubs seem to be a good fit for your background in EECS and MEB.

The others you mentioned, such as Valley, Venture, and Blockchain & Codebase (ML@B), are likely too competitive and might not be the best fit for someone with a strong interest in socializing.