In [10]:
import praw
import pandas as pd
from praw.models import MoreComments

# #URL to access the app needed to scrape the data off the Berkeley subreddit
# # https://www.reddit.com/prefs/apps


# #PLAN
# #Scrape the data off the Berkeley subreddit about consulting clubs
# #Organize that data using the pandas library
# #Create chatbot that utilizes natural language processing that will give users feedback
# #about each consulting club here at Berkeley. 

reddit_read_only = praw.Reddit(client_id = "QlBfNfxQ3e_MGP9RkaOQig",
                               client_secret = "SpLjOwYdQPU4z1wqcXBjVl_7DnUIZg",
                               user_agent = "Berkeley_Consulting")

subreddit = reddit_read_only.subreddit("berkeley")

In [11]:
consulting_posts = subreddit.search('consulting')

posts_dict = {'Title': [], 'Post Text': [], 'ID': [], 'Score': [], 'Total Comments': [], 'Post URL': []}

In [12]:
for post in consulting_posts:
    #The title of the post
    posts_dict['Title'].append(post.title)
    #The text inside of the post
    posts_dict['Post Text'].append(post.selftext)
    #Unique ID of each post
    posts_dict['ID'].append(post.id)
    #The scoure of a post
    posts_dict['Score'].append(post.score)
    #Total number of comments inside the post
    posts_dict['Total Comments'].append(post.num_comments)
    #URL of each post
    posts_dict['Post URL'].append(post.url)

consulting_club_posts = pd.DataFrame(posts_dict)
print(consulting_club_posts)

                                                Title  \
0     What are consulting clubs actually looking for?   
1               Asian monoculture in consulting clubs   
2   Rejected from consulting club after having bee...   
3   Tech PM blocks all “.berkeley.edu” e-mails bc ...   
4   Why are some of these Business/Consulting Club...   
..                                                ...   
95  got into a consulting club that has nothing to...   
96          another bear rejected by consulting clubs   
97  There are 150-odd chapters of Chinese Students...   
98             NEW! - Stress Management Consultations   
99  Made it to a Consulting Club’s Final Round Int...   

                                            Post Text       ID  Score  \
0   I am an incoming freshman at Haas next year an...  1kvi9jt     43   
1   Over the past month on this subreddit there ha...  16o5u8z    266   
2   titles kinda self explanatory lol. didn’t even...  1na99k8     74   
3             Consultin

In [13]:
#Need to figure out a way to get the URL of each and every post
post_comments = []
for i in posts_dict['Post URL']:
    if 'comments' in i:
        submission = reddit_read_only.submission(url = i)
    else:
        continue
    for comment in submission.comments:
        if type(comment) == MoreComments:
            continue
    post_comments.append(comment.body)


comments_df = pd.DataFrame(post_comments, columns=['comment'])
comments_df

Unnamed: 0,comment
0,reading this is honestly exhausting. what is t...
1,Don’t apply to any club next semester .. we wi...
2,Suck it up. Get over it. You're not privileged.
3,"Just tell him that you work for McKinsey, Bain..."
4,nepotism
...,...
86,I’m interested! Dm me if you are still looking!
87,The whole point of a club (and college experie...
88,"You might know this one then, what *is* consul..."
89,"You might know this one then, what *is* consul..."


In [13]:
import ollama

embed_model = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
lang_model = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

consulting_club_posts['embedding'] = consulting_club_posts['Post Text'].apply(
    lambda t: ollama.embed(model=embed_model, input=t)['embeddings'])

# Attempt to weight embeddings by multiplying by a coefficient calculated by the score, sigmoid function
# Didn't work, can't multiply floats to pandas column
# consulting_club_posts['embedding'].apply(lambda x: consulting_club_posts['Score'].apply(lambda x: 1 + 1/(1+20*pow(2, -x/100))))

In [14]:
def cosine_similarity(a, b):
    if len(a) < 1 or len(b) < 1 or len(a[0]) != len(b[0]):
        return 0
    dot_prod = sum([x * y for x, y in zip(a[0], b[0])])
    mag_a = pow(sum([pow(x, 2) for x in a[0]]), 0.5)
    mag_b = pow(sum([pow(y, 2) for y in b[0]]), 0.5)
    return dot_prod / (mag_a * mag_b)

In [15]:
def retrieve_data(query, n=3):
    query_embed = ollama.embed(model=embed_model, input=query)['embeddings']
    similarities = consulting_club_posts['embedding'].apply(lambda x: cosine_similarity(query_embed, x))
    return similarities.nlargest(n)

In [16]:
input_query = input('Ask me a question: ')
retrieved_knowledge = retrieve_data(input_query)


instruction_prompt = f'''You are a helpful chatbot.
Use only the following pieces of context to answer the question. Don't make up any new information:
{'\n'.join([f' - {consulting_club_posts.loc[i, 'Post Text']}' for i in retrieved_knowledge.index])}
'''

print(instruction_prompt)

You are a helpful chatbot.
Use only the following pieces of context to answer the question. Don't make up any new information:
 - I heard consulting clubs like BC, Voyager, & BBS have really good placement and really help you forward in your career. I was looking to join one with more of social circle then just professional development stuff and was wondering what the recruiting process is like and what clubs i should look to join 
 - I want to just join a consulting club to try it out and learn as much as possible. I know it’s usually hard to get into these, which ones would y’all suggest are easier to get into and have a chiller vibe overall?
 - I want to do consulting as a career after college, so I think joining a club to get some early experience could be super helpful. However, it seems like all the clubs value their “community and culture” and have swanky projects with F500 companies. 

I know that some differ in the sense that they might emphasize different verticals, like heal

In [17]:
stream = ollama.chat(
  model=lang_model,
  messages=[
    {'role': 'system', 'content': instruction_prompt},
    {'role': 'user', 'content': input_query},
  ],
  stream=True,
)

# print the response from the chatbot in real-time
print('Chatbot response:')
for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)


Chatbot response:
Acceptance into a consulting club is competitive, and there are several factors that can influence their decision. Here's a general overview:

1. **Experience**: Many clubs value experience, especially for those who have been working in the industry for some time.
2. **Skills**: Clubs often look for candidates with specific skills, such as Excel expertise, data analysis, or business acumen.
3. **Networking**: A strong professional network and connections within the industry can be beneficial.
4. **Personal statement or essay**: Many clubs require a personal statement or essay that highlights your relevant experience, skills, and career goals.
5. **Recommendations**: Recommendations from alumni, colleagues, or industry professionals can increase your chances of getting accepted.

To improve your chances of getting accepted into a consulting club:

1. **Gain relevant experience**: Try to gain relevant work experience in the industry, even if it's not directly related to