In [40]:
import itertools
import os
import random
import re
import string

import mysql.connector
import nltk
import numpy as np
import pandas as pd
import pinecone
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [41]:
# Connecting to OSU servers.
# The password is read from the CAPSTONE_DB_PASSWORD environment variable so
# that no secret is stored in the notebook; a missing variable raises KeyError
# before any connection attempt. The remaining settings can be overridden the
# same way and otherwise fall back to the values this notebook has always used.
mydb = mysql.connector.connect(
  host=os.environ.get("CAPSTONE_DB_HOST", "classmysql.engr.oregonstate.edu"),
  user=os.environ.get("CAPSTONE_DB_USER", "capstone_2023_tdsp1"),
  password=os.environ["CAPSTONE_DB_PASSWORD"],
  database=os.environ.get("CAPSTONE_DB_NAME", "capstone_2023_tdsp1"),
)

In [42]:
# Open a cursor on the MySQL connection; later cells use it to run the query and fetch the rows.
cur = mydb.cursor()

In [43]:
# grabbing bios and job titles
# The statement is written as adjacent string literals (one clause per line),
# which Python concatenates into the single query string sent to the server.
cur.execute(
    "select twitter_profiles.person_id,twitter_profiles.description, job_titles.title "
    "FROM twitter_profiles "
    "join people on twitter_profiles.person_id = people.id "
    "join positions on people.id = positions.person_id "
    "join job_titles on job_titles.id = positions.job_title_id "
    "where twitter_profiles.description is NOT NULL "
    "and job_titles.title is NOT NULL "
    "LIMIT 5000;"
)

In [44]:
# Pull every row produced by the query into memory; each entry holds
# (person_id, description, title) in the order selected above.
myresult = cur.fetchall()

In [None]:
# Exploratory look at the fetched rows as a NumPy array.
# The reshape result is not assigned, so this cell only displays the reshaped
# array; `test_list` is not read by any later cell shown in this notebook.
# NOTE(review): `test_list[0]` raises IndexError when the query returned no rows.
test_list = np.array(myresult)
test_list.reshape((len(test_list), len(test_list[0])))

In [46]:
# Replace each fetched row with a list copy of itself, updating `myresult` in place.
for row_number, row in enumerate(myresult):
    myresult[row_number] = list(row)

In [47]:
# Build a DataFrame from the fetched rows; the columns carry default integer
# labels (0, 1, 2) until they are renamed in a later cell.
df = pd.DataFrame(myresult)

In [48]:
# Give the positional columns descriptive names.
column_names = {0: 'person_id', 1: 'bio', 2: 'job_title'}
df = df.rename(columns=column_names)

In [61]:
# Removing the urls in order to preprocess the text.
# One alternation covers every URL form handled here. The scheme-less forms are
# anchored with \b and their dots are escaped so that ordinary words which merely
# contain "www" or "t.co" (for example "internet.com") are left untouched.
_URL_PATTERN = re.compile(
    r'https?://\S+'                # full http:// or https:// links
    r'|\bwww\.\S+'                 # scheme-less www. links
    r'|\bpic\.twitter\.com/\S+'    # scheme-less Twitter picture links
    r'|\bt\.co/\S+',               # scheme-less Twitter short links
    flags=re.IGNORECASE,
)


def remove_urls(text):
    """Return ``text`` with URLs deleted.

    Removes http(s) links, ``www.`` links, ``pic.twitter.com/...`` links and
    ``t.co/...`` short links. Only the URL itself is deleted; the whitespace
    around it is kept, so a URL in the middle of a sentence leaves two
    adjacent spaces behind.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _URL_PATTERN.sub('', text)

# Strip URLs from every bio, element by element.
df["bio"] = df["bio"].map(remove_urls)

In [50]:
# Removing the emoji in order to preprocess the text.
# The pattern is compiled once here rather than on every call. In addition to
# the original four ranges it covers the newer emoji blocks and the symbol
# blocks that Twitter renders as emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\uFE0F"                 # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return ``text`` with every run of emoji characters deleted.

    Only the characters in the ranges of ``_EMOJI_PATTERN`` are removed;
    surrounding whitespace and all other characters are kept as they are.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _EMOJI_PATTERN.sub(r'', text)

# Strip emoji from every bio, element by element.
df["bio"] = df["bio"].map(remove_emojis)

In [51]:

stop_words = set(stopwords.words('english'))


# removing the stop words.
def remove_stopwords(text):
    """Return ``text`` with English stop words dropped.

    The input is converted with ``str`` and split on whitespace. A token is
    dropped when its lowercase form is in ``stop_words``; NLTK's English list
    is lowercase, so comparing the raw token would let capitalised stop words
    such as "The" or "I" through. Kept tokens retain their original casing and
    are re-joined with single spaces.

    Args:
        text: The value to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    return " ".join(
        word for word in str(text).split() if word.lower() not in stop_words
    )

# Drop stop words from every bio; the function is passed directly instead of
# through a wrapping lambda.
df["bio"] = df["bio"].apply(remove_stopwords)

In [52]:
lemmatizer = WordNetLemmatizer()


def lemmatize(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed to ``lemmatizer.lemmatize`` with its default
    arguments, and the results are re-joined with single spaces.
    """
    tokens = text.split()
    return " ".join(map(lemmatizer.lemmatize, tokens))

In [53]:
# Lemmatize every bio; the function is passed directly instead of through a
# wrapping lambda.
df["bio"] = df["bio"].apply(lemmatize)

In [54]:
# Trim leading and trailing whitespace from every bio.
df["bio"] = df["bio"].str.strip()

We preprocess the text (removing URLs, emojis, and stop words, then lemmatizing) so that the model sees only the informative content of each bio and can produce better similarity results.

In [55]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [56]:
# Identifier of the pretrained model that SentenceTransformer loads in a later cell.
model_name = 'symanto/sn-xlm-roberta-base-snli-mnli-anli-xnli'

In [57]:
# Repeats the device selection made in an earlier cell (same expression).
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [58]:
# Load the sentence-embedding model named by `model_name` onto the selected device.
model = SentenceTransformer(model_name, device=device)

In [59]:
# Holds one (id, vector, metadata) record per bio; populated by the next cell.
bio_embedding = []

In [None]:
# Preparing the bio embeddings in order to send them to Pinecone.
# All bios are encoded in a single call so the model can batch them, instead of
# one encode call per row. The list is rebuilt from scratch, so re-running this
# cell does not append a second copy of every record.
# Each record is (row position as string, embedding as a list of floats,
# metadata carrying the person's original id).
bio_texts = df.iloc[:, 1].tolist()
bio_vectors = (
    model.encode(bio_texts, convert_to_tensor=True).tolist() if bio_texts else []
)
bio_embedding = [
    (f'{i}', vector, {"original_id": str(df.iloc[i, 0])})
    for i, vector in enumerate(bio_vectors)
]

In [None]:
# Holds one (id, vector, metadata) record per job title; populated by the next cell.
job_title_embedding = []

In [None]:
# Preparing the job title embeddings in order to send them to Pinecone.
# All job titles are encoded in a single call so the model can batch them,
# instead of one encode call per row. The list is rebuilt from scratch, so
# re-running this cell does not append a second copy of every record.
# Each record is (row position as string, embedding as a list of floats,
# metadata carrying the person's original id).
job_title_texts = df.iloc[:, 2].tolist()
job_title_vectors = (
    model.encode(job_title_texts, convert_to_tensor=True).tolist()
    if job_title_texts
    else []
)
job_title_embedding = [
    (f'{i}', vector, {"original_id": str(df.iloc[i, 0])})
    for i, vector in enumerate(job_title_vectors)
]

In [None]:
# The API key is read from the PINECONE_API_KEY environment variable so that no
# secret is stored in the notebook; a missing variable raises KeyError before
# any request is made. The environment name can be overridden with
# PINECONE_ENVIRONMENT and otherwise keeps the value this notebook has always used.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-east1-gcp"),
)

In [None]:
# Handle to the Pinecone index named "testbiojob"; later cells use it for upserts and queries.
index = pinecone.Index("testbiojob")

In [None]:
def chunks(iterable, batch_size=100):
    """Yield consecutive tuples of at most ``batch_size`` items from ``iterable``.

    Items are consumed lazily. The final tuple may be shorter than
    ``batch_size``; an empty iterable yields nothing.
    """
    source = iter(iterable)
    while True:
        batch = tuple(itertools.islice(source, batch_size))
        if not batch:
            return
        yield batch

# Presumably the embedding width and the number of records (TODO confirm);
# neither constant is read by any cell shown in this notebook.
vector_dim = 768
vector_count = 5000


# Upsert data with 100 vectors per upsert request
# Sends the bio records to the 'biov2' namespace, one chunk per request.
for ids_vectors_chunk in chunks(bio_embedding, batch_size=100):
    index.upsert(vectors=ids_vectors_chunk,namespace='biov2')  # `index` is the Pinecone handle created in an earlier cell

In [None]:
# Same helper as the `chunks` defined in an earlier cell of this notebook;
# this definition rebinds the name with equivalent behaviour.
def chunks(iterable, batch_size=100):
    """Yield consecutive tuples of at most ``batch_size`` items from ``iterable``.

    Items are consumed lazily. The final tuple may be shorter than
    ``batch_size``; an empty iterable yields nothing.
    """
    source = iter(iterable)

    def next_batch():
        return tuple(itertools.islice(source, batch_size))

    # iter(callable, sentinel) keeps calling next_batch until it returns ().
    yield from iter(next_batch, ())

# Presumably the embedding width and the number of records (TODO confirm);
# neither constant is read by any cell shown in this notebook.
vector_dim = 768
vector_count = 5000


# Upsert data with 100 vectors per upsert request
# Sends the job title records to the 'job_titlev2' namespace, one chunk per request.
for ids_vectors_chunk in chunks(job_title_embedding, batch_size=100):
    index.upsert(vectors=ids_vectors_chunk,namespace='job_titlev2')  # `index` is the Pinecone handle created in an earlier cell

In [None]:
# Rank every stored bio against the first bio's embedding.
# `vector` is passed as the flat list of floats held in the record; wrapping it
# in another list would send a nested list, while the response is read below as
# a single-vector query (via its 'matches' key).
# NOTE(review): Pinecone documents an upper bound on top_k, which is lower when
# metadata is returned; confirm that df.shape[0] stays within it.
top_bio = index.query(
    vector=bio_embedding[0][1],
    top_k=df.shape[0],
    include_metadata=True,
    namespace='biov2'
)

In [None]:
# Rank every stored job title against the first job title's embedding.
# `vector` is passed as the flat list of floats held in the record; wrapping it
# in another list would send a nested list, while the response is read below as
# a single-vector query (via its 'matches' key).
# NOTE(review): Pinecone documents an upper bound on top_k, which is lower when
# metadata is returned; confirm that df.shape[0] stays within it.
top_job_title = index.query(
    vector=job_title_embedding[0][1],
    top_k=df.shape[0],
    include_metadata=True,
    namespace='job_titlev2'
)

In [None]:
# Default bio similarity; it stays 0 if the target id is not found among the matches.
temp_bio = 0

In [None]:
# Here we just selected a random ID in order to get the cosine result for testing purposes.
# Every match is scanned; when several share the id, the last one's score is kept.
for match in top_bio['matches']:
    if match['metadata']['original_id'] != '44534':
        continue
    temp_bio = match['score']

In [None]:
# Default job title similarity; it stays 0 if the target id is not found among the matches.
temp_job = 0

In [None]:
# Here we just selected a random ID in order to get the cosine result for testing purposes.
# Every match is scanned; when several share the id, the last one's score is kept.
for match in top_job_title['matches']:
    if match['metadata']['original_id'] != '44534':
        continue
    temp_job = match['score']

In [None]:
# The weights could become a client-facing setting: to make the bio count for
# more than the job title, give the bio the larger weight.
bio_weight = 1
job_weight = 1

# calculate weighted average
weighted_score_sum = temp_bio * bio_weight + temp_job * job_weight
weight_total = bio_weight + job_weight
overall_similarity = weighted_score_sum / weight_total
print("Bio similarity score:", temp_bio)
print("Job title similarity score:", temp_job)
print("Overall similarity score:", overall_similarity)