In [1]:

import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the sample user profiles (JSON) into the working DataFrame
file_path = "sample_user_profiles.json"
df = pd.read_json(path_or_buf=file_path)


In [2]:

# Convert user expertise into a text format for vectorization
def preprocess_expertise(row):
    """Flatten a user's expertise lists into one space-separated string.

    Reads ``row['resume']['expertise']`` and concatenates, in this order, the
    ``assetClasses``, ``products``, ``sectors`` and ``regions`` entries.

    Args:
        row: Mapping-like record (for example a DataFrame row) that has a
            ``resume`` entry.

    Returns:
        str: The expertise terms joined by single spaces. An empty string is
        returned when ``resume`` is not a dict or carries no ``expertise``
        dict, so one incomplete profile does not abort the whole ``apply``.

    Raises:
        KeyError: If ``row`` has no ``resume`` entry at all.
    """
    resume = row['resume']
    expertise = resume.get('expertise') if isinstance(resume, dict) else None
    if not isinstance(expertise, dict):
        return ""

    terms = []
    for field in ('assetClasses', 'products', 'sectors', 'regions'):
        # `or []` also covers a key that is present but explicitly null.
        values = expertise.get(field) or []
        if isinstance(values, str):
            # A bare string is one term; extending with it would add characters.
            values = [values]
        terms.extend(values)
    return " ".join(terms)

# Build one expertise string per user; axis=1 passes each row to preprocess_expertise.
df['expertise_text'] = df.apply(preprocess_expertise, axis=1)


In [3]:

# Convert liked tags into a text format.
# A user whose 'tagsLiked' value is not a list (e.g. None/NaN for a record
# without tags) gets an empty string instead of making " ".join() raise.
df['tags_liked_text'] = df['tagsLiked'].apply(
    lambda tags: " ".join(tags) if isinstance(tags, (list, tuple)) else ""
)

# Combine all textual features for recommendation
df['profile_text'] = df['expertise_text'] + " " + df['tags_liked_text']


In [18]:
df  # show the frame with the derived text columns (the rendered output is truncated)

Unnamed: 0,guid,gpn,businessName,businessTitle,employeeRank,firstName,lastName,countryCode,countryName,email,resume,tagsLiked,documentsLiked,documentsPublished,expertise_text,tags_liked_text,profile_text
0,PSI410000,410000,User 0,Research Analyst,Associate,First0,Last0,US,India,user0@ubs.com,{'biography': 'User 0 has experience in financ...,"[Stocks, Oil Prices, Mergers]","[doc_81, doc_69, doc_6, doc_60, doc_1]",[doc_44],Fixed Income Mutual Funds ETFs Technology Nort...,Stocks Oil Prices Mergers,Fixed Income Mutual Funds ETFs Technology Nort...
1,PSI410001,410001,User 1,Global Markets - Sales,Managing Director,First1,Last1,IN,India,user1@ubs.com,{'biography': 'User 1 has experience in financ...,"[Bonds, Mergers]","[doc_49, doc_100, doc_73]",[doc_44],Commodities Derivatives Technology North America,Bonds Mergers,Commodities Derivatives Technology North Ameri...
2,PSI410002,410002,User 2,Investment Banking,Managing Director,First2,Last2,IN,United Kingdom,user2@ubs.com,{'biography': 'User 2 has experience in financ...,[Tech IPOs],[doc_84],"[doc_25, doc_4]",Equities Mutual Funds ETFs Technology Energy Asia,Tech IPOs,Equities Mutual Funds ETFs Technology Energy A...
3,PSI410003,410003,User 3,Investment Banking,Managing Director,First3,Last3,SG,India,user3@ubs.com,{'biography': 'User 3 has experience in financ...,[Oil Prices],"[doc_51, doc_46, doc_21]","[doc_13, doc_47]",Equities ETFs Mutual Funds Healthcare North Am...,Oil Prices,Equities ETFs Mutual Funds Healthcare North Am...
4,PSI410004,410004,User 4,Research Analyst,Managing Director,First4,Last4,GB,United Kingdom,user4@ubs.com,{'biography': 'User 4 has experience in financ...,"[Mergers, Tech IPOs, Bonds]","[doc_78, doc_36, doc_64]","[doc_30, doc_37]",Fixed Income Commodities Derivatives ETFs Ener...,Mergers Tech IPOs Bonds,Fixed Income Commodities Derivatives ETFs Ener...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,PSI410195,410195,User 195,Research Analyst,Executive Director,First195,Last195,SG,Singapore,user195@ubs.com,{'biography': 'User 195 has experience in fina...,"[Oil Prices, Stocks]","[doc_80, doc_78]",[],Fixed Income ETFs Energy Healthcare North America,Oil Prices Stocks,Fixed Income ETFs Energy Healthcare North Amer...
196,PSI410196,410196,User 196,Global Markets - Sales,Managing Director,First196,Last196,GB,India,user196@ubs.com,{'biography': 'User 196 has experience in fina...,"[Oil Prices, Stocks]","[doc_98, doc_94, doc_85, doc_77]","[doc_14, doc_39]",Equities Fixed Income Commodities Mutual Funds...,Oil Prices Stocks,Equities Fixed Income Commodities Mutual Funds...
197,PSI410197,410197,User 197,Investment Banking,Executive Director,First197,Last197,IN,United Kingdom,user197@ubs.com,{'biography': 'User 197 has experience in fina...,"[Mergers, Tech IPOs, Stocks]","[doc_55, doc_15, doc_61]",[doc_40],Fixed Income Mutual Funds Derivatives Healthca...,Mergers Tech IPOs Stocks,Fixed Income Mutual Funds Derivatives Healthca...
198,PSI410198,410198,User 198,Research Analyst,Executive Director,First198,Last198,GB,India,user198@ubs.com,{'biography': 'User 198 has experience in fina...,[Oil Prices],"[doc_39, doc_76, doc_79, doc_48, doc_14]",[doc_12],Commodities ETFs Derivatives Healthcare North ...,Oil Prices,Commodities ETFs Derivatives Healthcare North ...


In [11]:

# Fit TF-IDF on every user's combined profile text
vectorizer = TfidfVectorizer()
profile_matrix = vectorizer.fit_transform(raw_documents=df['profile_text'])

# Pairwise cosine similarity between all user profile vectors
similarity_matrix = cosine_similarity(X=profile_matrix)

# Persist the fitted vectorizer, the similarity matrix and the frame together
joblib.dump(
    value=(vectorizer, similarity_matrix, df),
    filename="/content/user_recommendation_model.pkl",
)

print("Model saved successfully!")


Model saved successfully!


In [12]:
# Only runs inside Google Colab (needs the google.colab package).
# NOTE(review): presumably hands the saved pickle to the browser as a download — confirm.
from google.colab import files
files.download("/content/user_recommendation_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
import joblib
import pandas as pd

# Load the trained model
# Rebinds vectorizer, similarity_matrix and df from the tuple stored in the file.
# NOTE(review): joblib.load unpickles the file, so only load a pickle you produced or trust.
vectorizer, similarity_matrix, df = joblib.load("/content/user_recommendation_model.pkl")
print("Model loaded successfully!")


Model loaded successfully!


In [14]:
def recommend_articles(user_guid, top_n=5, profiles=None, similarities=None):
    """Recommend documents liked by the users most similar to ``user_guid``.

    The ``top_n`` most similar other users are visited from most to least
    similar, and each document they liked that the target user has not liked
    is collected once, in that order.

    Args:
        user_guid: Value to look up in the ``guid`` column.
        top_n: Maximum number of similar users to consult and also the
            maximum number of documents returned.
        profiles: DataFrame with ``guid`` and ``documentsLiked`` columns.
            Defaults to the notebook-level ``df``.
        similarities: Square user-by-user similarity matrix whose rows and
            columns follow the row order of ``profiles``. Defaults to the
            notebook-level ``similarity_matrix``.

    Returns:
        list: Up to ``top_n`` document ids, closest users' documents first
        (empty when ``top_n`` is not positive). If ``user_guid`` is unknown,
        a "not found" message string is returned instead, as before.
    """
    # Fall back to the notebook-level objects so existing calls keep working.
    if profiles is None:
        profiles = df
    if similarities is None:
        similarities = similarity_matrix

    guids = list(profiles['guid'])
    if user_guid not in guids:
        return f"User {user_guid} not found in the dataset."
    if top_n <= 0:
        return []

    # Use the row *position*: the similarity matrix and .iloc are positional,
    # whereas an index label only matches the position for a default index.
    user_pos = guids.index(user_guid)
    scores = similarities[user_pos]

    # Remove the user by position before ranking. Dropping "the first sorted
    # entry" is wrong when another user ties for the top score (identical
    # profile or float rounding). sorted() is stable, so ties keep row order.
    ranked_positions = sorted(
        (pos for pos in range(len(scores)) if pos != user_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    liked_docs = profiles['documentsLiked']
    # Starts as the user's own likes and grows with every document emitted,
    # so one membership test filters both "already liked" and duplicates.
    skip = set(liked_docs.iloc[user_pos])

    # A list (not a set) keeps similarity order, so the truncation to top_n
    # is deterministic and favours the closest users.
    recommendations = []
    for pos in ranked_positions[:top_n]:
        for article in liked_docs.iloc[pos]:
            if article in skip:
                continue
            skip.add(article)
            recommendations.append(article)
            if len(recommendations) == top_n:
                return recommendations
    return recommendations


In [16]:
# Use the guid stored in the first row as the demo user
sample_user_guid = df['guid'].iloc[0]

# Ask for up to five article recommendations and print them
recommendations = recommend_articles(sample_user_guid, top_n=5)
print(f"Recommended articles for user {sample_user_guid}: {recommendations}")


Recommended articles for user PSI410000: ['doc_86', 'doc_72', 'doc_92', 'doc_90', 'doc_34']
