# Find a mentor - Proof of concept

### Setup

In [0]:
%pip install faiss-cpu

Collecting faiss-cpu
  Obtaining dependency information for faiss-cpu from https://files.pythonhosted.org/packages/e4/9c/aed8b7c6c490c777c404131b3f6a68e4924fbc149620dc6d6a3563435371/faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting numpy<3.0,>=1.25.0 (from faiss-cpu)
  Obtaining dependency information for numpy<3.0,>=1.25.0 from https://files.pythonhosted.org/packages/5b/86/caec78829311f62afa6fa334c8dfcd79cffb4d24bcf96ee02ae4840d462b/numpy-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Downloading numpy-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.0 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.3 MB/s[0m eta

In [0]:
import faiss
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Required Spark Imports
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *

# Obtain the Spark session under the app name "MentorMatching".
# getOrCreate() reuses an already-active session, so this line only creates
# a new one when running outside Databricks.
spark = (
    SparkSession.builder
    .appName("MentorMatching")
    .getOrCreate()
)

In [0]:
# Full profile table. The source column literal "сourses" is kept exactly as
# written: it is renamed to the plain "courses" (NOTE(review): its first
# character appears to be a non-ASCII look-alike of "c" - confirm).
profiles_df = (
    spark.read
    .parquet('/dbfs/linkedin_people_train_data')
    .withColumnRenamed("сourses", "courses")
)

# Clustered user table: keep only the columns used for mentor matching and
# rename "cluster" -> "meta_industry" and "user_features" -> "features".
filepath = '/FileStore/tables/Anya_files/df1_clustered.parquet'
user_with_meta_industries = (
    spark.read
    .parquet(f"dbfs:{filepath}")
    .select("id", "name", "cluster", "user_features", "percentile_group")
    .withColumnsRenamed({"cluster": "meta_industry",
                         "user_features": "features"})
)

In [0]:
def find_potential_mentors(user_id: str, user_df: DataFrame, profiles_df: DataFrame, k: int = 5) -> DataFrame:
    """
    Find up to K potential mentors for a given user using FAISS KNN (IndexFlatL2).

    Candidate selection from ``user_df``:
      * percentile group 0: no mentors are suggested;
      * percentile group below 5: users with the same ``meta_industry`` whose
        percentile group is one or two above the user's;
      * otherwise: all other users in the same percentile group.

    The candidates' ``features`` vectors are indexed with FAISS and the
    candidates nearest (L2 distance) to the user's own vector are selected.

    Parameters:
        user_id (str): The ID of the user for whom we are searching mentors.
        user_df (DataFrame): Users with columns ``id``, ``meta_industry``,
            ``percentile_group`` and ``features`` (values must expose
            ``.toArray()``).
        profiles_df (DataFrame): The profiles DataFrame with full user details;
            must contain an ``id`` column.
        k (int): The maximum number of nearest neighbors to find.

    Returns:
        DataFrame: Rows of ``profiles_df`` whose ``id`` is one of the selected
        mentors (at most ``k`` distinct ids). The rows are not sorted by
        distance. An empty DataFrame with the schema of ``profiles_df`` is
        returned when the user is in percentile group 0 or no candidate
        matches, so callers can always chain DataFrame methods on the result.

    Raises:
        ValueError: If ``user_id`` is not present in ``user_df``.
    """

    # Step 1: Retrieve user details (first() yields None when nothing matches)
    user_row = user_df.filter(F.col("id") == user_id).first()

    if user_row is None:
        raise ValueError(f"User ID {user_id} not found in the database.")

    user_meta_industry = user_row["meta_industry"]
    user_percentile_group = user_row["percentile_group"]

    # Zero-row DataFrame used for every "no result" exit, instead of a list,
    # to honour the declared DataFrame return type.
    no_mentors = profiles_df.limit(0)

    # Step 2: Ensure the user is not in the lowest percentile group
    if user_percentile_group == 0:
        print(f"User {user_id} is in the lowest percentile group and cannot have mentors.")
        return no_mentors

    # Step 3: Filter potential mentors
    if user_percentile_group < 5:
        # Same industry, one or two percentile groups above the user.
        potential_mentors_df = user_df.filter(
            (F.col("meta_industry") == user_meta_industry) &
            (F.col("percentile_group") > user_percentile_group) &
            (F.col("percentile_group") <= user_percentile_group + 2)
        )
    else:
        # Peers in the same percentile group, excluding the user themself.
        # NOTE(review): unlike the branch above, this one does not filter on
        # meta_industry - confirm that cross-industry matches are intended.
        potential_mentors_df = user_df.filter(
            (F.col("percentile_group") == user_percentile_group) &
            (F.col("id") != user_id)
        )

    # Step 4: Collect mentor feature vectors
    mentor_data = potential_mentors_df.select("id", "features").collect()

    if len(mentor_data) < k:
        print(f"Only found {len(mentor_data)} mentors matching the criteria.")

    if not mentor_data:
        return no_mentors

    mentor_ids = [row["id"] for row in mentor_data]
    mentor_vectors = np.array([row["features"].toArray() for row in mentor_data]).astype("float32")

    # Step 5: Convert user's feature vector to a (1, dim) float32 array
    user_vector = np.array(user_row["features"].toArray()).astype("float32").reshape(1, -1)

    # Step 6: Create and populate FAISS index
    vector_dim = mentor_vectors.shape[1]  # Feature dimensionality
    index = faiss.IndexFlatL2(vector_dim)  # L2 (Euclidean) distance index
    index.add(mentor_vectors)  # Add mentor vectors to index

    # Step 7: Run KNN search.
    # Never ask for more neighbors than there are candidates: FAISS pads the
    # missing results with index -1, which would otherwise be read as
    # mentor_ids[-1] and silently select the last candidate.
    n_neighbors = min(k, len(mentor_ids))
    _, indices = index.search(user_vector, n_neighbors)

    # Step 8: Retrieve mentor IDs (skip any -1 padding defensively)
    mentor_ids_selected = [mentor_ids[idx] for idx in indices[0] if idx >= 0]

    # Step 9: Look up the selected mentors' full profiles
    mentor_profiles = profiles_df.filter(F.col("id").isin(mentor_ids_selected))

    return mentor_profiles

### Enter your user id

In [0]:
# ID of the user to find mentors for; it is compared against the "id" column
# when looking up the profile and when searching for mentors below.
user_id = 'aodsessrubin'

### These are your profile details

In [0]:
# Show the key profile fields of the selected user.
(
    profiles_df
    .filter(col("id") == user_id)
    .select("id", "name", "about", "current_company:name", "url")
    .display()
)

id,name,about,current_company:name,url
aodsessrubin,Adam Odsess-Rubin,"Adam Odsess-Rubin is the Artistic Director and Founder of National Queer Theater and an experienced Teaching Artist. He is also Co-Founder of the Criminal Queerness Festival, a series of new plays showcasing LGBTQ+ playwrights from countries that criminalize or censor queer and trans artists. Formerly the Education Associate at New York Theatre Workshop and Education and Community Programs Fellow at American Conservatory Theater, Adam founded National Queer Theater after managing youth development programs at All Stars Project in San Francisco and New York. In San Francisco he became a prominent activist and organizer as the assistant to NAMES Project AIDS Memorial Quilt Founder Cleve Jones, organizing hotel service workers around issues affecting the LGBTQ+ community. Odsess-Rubin completed his Masters at New York University in the Educational Theatre for Colleges and Communities program, where he did done extensive research on storytelling and HIV/AIDS. Odsess-Rubin has collaborated with Lincoln Center, Carnegie Hall, The Guthrie Theater, American Conservatory Theatre, New York Theatre Workshop, New Conservatory Theatre Center, MCC Theater, and Refugee Youth Summer Academy. Headshot: Desmond Picotte",National Queer Theater,https://www.linkedin.com/in/aodsessrubin


### Your mentor suggestions will be displayed below:

In [0]:
# Look up to five mentor suggestions for the selected user ...
mentors = find_potential_mentors(
    user_id,
    user_with_meta_industries,
    profiles_df,
    k=5,
)

# ... and show the same profile fields that were displayed for the user.
(
    mentors
    .select("id", "name", "about", "current_company:name", "url")
    .display()
)

id,name,about,current_company:name,url
valerija-m-87962020,Valerija M.,"Biostatistician with over 20 years of experience, primarily in phase III clinical trials and observational studies. Interested in health outcome research, hierarchical modeling, Bayesian modeling, SAS, R. RESEARCH EXPERIENCE - Collaborated with medical researchers as a primary statistician on manuscripts, abstract and poster presentations - Collaborated with medical researchers as a primary statistician on manuscripts, abstract and poster presentations - Conducted statistical analysis in SAS, R - Trained research staff on study design and procedures - Created reports Data and Safety Monitoring Committees",,https://www.linkedin.com/in/valerija-m-87962020
darling-yanes-67556592,Darling Yanes,Dedicated Speech-Language Pathologist with a professional presence and an empathetic understanding of patients dealing with communication and swallowing disorders. Experience with trach and vent patients and critical care.,Genesis Rehab Services,https://www.linkedin.com/in/darling-yanes-67556592
michelle-lohman-8a6a4920,Michelle Lohman,I am passionate about helping others!,Realty ONE Group,https://www.linkedin.com/in/michelle-lohman-8a6a4920
jim-davis-1437ab7,Jim Davis,"Experience: Jim Davis, after 25 years' work in community mental health/substance abuse and private practice, discovered that disaster response and recovery work offered unique opportunities to apply his full range of experience and skills. Goals: Consultation regarding— Advance the development, testing, and refinement of best practices in the field of disaster response & recovery, especially as applied to capacity-building, behavioral health and case management. Particular emphasis on integration of lessons learned from past disasters and collaborative work between government, NGO, and faith-based providers.",,https://www.linkedin.com/in/jim-davis-1437ab7
john-d-cruz-chmm-mba-44587877,"John D. Cruz, CHMM, MBA","Experienced TSDF,RCRA, NORM, EH&S and wastewater manager with a proven history in a multi-task environment.",American Allwaste,https://www.linkedin.com/in/john-d-cruz-chmm-mba-44587877


## How are mentors suggested?

Our AI-powered system carefully analyzes your profile to find the best mentor matches. We identify similarities in career paths, skills, and interests while ensuring a meaningful gap in experience levels. This way, we connect you with someone who can provide valuable insights and guidance, creating a mutually beneficial mentorship experience!