In [1]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelBinarizer
import ast


In [6]:
# Load the mentor and mentee tables, skipping any column whose header starts
# with 'Unnamed:' (presumably index columns written out by an earlier
# to_csv call -- confirm against the data generator).
mentors, mentees = (
    pd.read_csv(csv_path, usecols=lambda column: not column.startswith('Unnamed:'))
    for csv_path in ('../generated_data/mentor.csv', '../generated_data/mentee.csv')
)

In [5]:
def _index_by_profile_id(frame):
    """Return a copy of ``frame`` without name columns, indexed by bare profile id.

    The bare id is the text between the first and second ``#`` of the original
    ``profile_id`` value (i.e. ``value.split('#')[1]``).
    """
    return (
        frame.drop(columns=['first_name', 'last_name'])
        .assign(profile_id=lambda df: df['profile_id'].map(
            lambda raw_id: raw_id.split('#')[1]))
        .set_index('profile_id')
    )


def _one_hot_column(frame, column, parse=None):
    """Replace ``column`` with one indicator column per distinct value it holds.

    ``parse`` (optional) is applied to every cell first, e.g. to turn the
    string ``"['a', 'b']"`` into a real list. List cells contribute one
    indicator per element; scalar cells contribute a single indicator.
    """
    values = frame[column] if parse is None else frame[column].map(parse)
    # explode() yields one row per list element while keeping the original
    # index label, so grouping on that label collapses the per-element
    # dummies back to one row per profile.
    indicators = values.explode().str.get_dummies().groupby(level=0).sum()
    return frame.drop(columns=column).join(indicators)


def clean_data(mentors, mentees):
    """Prepare the raw mentor and mentee tables for matching.

    Both frames lose their ``first_name`` / ``last_name`` columns and are
    re-indexed by the bare profile id taken from their *own* ``profile_id``
    column. The mentor frame additionally has ``time_zone``, ``skill_rank``,
    ``tech_or_career`` and ``skills`` replaced by indicator columns.

    The input frames are not modified; new frames are returned.

    Parameters
    ----------
    mentors : pandas.DataFrame
        Must contain ``first_name``, ``last_name``, ``profile_id``,
        ``time_zone``, ``skill_rank``, ``tech_or_career`` and ``skills``.
        The last three are parsed with ``ast.literal_eval`` and so must hold
        Python-literal strings.
    mentees : pandas.DataFrame
        Must contain ``first_name``, ``last_name`` and ``profile_id``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(mentors, mentees)`` after cleaning.
    """
    # Each table is keyed by its own ids (the mentee ids were previously
    # derived from the mentor table by mistake).
    mentees = _index_by_profile_id(mentees)
    mentors = _index_by_profile_id(mentors)

    # time_zone is used as-is; the remaining columns are stored as
    # Python-literal strings and need parsing before expansion.
    mentors = _one_hot_column(mentors, 'time_zone')
    for literal_column in ('skill_rank', 'tech_or_career', 'skills'):
        mentors = _one_hot_column(mentors, literal_column, parse=ast.literal_eval)

    return mentors, mentees

In [7]:
# Rebind both names to the cleaned frames. Re-running this cell without first
# reloading the CSVs will fail, because the cleaned frames no longer have the
# raw columns that clean_data drops.
mentors, mentees = clean_data(mentors, mentees)

In [8]:
# Show the cleaned mentor table (indexed by profile_id, one indicator column
# per time zone / skill rank / focus area / skill).
mentors

Unnamed: 0_level_0,Central,Eastern,Mountain,Pacific,Advanced,Beginner,Expert,Intermediate,Career Preparation,Technical,...,JavaScript,Microsoft Azure,MongoDB,Node.JS,PHP,PostgreSQL,Python,React,Ruby,SQL
profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11996681750350,0,1,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
35370095635796,0,0,0,1,0,0,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
669953825711,1,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,1,0
42936362672176,0,0,0,1,0,0,0,1,1,1,...,0,1,0,0,1,0,0,0,0,0
63309130194498,1,0,0,0,1,1,1,1,0,1,...,1,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39597099007656,0,0,0,1,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,0
30911687552052,0,1,0,0,0,1,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0
47259932447580,1,0,0,0,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,0,0
7388400741412,0,1,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
def create_model(df, neighbors_number):
    """Fit a nearest-neighbours model on the rows of ``df``.

    The model is configured for ``neighbors_number + 1`` neighbours, capped
    at the number of rows in ``df``. (The extra one presumably accounts for
    a fitted row matching itself when queried -- confirm with the caller.)

    Parameters
    ----------
    df : array-like with a ``shape`` attribute
        Feature matrix, one row per sample.
    neighbors_number : int
        Desired number of neighbours, before the extra one is added.

    Returns
    -------
    NearestNeighbors
        The fitted model.
    """
    requested = neighbors_number + 1
    available = df.shape[0]
    # Never ask for more neighbours than there are rows to choose from.
    neighbor_count = available if available < requested else requested

    knn = NearestNeighbors(n_neighbors=neighbor_count)
    knn.fit(df)
    return knn

In [10]:
# Fit on the cleaned mentor table. create_model asks for neighbors_number + 1
# neighbours (11 here), capped at the number of mentor rows.
model = create_model(mentors, neighbors_number=10)