In [139]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.neighbors import NearestNeighbors
import ast


In [140]:
# Load the generated mentor and mentee profiles, keeping only the columns
# whose header does not start with 'Unnamed:'.
def _is_named_column(column_name):
    """Return True for CSV columns whose header does not start with 'Unnamed:'."""
    return not column_name.startswith('Unnamed:')


mentors = pd.read_csv('../generated_data/mentor.csv', usecols=_is_named_column)
mentees = pd.read_csv('../generated_data/mentee.csv', usecols=_is_named_column)

In [141]:
def _expand_labels(frame, column):
    """Replace ``column`` with one indicator column per distinct label it holds.

    Cells may be a single string or a list of strings. Returns a new frame;
    ``frame`` itself is not modified.
    """
    # explode() turns each list cell into one row per label (plain string
    # cells pass through unchanged); dropna() removes the placeholder row
    # that an empty list or a missing value produces.
    labels = frame[column].explode().dropna()
    # Summing per index label folds the exploded rows back into one row
    # per profile.
    indicators = labels.str.get_dummies().groupby(level=0).sum()
    return frame.join(indicators).drop(columns=column)


def _encode_profiles(profiles, literal_columns):
    """One-hot encode a single profile table without touching the input.

    Parameters
    ----------
    profiles : pandas.DataFrame
        Needs the columns 'first_name', 'last_name', 'profile_id',
        'time_zone', 'skill_rank', 'tech_or_career' and 'skills'.
    literal_columns : iterable of str
        Columns whose cells are stringified Python literals (e.g. "['a', 'b']")
        and must be parsed with ``ast.literal_eval`` before encoding.

    Returns
    -------
    pandas.DataFrame
        Indexed by the part of 'profile_id' after the first '#', with the
        four label columns replaced by indicator columns.
    """
    encoded = (
        profiles
        .drop(columns=['first_name', 'last_name'])
        .assign(profile_id=lambda df: df['profile_id'].apply(
            lambda profile_id: profile_id.split('#')[1]))
        .set_index('profile_id')
    )
    # Encoding order fixes the column order of the result.
    for column in ('time_zone', 'skill_rank', 'tech_or_career', 'skills'):
        if column in literal_columns:
            encoded[column] = encoded[column].apply(ast.literal_eval)
        encoded = _expand_labels(encoded, column)
    return encoded


def ohe(mentors, mentees):
    """One-hot encode the mentor and mentee profile tables.

    Both inputs are left unmodified, so the function can be called again on
    the same frames. Name columns are dropped, the index becomes the part of
    'profile_id' after '#', and 'time_zone', 'skill_rank', 'tech_or_career'
    and 'skills' are replaced by one indicator column per distinct label.

    For mentees, 'skill_rank' is encoded as stored (no literal parsing); for
    mentors it is parsed with ``ast.literal_eval`` first. 'tech_or_career'
    and 'skills' are parsed for both tables; 'time_zone' for neither.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(encoded_mentors, encoded_mentees)``.
    """
    # Mentees are processed first, matching the original evaluation order.
    encoded_mentees = _encode_profiles(
        mentees, literal_columns=('tech_or_career', 'skills'))
    encoded_mentors = _encode_profiles(
        mentors, literal_columns=('skill_rank', 'tech_or_career', 'skills'))
    return encoded_mentors, encoded_mentees

In [142]:
# One-hot encode both profile tables.
# NOTE(review): the raw frames are rebound to their encoded versions here, so
# re-running this cell without first re-running the load cell passes
# already-encoded frames (no name/profile_id columns) back into ohe().
mentors, mentees = ohe(mentors, mentees)

In [150]:
# Stand-in for real review data: a random sparse matrix with one row per
# mentee and one column per mentor, about 10% of cells filled, seeded so the
# result is reproducible.
reviews = pd.DataFrame(
    sp.sparse.random(len(mentees), len(mentors), density=0.1, random_state=42).toarray(),
    index=mentees.index,
    columns=mentors.index,
)

In [161]:
# Append each mentee's encoded features to their row of review scores;
# both frames are keyed by 'profile_id'.
combined = reviews.merge(mentees, how='left', on='profile_id')


In [183]:
# Inspect the merged layout: mentor-id review columns first, then the mentee feature columns.
combined.columns

Index(['11996681750350', '35370095635796', '669953825711', '42936362672176',
       '63309130194498', '15095524536358', '33900753313979', '31786410112246',
       '51254476683696', '8732351419446', '6779804862565', '45253072381561',
       '78965207533', '63230246147329', '53556494378032', '15739495146142',
       '7743178596546', '58951482985289', '27742392291718', '29322709338287',
       '63121684978234', '65206608419743', '66361726368093', '15717367537539',
       '55424401629879', '13277216717264', '68341711426252', '33504899213232',
       '25031268776199', '8284601696244', 'Central', 'Eastern', 'Mountain',
       'Pacific', 'Advanced', 'Beginner', 'Expert', 'Intermediate',
       'Career Preparation', 'Technical', 'Android', 'Angular', 'Axios', 'C#',
       'C++', 'CSS', 'Django', 'HTML', 'IOS', 'Java', 'JavaScript',
       'Microsoft Azure', 'MongoDB', 'Node.JS', 'PHP', 'PostgreSQL', 'Python',
       'React', 'Ruby', 'SQL'],
      dtype='object')

In [163]:
# Feature columns produced by ohe() for the mentees.
mentees.columns

Index(['Central', 'Eastern', 'Mountain', 'Pacific', 'Advanced', 'Beginner',
       'Expert', 'Intermediate', 'Career Preparation', 'Technical', 'Android',
       'Angular', 'Axios', 'C#', 'C++', 'CSS', 'Django', 'HTML', 'IOS', 'Java',
       'JavaScript', 'Microsoft Azure', 'MongoDB', 'Node.JS', 'PHP',
       'PostgreSQL', 'Python', 'React', 'Ruby', 'SQL'],
      dtype='object')

In [180]:
def create_model(df, neighbors_number):
    """Fit a NearestNeighbors index on the rows of ``df``.

    The default neighbour count stored on the model is
    ``neighbors_number + 1``, capped at the number of rows in ``df``.
    (Presumably the extra one allows for a query row matching itself --
    TODO confirm.)

    Returns the fitted model.
    """
    neighbor_count = neighbors_number + 1
    row_count = df.shape[0]
    # Never ask for more neighbours than there are rows to choose from.
    if neighbor_count > row_count:
        neighbor_count = row_count

    knn = NearestNeighbors(n_neighbors=neighbor_count)
    knn.fit(df)
    return knn

In [181]:
# Fit the neighbour index on the combined review + feature rows.
model = create_model(combined, neighbors_number=10)

In [189]:
# Hand-built query vector for a hypothetical mentee. It needs one entry per
# column of `combined`, in the same order: the leading review columns are all
# zero and the 1s flag feature columns.
# NOTE(review): positions are hard-coded -- confirm alignment against
# combined.columns whenever the feature set changes.
test = np.array([0,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0])


In [197]:
# Query the 10 nearest rows of `combined`; the result is (distances, row positions).
model.kneighbors([test], 10)



(array([[1.63332607, 1.78075732, 1.84047597, 2.04489126, 2.17889817,
         2.19666162, 2.20802395, 2.23606798, 2.37898021, 2.42046274]]),
 array([[ 63,  12,  10,  14, 213, 198, 253, 278,  54,  80]], dtype=int64))

In [198]:
# Look up the closest match by position (63 is the first position returned by the query above).
combined.iloc[63]

11996681750350        0.000000
35370095635796        0.000000
669953825711          0.000000
42936362672176        0.000000
63309130194498        0.000000
15095524536358        0.000000
33900753313979        0.000000
31786410112246        0.000000
51254476683696        0.000000
8732351419446         0.000000
6779804862565         0.000000
45253072381561        0.000000
78965207533           0.000000
63230246147329        0.000000
53556494378032        0.000000
15739495146142        0.000000
7743178596546         0.000000
58951482985289        0.000000
27742392291718        0.000000
29322709338287        0.000000
63121684978234        0.000000
65206608419743        0.000000
66361726368093        0.221451
15717367537539        0.000000
55424401629879        0.000000
13277216717264        0.000000
68341711426252        0.000000
33504899213232        0.000000
25031268776199        0.000000
8284601696244         0.786583
Central               0.000000
Eastern               0.000000
Mountain

In [227]:
# Encoded profile of mentor 8284601696244, which has the highest review score
# visible in the matched row shown above (0.786583).
mentors.loc['8284601696244']

Central               1
Eastern               0
Mountain              0
Pacific               0
Advanced              0
Beginner              0
Expert                1
Intermediate          1
Career Preparation    1
Technical             1
Android               0
Angular               0
Axios                 0
C#                    1
C++                   0
CSS                   0
Django                0
HTML                  0
IOS                   0
Java                  1
JavaScript            0
Microsoft Azure       1
MongoDB               1
Node.JS               0
PHP                   0
PostgreSQL            0
Python                0
React                 0
Ruby                  0
SQL                   0
Name: 8284601696244, dtype: int64