#   Comparison of a base KNN to a KNN w/ratings

In [186]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.neighbors import NearestNeighbors
import ast
import requests

In [187]:
response = requests.post('http://underdog-devs-ds-a-dev.us-east-1.elasticbeanstalk.com/Mentors/read')
mentors = pd.DataFrame(response.json()['result'])

In [188]:
response = requests.post('http://underdog-devs-ds-a-dev.us-east-1.elasticbeanstalk.com/Mentees/read')
mentees = pd.DataFrame(response.json()['result'])

One hot encoding dataset generated by local_utils

In [189]:
def ohe(mentors, mentees):

    mentors = mentors.drop(columns=['first_name', 'last_name',
                           'email', 'city', 'state', 'validateStatus', 'country'])

    mentors = pd.merge(mentors, pd.get_dummies(mentors, columns=['current_comp', 'experience_level', 'job_help', 'industry_knowledge',
                          'pair_programming', 'other_info', 'formerly_incarcerated', 'list_convictions', 'underrepresented_group', 'low_income']).fillna(0))

    mentors = mentors.join(mentors['subject'].apply(
        pd.Series).stack().str.get_dummies().groupby(level=0).sum())

    mentors = mentors.drop(columns=['subject', 'current_comp', 'experience_level', 'job_help', 'industry_knowledge',
                           'pair_programming', 'other_info', 'formerly_incarcerated', 'list_convictions', 'underrepresented_group', 'low_income'])
    
    mentors = mentors.set_index('profile_id')
    mentors.index.rename('mentors_profile_id', inplace=True)

    #------------------------------------------------------------

    mentees = mentees.drop(columns=['first_name', 'last_name',
                           'email', 'city', 'state', 'validateStatus', 'country', 'list_convictions'])

    mentees = pd.merge(mentees, pd.get_dummies(mentees, columns=['experience_level', 'job_help', 'industry_knowledge',
                          'pair_programming', 'other_info', 'formerly_incarcerated', 'underrepresented_group', 'low_income']).fillna(0))

    mentees = mentees.join(mentees['subject'].apply(
        pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentees = mentees.drop(columns=['subject', 'experience_level', 'job_help', 'industry_knowledge',
                           'pair_programming', 'other_info', 'formerly_incarcerated', 'underrepresented_group', 'low_income'])

    mentees = mentees.set_index('profile_id')
    mentees.index.rename('mentees_profile_id', inplace=True)

    return mentors, mentees


In [190]:
#the driving force of selection is mentee to mentor <---- mentee like system
mentors, mentees = ohe(mentors, mentees)

# Generate a fake review matrix

In [191]:
#This needs to be changed to the review data from the live server eventually
reviews = sp.sparse.random(mentees.shape[0], mentors.shape[0], density=0.1, random_state=42)
reviews = pd.DataFrame(reviews.todense(), index=mentees.index, columns=mentors.index)
reviews = reviews.melt(ignore_index=False ,value_name='rating')

In [192]:
reviews

Unnamed: 0_level_0,mentors_profile_id,rating
mentees_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1
z781i5e3EhH82A7u,vDw14633y666xVTo,0.000000
j57C2e118XEN6G6u,vDw14633y666xVTo,0.000000
Z7352L2zbAA350Ho,vDw14633y666xVTo,0.000000
O1q51b0fl848dYa6,vDw14633y666xVTo,0.000000
eDBrJ3D565016ei4,vDw14633y666xVTo,0.000000
...,...,...
L3W68XeR1336Hd6Z,nSw27773p075eafo,0.000000
64TB2r3s33ylO02B,nSw27773p075eafo,0.279276
d1fh0aoh,nSw27773p075eafo,0.000000
j6xbsk9w,nSw27773p075eafo,0.000000


In [193]:
reviews = reviews.pivot(columns = 'mentors_profile_id', values = 'rating').fillna(0)

KeyError: 'mentees_profile_id'

In [None]:
mentors.columns

Index(['current_comp_Amazin', 'current_comp_Amozonian', 'current_comp_Boogle',
       'current_comp_Lahoo', 'current_comp_Macrohard',
       'current_comp_Pineapple', 'current_comp_Poptrist',
       'current_comp_Toyota', 'current_comp_Unemployed',
       'experience_level_Advanced', 'experience_level_Beginner',
       'experience_level_Expert', 'experience_level_Intermediate',
       'experience_level_beginner', 'experience_level_expert',
       'job_help_False', 'job_help_True', 'job_help_true',
       'industry_knowledge_False', 'industry_knowledge_True',
       'industry_knowledge_true', 'pair_programming_False',
       'pair_programming_True', 'pair_programming_false', 'other_info_',
       'other_info_New Job', 'other_info_Notes', 'other_info_none',
       'formerly_incarcerated_False',
       'list_convictions_robbery, dance battle, etc...',
       'underrepresented_group_False', 'low_income_True', 'Android: Java',
       'Career Development', 'Data Science: Python', 'General Pr

In [None]:
#merge the reviews on profile id
combined = reviews.merge(mentees, left_index=True, right_index=True)

In [None]:
#KNN model
def create_model(df, neighbors_number):
    model = NearestNeighbors(n_neighbors=min(
        neighbors_number+1, df.shape[0]), metric='cosine')
    model.fit(df)
    return model

# Create a KNN with ratings and one without

In [None]:
combined

Unnamed: 0_level_0,0071UfJ4WLt18Z3F,00u13oned0U8XP8Mb4x7,0lO7us717v3UI31U,224Ks8ALel5OW253,2gvkshpc,37T2KouYl5477V1o,3e8pkv7m,6066s0sv35RnW5Eo,632KLg2a8I4yuH71,6gc4Q834RO1h8VT2,...,low_income_False,low_income_True,Android: Java,Career Development,Data Science: Python,General Programming,"Web: HTML, CSS, JavaScript",backend,career,iOS: Swift
mentees_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001lJh12502oVmee,0.855696,0.0,0.456237,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,1,1,0,0,0,0,0,0,0
0AII175WS04xG1N3,0.000000,0.0,0.000000,0.793261,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,1,0,0,0,0,0,0,0,1
0L3YtO8sC2555oS4,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,1,0,0,0,0,1,0,0,0,0
0xx22Xf564qn6Rp4,0.000000,0.0,0.000000,0.212644,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,1,0,0,0,1,0,0,0,0
1138hHTZ32O02PMj,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
x8564JhRkaU72D60,0.000000,0.0,0.000000,0.976800,0.343167,0.0,0.0,0.000000,0.0,0.000000,...,1,0,0,1,0,0,0,0,0,0
xjDnK275381UU0R7,0.000000,0.0,0.000000,0.000000,0.591697,0.0,0.0,0.000000,0.0,0.000000,...,0,1,1,0,0,0,0,0,0,0
yh3i2p2M0e32h34o,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,1,0,0,1,0,0,0,0,0,0
z781i5e3EhH82A7u,0.000000,0.0,0.000000,0.126265,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,1,0,0,0,0,1,0,0,0


In [None]:
mentees.shape

(103, 26)

In [None]:
combined

In [None]:
#Create model
knn_collab = create_model(combined, neighbors_number=10)
knn = create_model(mentors, neighbors_number=10)


#### Perhaps a better approach would be to filter the data further and then apply KNN based on rating so that the weights are focused more on ratings rather than looking at the distance of other features?

random mentee#6967999171653,Raiden,Jones,"['Career Preparation', 'Technical']","['PHP', 'React', 'C++']",Advanced,Mountain

In [None]:
knn_collab_test = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
                            1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

knn_test = np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
                     1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])


In [None]:
#run a test on knn
#note that this returns MENTEES similar to the current mentee

dist_collab, indices_collab = knn_collab.kneighbors([knn_collab_test], 5)
#output indices of mentees that are similar to the current mentee
dist_knn, indices_knn = knn.kneighbors([knn_test], 5)



In [None]:
combined

Unnamed: 0_level_0,0071UfJ4WLt18Z3F,00u13oned0U8XP8Mb4x7,0lO7us717v3UI31U,224Ks8ALel5OW253,2gvkshpc,37T2KouYl5477V1o,3e8pkv7m,6066s0sv35RnW5Eo,632KLg2a8I4yuH71,6gc4Q834RO1h8VT2,...,low_income_False,low_income_True,Android: Java,Career Development,Data Science: Python,General Programming,"Web: HTML, CSS, JavaScript",backend,career,iOS: Swift
mentees_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001lJh12502oVmee,0.855696,0.0,0.456237,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,1,1,0,0,0,0,0,0,0
0AII175WS04xG1N3,0.000000,0.0,0.000000,0.793261,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,1,0,0,0,0,0,0,0,1
0L3YtO8sC2555oS4,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,1,0,0,0,0,1,0,0,0,0
0xx22Xf564qn6Rp4,0.000000,0.0,0.000000,0.212644,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,1,0,0,0,1,0,0,0,0
1138hHTZ32O02PMj,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
x8564JhRkaU72D60,0.000000,0.0,0.000000,0.976800,0.343167,0.0,0.0,0.000000,0.0,0.000000,...,1,0,0,1,0,0,0,0,0,0
xjDnK275381UU0R7,0.000000,0.0,0.000000,0.000000,0.591697,0.0,0.0,0.000000,0.0,0.000000,...,0,1,1,0,0,0,0,0,0,0
yh3i2p2M0e32h34o,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,1,0,0,1,0,0,0,0,0,0
z781i5e3EhH82A7u,0.000000,0.0,0.000000,0.126265,0.000000,0.0,0.0,0.000000,0.0,0.000000,...,0,1,0,0,0,0,1,0,0,0


In [None]:
indices_collab

array([[73, 60, 43, 99, 26]], dtype=int64)

In [None]:
mentors

Unnamed: 0_level_0,current_comp_Amazin,current_comp_Amozonian,current_comp_Boogle,current_comp_Lahoo,current_comp_Macrohard,current_comp_Pineapple,current_comp_Poptrist,current_comp_Toyota,current_comp_Unemployed,experience_level_Advanced,...,low_income_True,Android: Java,Career Development,Data Science: Python,General Programming,"Web: HTML, CSS, JavaScript",backend,front end,frontend,iOS: Swift
mentors_profile_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
vDw14633y666xVTo,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
H8L6d7kG2E1J8f58,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
37T2KouYl5477V1o,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
224Ks8ALel5OW253,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
t82ZE41q1fHbx086,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
UOMnnoy5005162U5,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
k3U0su6268AGiS74,0,1,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
6gc4Q834RO1h8VT2,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
cC8iu2X3681BI7Z6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
I8263IK0QV74Foh1,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
collab_mentor_list = []
for index in indices_collab:
    collab_mentor_list.append(mentors.loc[combined.iloc[index]])

knn_list = []
for index in indices_knn:
    knn_list.append(mentors.loc[combined.iloc[index]])

ValueError: Cannot index with multidimensional key

In [None]:
collab_df = pd.DataFrame(collab_mentor_list[0])
collab_df

Unnamed: 0_level_0,Central,Eastern,Mountain,Pacific,Advanced,Beginner,Expert,Intermediate,Career Preparation,Technical,Android,Angular,Axios,C#,C++,CSS,Django,HTML,IOS,Java,JavaScript,Microsoft Azure,MongoDB,Node.JS,PHP,PostgreSQL,Python,React,Ruby,SQL
mentor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
8284601696244,1,0,0,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0
15095524536358,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
25031268776199,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
51254476683696,1,0,0,0,1,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
66361726368093,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
knn_df = pd.DataFrame(knn_list[0])
knn_df

Unnamed: 0_level_0,Central,Eastern,Mountain,Pacific,Advanced,Beginner,Expert,Intermediate,Career Preparation,Technical,Android,Angular,Axios,C#,C++,CSS,Django,HTML,IOS,Java,JavaScript,Microsoft Azure,MongoDB,Node.JS,PHP,PostgreSQL,Python,React,Ruby,SQL
mentor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
63121684978234,0,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
45253072381561,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
63230246147329,0,0,0,1,0,0,1,0,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
42936362672176,0,0,0,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
8732351419446,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


#### We see that there is some value in using a rating based KNN as the mentor with a higher rating basis would've scored better with the given review matrix.  

In [None]:
# KNN model
combined[(combined['mentor_id'] == '63121684978234') & (combined['rating'] > 0)].rating.mean()

0.4541850826707502

In [None]:
# KNN model with rating
combined[(combined['mentor_id'] == '8284601696244') & (combined['rating'] > 0)].rating.mean()


0.5506750891461398