#   Comparison of a base KNN to a KNN w/ratings

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.neighbors import NearestNeighbors
import ast
import requests

In [17]:
response = requests.post('http://underdog-devs-ds-a-dev.us-east-1.elasticbeanstalk.com/Mentors/read')
mentors = pd.DataFrame(response.json()['result'])

In [18]:
response = requests.post('http://underdog-devs-ds-a-dev.us-east-1.elasticbeanstalk.com/Mentees/read')
mentees = pd.DataFrame(response.json()['result'])

In [19]:
mentors

Unnamed: 0,profile_id,email,city,state,country,first_name,last_name,current_comp,subject,experience_level,job_help,industry_knowledge,pair_programming,other_info,validateStatus,formerly_incarcerated,list_convictions,underrepresented_group,low_income
0,vDw14633y666xVTo,fake@email.com,Ashland,Oregon,USA,Aspyn,Thompson,Boogle,Android: Java,Advanced,False,True,True,Notes,,,,,
1,H8L6d7kG2E1J8f58,fake@email.com,Ashland,Oregon,USA,Cole,Young,Pineapple,"Web: HTML, CSS, JavaScript",Expert,False,True,True,Notes,,,,,
2,37T2KouYl5477V1o,fake@email.com,Ashland,Oregon,USA,Caden,Perez,Poptrist,iOS: Swift,Intermediate,False,True,True,Notes,,,,,
3,224Ks8ALel5OW253,fake@email.com,Ashland,Oregon,USA,Alaia,Rogers,Pineapple,"Web: HTML, CSS, JavaScript",Advanced,False,True,True,Notes,,,,,
4,t82ZE41q1fHbx086,fake@email.com,Ashland,Oregon,USA,Christopher,Mendoza,Pineapple,Career Development,Advanced,True,True,True,Notes,,,,,
5,UOMnnoy5005162U5,fake@email.com,Ashland,Oregon,USA,Madalynn,Nguyen,Amozonian,Android: Java,Intermediate,False,True,True,Notes,,,,,
6,k3U0su6268AGiS74,fake@email.com,Ashland,Oregon,USA,Elaine,Nguyen,Amozonian,General Programming,Advanced,False,True,True,Notes,,,,,
7,6gc4Q834RO1h8VT2,fake@email.com,Ashland,Oregon,USA,Noemi,Myers,Boogle,Android: Java,Expert,False,True,True,Notes,,,,,
8,cC8iu2X3681BI7Z6,fake@email.com,Ashland,Oregon,USA,Penelope,Hall,Amozonian,iOS: Swift,Beginner,False,True,True,Notes,,,,,
9,I8263IK0QV74Foh1,fake@email.com,Ashland,Oregon,USA,Brady,Williams,Amozonian,Career Development,Beginner,True,True,True,Notes,,,,,


One hot encoding dataset generated by local_utils

In [7]:
def ohe(mentors, mentees):
    mentees.drop(columns=['first_name', 'last_name'], inplace=True)
    mentees['profile_id'] = mentees['profile_id'].apply(lambda str: str.split('#')[1])    
    mentees.set_index('profile_id', inplace=True)
    
    mentees = mentees.join(mentees['time_zone'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentees.drop(columns='time_zone', inplace=True)

    mentees = mentees.join(mentees['skill_rank'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentees.drop(columns='skill_rank', inplace=True)

    mentees['tech_or_career'] = mentees['tech_or_career'].apply(ast.literal_eval)
    mentees = mentees.join(mentees['tech_or_career'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentees.drop(columns='tech_or_career', inplace=True)


    mentees['skills'] = mentees['skills'].apply(ast.literal_eval)
    mentees = mentees.join(mentees['skills'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentees.drop(columns='skills', inplace=True)

    mentors.drop(columns=['first_name', 'last_name'], inplace=True)
    mentors['profile_id'] = mentors['profile_id'].apply(lambda str: str.split('#')[1])    
    mentors.set_index('profile_id', inplace=True)
    mentors.index.rename('mentor_id', inplace=True)
    
    mentors = mentors.join(mentors['time_zone'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentors.drop(columns='time_zone', inplace=True)

    mentors['skill_rank'] = mentors['skill_rank'].apply(ast.literal_eval)
    mentors = mentors.join(mentors['skill_rank'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentors.drop(columns='skill_rank', inplace=True)

    mentors['tech_or_career'] = mentors['tech_or_career'].apply(ast.literal_eval)
    mentors = mentors.join(mentors['tech_or_career'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentors.drop(columns='tech_or_career', inplace=True)


    mentors['skills'] = mentors['skills'].apply(ast.literal_eval)
    mentors = mentors.join(mentors['skills'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentors.drop(columns='skills', inplace=True)
    return mentors, mentees

In [8]:
#the driving force of selection is mentee to mentor <---- mentee like system
mentors, mentees = ohe(mentors, mentees)

# Generate a fake review matrix

In [22]:

reviews = sp.sparse.random(mentees.shape[0], mentors.shape[0], density=0.1, random_state=42)
reviews = pd.DataFrame(reviews.todense(), index=mentees.index, columns=mentors.index)
reviews = reviews.melt(ignore_index=False ,value_name='rating')

In [23]:
#merge the reviews on profile id
combined = reviews.merge(mentees, left_index=True, right_index=True, how = 'left')

In [24]:
#KNN model
def create_model(df, neighbors_number):
    model = NearestNeighbors(n_neighbors=min(
        neighbors_number+1, df.shape[0]), metric='cosine')
    model.fit(df)
    return model

# Create a KNN with ratings and one without

In [14]:
#Create model
knn_collab = create_model(combined.drop(columns=['mentor_id']), neighbors_number=10)
knn = create_model(mentors, neighbors_number=10)


#### Perhaps a better approach would be to filter the data further and then apply KNN based on rating so that the weights are focused more on ratings rather than looking at the distance of other features?

random mentee#6967999171653,Raiden,Jones,"['Career Preparation', 'Technical']","['PHP', 'React', 'C++']",Advanced,Mountain

In [15]:
knn_collab_test = np.array([1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
                1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0])

knn_test = np.array([0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
                1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0])

In [16]:
#run a test on knn
#note that this returns MENTEES similar to the current mentee

dist_collab, indices_collab = knn_collab.kneighbors([knn_collab_test], 5)
#output indices of mentees that are similar to the current mentee
dist_knn, indices_knn = knn.kneighbors([knn_test], 5)



In [17]:
collab_mentor_list = []
for index in indices_collab:
    collab_mentor_list.append(mentors.loc[combined.iloc[index]['mentor_id']])

knn_list = []
for index in indices_knn:
    knn_list.append(mentors.loc[combined.iloc[index]['mentor_id']])

In [19]:
collab_df = pd.DataFrame(collab_mentor_list[0])
collab_df

Unnamed: 0_level_0,Central,Eastern,Mountain,Pacific,Advanced,Beginner,Expert,Intermediate,Career Preparation,Technical,Android,Angular,Axios,C#,C++,CSS,Django,HTML,IOS,Java,JavaScript,Microsoft Azure,MongoDB,Node.JS,PHP,PostgreSQL,Python,React,Ruby,SQL
mentor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
8284601696244,1,0,0,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0
15095524536358,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
25031268776199,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
51254476683696,1,0,0,0,1,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
66361726368093,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
knn_df = pd.DataFrame(knn_list[0])
knn_df

Unnamed: 0_level_0,Central,Eastern,Mountain,Pacific,Advanced,Beginner,Expert,Intermediate,Career Preparation,Technical,Android,Angular,Axios,C#,C++,CSS,Django,HTML,IOS,Java,JavaScript,Microsoft Azure,MongoDB,Node.JS,PHP,PostgreSQL,Python,React,Ruby,SQL
mentor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
63121684978234,0,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
45253072381561,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
63230246147329,0,0,0,1,0,0,1,0,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
42936362672176,0,0,0,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
8732351419446,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


#### We see that there is some value in using a rating based KNN as the mentor with a higher rating basis would've scored better with the given review matrix.  

In [26]:
# KNN model
combined[(combined['mentor_id'] == '63121684978234') & (combined['rating'] > 0)].rating.mean()

0.4541850826707502

In [25]:
# KNN model with rating
combined[(combined['mentor_id'] == '8284601696244') & (combined['rating'] > 0)].rating.mean()


0.5506750891461398