#   Comparison of a base KNN to a KNN w/ratings

In [254]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.neighbors import NearestNeighbors
import ast
import requests

In [255]:
response = requests.post('http://underdog-devs-ds-a-dev.us-east-1.elasticbeanstalk.com/Mentors/read')
mentors = pd.DataFrame(response.json()['result'])

In [256]:
response = requests.post('http://underdog-devs-ds-a-dev.us-east-1.elasticbeanstalk.com/Mentees/read')
mentees = pd.DataFrame(response.json()['result'])

In [257]:
mentees

Unnamed: 0,profile_id,first_name,last_name,email,city,state,country,formerly_incarcerated,underrepresented_group,low_income,list_convictions,subject,experience_level,job_help,industry_knowledge,pair_programming,other_info,validateStatus
0,z781i5e3EhH82A7u,John,Evans,fake@email.com,Ashland,Oregon,USA,True,True,True,"[Infraction, Felony]","Web: HTML, CSS, JavaScript",Beginner,False,False,True,Notes,
1,j57C2e118XEN6G6u,Tyler,Rogers,fake@email.com,Ashland,Oregon,USA,True,True,True,"[Misdemeanor, Infraction]","Web: HTML, CSS, JavaScript",Intermediate,False,False,True,Notes,
2,Z7352L2zbAA350Ho,Dayana,Reed,fake@email.com,Ashland,Oregon,USA,True,True,True,"[Infraction, Felony]",Data Science: Python,Intermediate,False,False,True,Notes,
3,O1q51b0fl848dYa6,Westin,Foster,fake@email.com,Ashland,Oregon,USA,False,False,True,[],Data Science: Python,Beginner,False,True,True,Notes,
4,eDBrJ3D565016ei4,Orlando,Mendoza,fake@email.com,Ashland,Oregon,USA,True,True,False,"[Infraction, Misdemeanor, Felony]",iOS: Swift,Expert,False,False,False,Notes,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,64TB2r3s33ylO02B,Graham,Howard,fake@email.com,Ashland,Oregon,USA,True,False,True,"[Misdemeanor, Infraction, Felony]",General Programming,Intermediate,False,False,True,Notes,
100,,,,,,,,,,,,,,,,,,
101,d1fh0aoh,joel,bob,test9f9@gmail.com,"California, USA","California, USA",usa,False,False,True,"robbery, dance battle, etc...",backend,Expert,,,,,pending
102,j6xbsk9w,test2,test2,testy2@test.com,test city,California,USA,False,False,True,,career,expert,True,,,,pending


One hot encoding dataset generated by local_utils

In [258]:
def ohe(mentors, mentees):
    
    mentors = mentors.drop(columns=['profile_id', 'first_name', 'last_name', 'email', 'city', 'state', 'validateStatus', 'country'])

    test = pd.get_dummies(mentors, columns=['current_comp', 'experience_level', 'job_help', 'industry_knowledge', 'pair_programming', 'other_info', 'formerly_incarcerated', 'list_convictions', 'underrepresented_group', 'low_income'])

    test = test.fillna(0)
    
    mentors = pd.merge(mentors, test)

    mentors = mentors.join(mentors['subject'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentors = mentors.drop(columns=['subject', 'current_comp', 'experience_level', 'job_help', 'industry_knowledge', 'pair_programming', 'other_info', 'formerly_incarcerated', 'list_convictions', 'underrepresented_group', 'low_income'])

    mentees = mentees.drop(columns=['profile_id', 'first_name', 'last_name', 'email', 'city', 'state', 'validateStatus', 'country', 'list_convictions'])

    test = pd.get_dummies(mentees, columns=['experience_level', 'job_help', 'industry_knowledge', 'pair_programming', 'other_info', 'formerly_incarcerated','underrepresented_group', 'low_income'])

    test = test.fillna(0)
    
    mentees = pd.merge(mentees, test)

    mentees = mentees.join(mentees['subject'].apply(pd.Series).stack().str.get_dummies().groupby(level=0).sum())
    mentees = mentees.drop(columns=['subject', 'experience_level', 'job_help', 'industry_knowledge', 'pair_programming', 'other_info', 'formerly_incarcerated', 'underrepresented_group', 'low_income'])

    return mentors, mentees
    
    


In [259]:
#the driving force of selection is mentee to mentor <---- mentee like system
mentors, mentees = ohe(mentors, mentees)

# Generate a fake review matrix

In [260]:

reviews = sp.sparse.random(mentees.shape[0], mentors.shape[0], density=0.1, random_state=42)
reviews = pd.DataFrame(reviews.todense(), index=mentees.index, columns=mentors.index)
reviews = reviews.melt(ignore_index=False ,value_name='rating')

In [261]:
mentors.columns

Index(['current_comp_Amazin', 'current_comp_Amozonian', 'current_comp_Boogle',
       'current_comp_Lahoo', 'current_comp_Macrohard',
       'current_comp_Pineapple', 'current_comp_Poptrist',
       'current_comp_Toyota', 'current_comp_Unemployed',
       'experience_level_Advanced', 'experience_level_Beginner',
       'experience_level_Expert', 'experience_level_Intermediate',
       'experience_level_beginner', 'experience_level_expert',
       'job_help_False', 'job_help_True', 'job_help_true',
       'industry_knowledge_False', 'industry_knowledge_True',
       'industry_knowledge_true', 'pair_programming_False',
       'pair_programming_True', 'pair_programming_false', 'other_info_',
       'other_info_New Job', 'other_info_Notes', 'other_info_none',
       'formerly_incarcerated_False',
       'list_convictions_robbery, dance battle, etc...',
       'underrepresented_group_False', 'low_income_True', 'Android: Java',
       'Career Development', 'Data Science: Python', 'General Pr

In [262]:
#merge the reviews on profile id
combined = reviews.merge(mentees, left_index=True, right_index=True, how = 'left')

In [263]:
#KNN model
def create_model(df, neighbors_number):
    model = NearestNeighbors(n_neighbors=min(
        neighbors_number+1, df.shape[0]), metric='cosine')
    model.fit(df)
    return model

# Create a KNN with ratings and one without

In [264]:
#Create model
#knn_collab = create_model(combined, neighbors_number=10)
knn = create_model(mentors, neighbors_number=10)


#### Perhaps a better approach would be to filter the data further and then apply KNN based on rating so that the weights are focused more on ratings rather than looking at the distance of other features?

random mentee#6967999171653,Raiden,Jones,"['Career Preparation', 'Technical']","['PHP', 'React', 'C++']",Advanced,Mountain

In [265]:
combined.shape

(190630, 28)

In [266]:
#knn_collab_test = np.array([1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
#                1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

knn_test = np.array([0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0,
                1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [267]:
#run a test on knn
#note that this returns MENTEES similar to the current mentee

#dist_collab, indices_collab = knn_collab.kneighbors([knn_collab_test], 5)
#output indices of mentees that are similar to the current mentee
dist_knn, indices_knn = knn.kneighbors([knn_test], 5)

In [268]:
collab_mentor_list = []
for index in indices_collab:
    collab_mentor_list.append(mentors.loc[combined.iloc[index]['mentor_id']])

knn_list = []
for index in indices_knn:
    knn_list.append(mentors.loc[combined.iloc[index]['mentor_id']])

KeyError: 'mentor_id'

In [None]:
collab_df = pd.DataFrame(collab_mentor_list[0])
collab_df

Unnamed: 0_level_0,Central,Eastern,Mountain,Pacific,Advanced,Beginner,Expert,Intermediate,Career Preparation,Technical,Android,Angular,Axios,C#,C++,CSS,Django,HTML,IOS,Java,JavaScript,Microsoft Azure,MongoDB,Node.JS,PHP,PostgreSQL,Python,React,Ruby,SQL
mentor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
8284601696244,1,0,0,0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0
15095524536358,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0
25031268776199,1,0,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
51254476683696,1,0,0,0,1,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
66361726368093,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
knn_df = pd.DataFrame(knn_list[0])
knn_df

Unnamed: 0_level_0,Central,Eastern,Mountain,Pacific,Advanced,Beginner,Expert,Intermediate,Career Preparation,Technical,Android,Angular,Axios,C#,C++,CSS,Django,HTML,IOS,Java,JavaScript,Microsoft Azure,MongoDB,Node.JS,PHP,PostgreSQL,Python,React,Ruby,SQL
mentor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
63121684978234,0,1,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1
45253072381561,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
63230246147329,0,0,0,1,0,0,1,0,1,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
42936362672176,0,0,0,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
8732351419446,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0


#### We see that there is some value in using a rating based KNN as the mentor with a higher rating basis would've scored better with the given review matrix.  

In [None]:
# KNN model
combined[(combined['mentor_id'] == '63121684978234') & (combined['rating'] > 0)].rating.mean()

0.4541850826707502

In [None]:
# KNN model with rating
combined[(combined['mentor_id'] == '8284601696244') & (combined['rating'] > 0)].rating.mean()


0.5506750891461398