# Setup Django

In [1]:
import os, sys
# PWD = os.getenv('PWD')
# os.chdir(PWD)
# sys.path.insert(0, os.getenv('PWD'))
# os.environ.setdefault("DJANGO_SETTINGS_MODULE", "local_settings.py")
# import django
# django.setup()

In [2]:
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Preprocessing

In [3]:
from apps.kstorage.models import User, Project
import pandas as pd
import numpy as np

In [4]:
# Create user dataframe
u_df = pd.DataFrame(list(User.objects.all().values()), index=User.objects.values_list('id', flat=True))
u_df.head()

Unnamed: 0,id,email,role,faculty_id,fields,skills,joined_projects,starred_projects,viewed_projects,followed_projects,year
4,4,eit@gmail.com,student,3,"[[2838, 2859, 2863], [2673, -1, -1]]",[React],"[65, 64]",[64],[],"[64, 68, 66, 67]",1
14,14,user1@gmail.com,student,3,"[[2585, -1, -1], [2469, -1, -1]]","[dsd, sdf, dsers]","[73, 72, 70, 69]",[69],"[69, 65, 66]",[69],3
13,13,ben@gmail.com,student,1,"[[2763, -1, -1]]","[Chinese, Excel]",[],"[67, 66, 65, 64]",[],"[65, 64, 68, 67, 66]",1
5,5,earn@gmail.com,student,2,"[[2763, 2813, -1]]","[AstronomicalScience, English]","[67, 66]",[66],[],"[68, 66, 67, 64, 65]",1


In [5]:
# Create project dataframe
p_df = pd.DataFrame(list(Project.objects.all().values()), index=Project.objects.values_list('id', flat=True))
p_df.head()

Unnamed: 0,id,title,project_status,fields,tags,created_at,updated_at
64,64,Web project KMITL,,"[[2422, -1, -1], [2422, 2423, 2425]]","[kmitl, web, project]",2020-03-22 10:19:12.782000+00:00,2020-03-22 10:19:12.782000+00:00
65,65,This is the second project,,"[[2520, 2521, 2522], [2520, 2521, 2525]]","[second, hello]",2020-03-22 12:53:32.268000+00:00,2020-03-22 12:53:32.268000+00:00
66,66,Drone for delivery in KMITL,,"[[2585, 2618, 2622], [2585, 2618, 2625]]","[drone, delivery]",2020-03-22 13:43:23.884000+00:00,2020-03-22 13:43:23.884000+00:00
67,67,This is a new project,,"[[2673, 2713, 2722], [2673, 2713, 2725]]",[new],2020-03-28 13:12:35.584000+00:00,2020-03-28 13:12:35.584000+00:00
69,69,Ipad pro XL,,"[[2422, -1, -1]]","[a, d, e]",2020-05-22 10:36:57.361000+00:00,2020-05-22 10:36:57.361000+00:00


# Recommend Project to User based on its contents
Recommend a project P to user U based on U's `fields`, `department`, and `skills` and P's `fields` and `tags`.

## Based on Project's and User's fields
Create an empty matrix of relation between users and projects. `index` is user id. `column` is project id.

### RelationCalculator Class
`RelationCalculator` is a base abstract class where only abstract class method `calc_relation` available.

`RelationCalcByFields` is a subclass of `RelationCalculator`.

`field_similarity(f1, f2)` is a method that calculate similarity of 2 fields.

Similarity range from [0, 1]. **0** means not similar at all. **1** means exactly the same.

Example input: `field_a = [1,2,3]`, `field_b = [1,2-1]`

`field_a` is `3`. 
- Field `3` is in **group** `2` and **division** `1`.

`field_b` is `2`.
- Field `2` is **group** in **division** `1`.

`calc_sim_by_fields(fields_1, fields_2)` is a method that calculate similarity of 2 **list** of fields.

Similarity range from [0, 1]. **0** means not similar at all. **1** means exactly the same. It is calculated using **cosine similarity** of 2 vectors.

Example input: `fields_a = [[1,2,3], [4,5,-1]]` and `fields_b = [[1,-1,-1], [7,-1,-1]]`

`fields_a` contains 2 fields, `3` and `5`. 
- Field `3` is in **group** `2` and **division** `1`. 
- Field `5` is a **group** in **division** `4`.


** Inspired by [this answer on stackoverflow](https://stackoverflow.com/questions/1746501/can-someone-give-an-example-of-cosine-similarity-in-a-very-simple-graphical-wa)

In [6]:
from research.project_recommender.relation_calculator import RelationCalcByFields

relation_fields = RelationCalcByFields()
# Example of field_similarity usage
print(relation_fields.field_similarity([1,2,3], [1,2,3]))
print(relation_fields.field_similarity([1,2,3], [1,2,-1]))
print(relation_fields.field_similarity([1,2,3], [1,-1,-1]))
print(relation_fields.field_similarity([1,2,3], [7,9,10]))

1
0.7
0.3
0


In [7]:
print(relation_fields.calc_sim_by_fields([[4,5,6], [7,8,9], [1,2,3]], [[1,2,3], [4,5,6], [7,8,9]]))

print(relation_fields.calc_sim_by_fields([[4,5,-1], [7,8,9], [1,2,3]], [[1,2,3], [4,5,6], [7,8,9]]))

print(relation_fields.calc_sim_by_fields([[7,8,9], [1,2,3]], [[1,2,3], [4,5,6], [7,8,9]]))

print(relation_fields.calc_sim_by_fields([[1,2,3]], [[4,5,6], [7,8,9]]))

1.0
0.9742120343839542
0.816496580927726
0.0


In [8]:
relation_fields.calc_relation(User.objects.first(), Project.objects.first())

0.0

### Calculate similarity of user and project

In [9]:
from research.project_recommender.relationship import UserProjectRelationship

In [10]:
u_p_sim = UserProjectRelationship()
u_p_sim.fill_relations()

Unnamed: 0,64,65,66,67,69,70,72,73
4,0,0,0.0,0.421639,0,0.236102,0.397525,0
14,0,0,0.421639,0.0,0,0.236102,0.0,0
13,0,0,0.0,0.0,0,0.326933,0.0,0
5,0,0,0.0,0.0,0,0.613935,0.0,0


In [11]:
# Retrieve a user, say 13.
sorted_projects = u_p_sim.get_relations().loc[[13]].melt().sort_values('value',ascending=False)
sorted_projects

Unnamed: 0,variable,value
5,70,0.326933
0,64,0.0
1,65,0.0
2,66,0.0
3,67,0.0
4,69,0.0
6,72,0.0
7,73,0.0


In [12]:
sorted_projects.head(100)['variable'].to_list()

[70, 64, 65, 66, 67, 69, 72, 73]

## Based on Projects that a User interacts with in the past

### How to compare similarity of two projects P and Q
- Fields
- Tags
- Members

### Comparing Project fields

In [13]:
from research.project_recommender.relationship import ProjectRelationship

In [14]:
p_p_sim = ProjectRelationship()
p_p_relation = p_p_sim.fill_relations()
p_p_relation

Unnamed: 0,64,65,66,67,69,70,72,73
64,1.0,0,0.0,0,0.880471,0.0,0.0,0
65,0.0,1,0.0,0,0.0,0.0,0.0,0
66,0.0,0,1.0,0,0.0,0.349105,0.0,0
67,0.0,0,0.0,1,0.0,0.0,0.0,0
69,0.880471,0,0.0,0,1.0,0.0,0.0,0
70,0.0,0,0.349105,0,0.0,1.0,0.57735,0
72,0.0,0,0.0,0,0.0,0.57735,1.0,0
73,0.0,0,0.0,0,0.0,0.0,0.0,1


### Compare project with past user interactions

In [15]:
weight = 10
n = 10
for i in range(n):
    print(weight - i * (weight / n))

10.0
9.0
8.0
7.0
6.0
5.0
4.0
3.0
2.0
1.0


In [29]:
from apps.ml.models import Relation
from research.project_recommender.relation_calculator import RelationCalculator
from apps.kstorage.models import User, Project
import pickle
import numpy as np

class RelationCalcByInteractions(RelationCalculator):
    MEMBER_WEIGHT = 4
    STAR_WEIGHT = 3
    FOLLOW_WEIGHT = 3
    VIEW_WEIGHT = 2
    
    MAX_MEMBERS = 3
    MAX_STARS = 8
    MAX_FOLLOWS = 8
    MAX_VIEWS = 16
    
    def __init__(self):
        self.project_df = pickle.loads(Relation.objects.filter(row_type=Project.__name__, col_type=Project.__name__)\
                                  .last().data_frame)

    def calc_relation(self, user: User, project: Project):
        try:
            if self.__check_conditions(user, project) == False:
                return 0
            
            sim_list = []
            print("member")
            m_sim = self.calc_by_weighted_values(project, user.joined_projects, self.MEMBER_WEIGHT, self.MAX_MEMBERS)
            sim_list.append(m_sim)
            print("\nstar")
            s_sim = self.calc_by_weighted_values(project, user.starred_projects, self.STAR_WEIGHT, self.MAX_STARS)
            sim_list.append(s_sim)
            print("\nfollow")
            f_sim = self.calc_by_weighted_values(project, user.followed_projects, self.FOLLOW_WEIGHT, self.MAX_FOLLOWS)
            sim_list.append(f_sim)
            print("\nview")
            v_sim = self.calc_by_weighted_values(project, user.viewed_projects, self.VIEW_WEIGHT, self.MAX_VIEWS)
            sim_list.append(v_sim)
            
            return np.average(sim_list)
            
        except Exception as e:
            print("Exception while calculating relation,", str(e))
            return 0
        
    def __check_conditions(self, user, project):
        if project.id in user.joined_projects:
            return False
        return True
    
    def calc_by_weighted_values(self, target_project, comparing_project_ids, max_weight, max_n_projects):
        if len(comparing_project_ids) <= 0:
            return 0
        count = 0
        sim_list = []
        weight_list = []
        current_weight = max_weight
        
        for comparing_id in comparing_project_ids:
            if not comparing_id in self.project_df:
                continue
            
            row_id = target_project.id
            similarity = self.project_df.loc[row_id, comparing_id]
            
            sim_list.append(similarity)
            weight_list.append(current_weight)
            
            count += 1
            current_weight = self.__normalize_weight(max_weight, max_n_projects, count)
            
        weighted_avg = np.average(sim_list, weights=weight_list)
        
        print("sim_list", sim_list)
        print("weight_list", weight_list)
        print("weighted_avg", weighted_avg)
        
        return weighted_avg
        
    def __normalize_weight(self, max_weight, max_n_projects, count):
        return max_weight - (count * (max_weight / max_n_projects))


In [30]:
interaction_calc = RelationCalcByInteractions()
interaction_calc.calc_relation(User.objects.get(pk=14), Project.objects.first())

member
sim_list [0.0, 0.0, 0.0, 0.8804710999221753]
weight_list [4, 2.666666666666667, 1.3333333333333335, 0.0]
weighted_avg 0.0

star
sim_list [0.8804710999221753]
weight_list [3]
weighted_avg 0.8804710999221753

follow
sim_list [0.8804710999221753]
weight_list [3]
weighted_avg 0.8804710999221753

view
sim_list [0.8804710999221753, 0.0, 0.0]
weight_list [2, 1.875, 1.75]
weighted_avg 0.31305639108344013


0.5184996477319477

In [34]:
interaction_calc = RelationCalcByInteractions()
from research.project_recommender.relationship import UserProjectRelationship

u_p_sim = UserProjectRelationship(calculator = RelationCalcByInteractions)
u_p_relation = u_p_sim.fill_relations()
u_p_relation

# for u in User.objects.all():
    
#     for p in Project.objects.all():
#         interaction_calc.calc_relation(u, p)
#         print("----")



member
sim_list [0.0, 0.0]
weight_list [4, 2.666666666666667]
weighted_avg 0.0

star
sim_list [0.0]
weight_list [3]
weighted_avg 0.0

follow
sim_list [0.0, 1.0, 0.0]
weight_list [3, 2.625, 2.25]
weighted_avg 0.3333333333333333

view
member
sim_list [0.0, 0.0]
weight_list [4, 2.666666666666667]
weighted_avg 0.0

star
sim_list [0.0]
weight_list [3]
weighted_avg 0.0

follow
sim_list [0.0, 0.0, 1.0]
weight_list [3, 2.625, 2.25]
weighted_avg 0.2857142857142857

view
member
sim_list [0.0, 0.8804710999221753]
weight_list [4, 2.666666666666667]
weighted_avg 0.35218843996887017

star
sim_list [0.8804710999221753]
weight_list [3]
weighted_avg 0.8804710999221753

follow
sim_list [0.8804710999221753, 0.0, 0.0]
weight_list [3, 2.625, 2.25]
weighted_avg 0.3354175618751144

view
member
sim_list [0.0, 0.0]
weight_list [4, 2.666666666666667]
weighted_avg 0.0

star
sim_list [0.0]
weight_list [3]
weighted_avg 0.0

follow
sim_list [0.0, 0.3491047889269221, 0.0]
weight_list [3, 2.625, 2.25]
weighted_avg 0.

Unnamed: 0,64,65,66,67,69,70,72,73
4,0.0,0.0,0.0833333,0.0714286,0.392019,0.0290921,0,0
14,0.5185,0.0833333,0.0923238,0.0,0.0,0.0,0,0
13,0.115385,0.134615,0.115385,0.134615,0.101593,0.0402813,0,0
5,0.0576923,0.0480769,0.0,0.0,0.0507964,0.149041,0,0
