# Setup Django

In [1]:
import os, sys
# PWD = os.getenv('PWD')
# os.chdir(PWD)
# sys.path.insert(0, os.getenv('PWD'))
# os.environ.setdefault("DJANGO_SETTINGS_MODULE", "local_settings.py")
# import django
# django.setup()

In [2]:
# Django blocks synchronous ORM calls when it detects a running event loop
# (which a Jupyter kernel has) and raises SynchronousOnlyOperation. Setting this
# variable switches that safety check off so the ORM can be used from cells.
# Meant for interactive notebooks only, not for production code.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Preprocessing

In [3]:
from apps.kstorage.models import User, Project
import pandas as pd
import numpy as np

In [4]:
# Create user dataframe.
# The rows are fetched once and the index is derived from those same rows, so
# index labels cannot drift out of step with the row data. Two independent
# queries without an explicit ordering are not guaranteed to return rows in the
# same order.
user_rows = list(User.objects.values())
u_df = pd.DataFrame(user_rows, index=[row['id'] for row in user_rows])
u_df.head()

Unnamed: 0,id,email,role,faculty_id,fields,skills,joined_projects,starred_projects,viewed_projects,followed_projects,year
4,4,eit@gmail.com,student,3,"[[2838, 2859, 2863], [2673, -1, -1]]",[React],"[65, 64]",[64],[],"[64, 68, 66, 67]",1
14,14,user1@gmail.com,student,3,"[[2585, -1, -1], [2469, -1, -1]]","[dsd, sdf, dsers]","[73, 72, 70, 69]",[69],"[69, 65, 66]",[69],3
13,13,ben@gmail.com,student,1,"[[2763, -1, -1]]","[Chinese, Excel]",[],"[67, 66, 65, 64]",[],"[65, 64, 68, 67, 66]",1
5,5,earn@gmail.com,student,2,"[[2763, 2813, -1]]","[AstronomicalScience, English]","[67, 66]",[66],[],"[68, 66, 67, 64, 65]",1


In [5]:
# Create project dataframe.
# The rows are fetched once and the index is derived from those same rows, so
# index labels cannot drift out of step with the row data. Two independent
# queries without an explicit ordering are not guaranteed to return rows in the
# same order.
project_rows = list(Project.objects.values())
p_df = pd.DataFrame(project_rows, index=[row['id'] for row in project_rows])
p_df.head()

Unnamed: 0,id,title,project_status,fields,tags,created_at,updated_at
64,64,Web project KMITL,,"[[2422, -1, -1], [2422, 2423, 2425]]","[kmitl, web, project]",2020-03-22 10:19:12.782000+00:00,2020-03-22 10:19:12.782000+00:00
65,65,This is the second project,,"[[2520, 2521, 2522], [2520, 2521, 2525]]","[second, hello]",2020-03-22 12:53:32.268000+00:00,2020-03-22 12:53:32.268000+00:00
66,66,Drone for delivery in KMITL,,"[[2585, 2618, 2622], [2585, 2618, 2625]]","[drone, delivery]",2020-03-22 13:43:23.884000+00:00,2020-03-22 13:43:23.884000+00:00
67,67,This is a new project,,"[[2673, 2713, 2722], [2673, 2713, 2725]]",[new],2020-03-28 13:12:35.584000+00:00,2020-03-28 13:12:35.584000+00:00
69,69,Ipad pro XL,,"[[2422, -1, -1]]","[a, d, e]",2020-05-22 10:36:57.361000+00:00,2020-05-22 10:36:57.361000+00:00


# Recommend Project to User based on its contents
Recommend a project P to user U based on U's `fields`, `department`, and `skills` and P's `fields` and `tags`.

## Based on Project's and User's fields
Create an empty matrix of relations between users and projects: the `index` holds user ids and the `columns` hold project ids.

### RelationCalculator Class
`RelationCalculator` is an abstract base class whose only member is the abstract class method `calc_relation`.

`RelationCalcByFields` is a subclass of `RelationCalculator`.

`field_similarity(f1, f2)` is a method that calculates the similarity of two fields.

Similarity ranges over [0, 1]. **0** means not similar at all. **1** means exactly the same.

Example input: `field_a = [1,2,3]`, `field_b = [1,2,-1]`

`field_a` is `3`. 
- Field `3` is in **group** `2` and **division** `1`.

`field_b` is `2`.
- Field `2` is a **group** in **division** `1`.

`calc_sim_by_fields(fields_1, fields_2)` is a method that calculates the similarity of two **lists** of fields.

Similarity ranges over [0, 1]. **0** means not similar at all. **1** means exactly the same. It is calculated using the **cosine similarity** of two vectors.

Example input: `fields_a = [[1,2,3], [4,5,-1]]` and `fields_b = [[1,-1,-1], [7,-1,-1]]`

`fields_a` contains 2 fields, `3` and `5`. 
- Field `3` is in **group** `2` and **division** `1`. 
- Field `5` is a **group** in **division** `4`.


** Inspired by [this answer on stackoverflow](https://stackoverflow.com/questions/1746501/can-someone-give-an-example-of-cosine-similarity-in-a-very-simple-graphical-wa)

In [6]:
import itertools
from scipy.spatial.distance import cosine
from abc import ABC, abstractmethod


class RelationCalculator(ABC):
    '''
        Abstract interface for scoring how related two objects are.

        Subclasses supply the scoring rule by overriding `calc_relation`;
        this class cannot be instantiated on its own.
    '''

    @classmethod
    @abstractmethod
    def calc_relation(cls, obj_a, obj_b):
        '''
            Return the relation score of `obj_a` and `obj_b`.

            The base implementation has no logic and returns None.
        '''


class RelationCalcByFields(RelationCalculator):
    '''
        Score how related two objects are by comparing their `fields`.

        A field is a 3-element sequence compared level by level
        ([division, group, field] in the notebook's terminology; the sample
        data uses -1 for a level that is not set).
    '''

    @classmethod
    def calc_relation(cls, obj_a, obj_b):
        '''
            Return the field-based similarity of two objects.

            Both objects must expose a `fields` attribute holding a list of
            fields. The result is whatever `calc_sim_by_fields` returns.
        '''
        return cls.calc_sim_by_fields(obj_a.fields, obj_b.fields)

    @staticmethod
    def field_similarity(f1, f2):
        '''
            Return the similarity of two single fields.

            1   -> all three levels are equal
            0.7 -> the first two levels are equal, the third differs
            0.3 -> only the first level is equal
            0   -> the first level differs
        '''
        if f1[0] != f2[0]:
            return 0
        if f1[1] != f2[1]:
            return 0.3
        if f1[2] != f2[2]:
            return 0.7
        return 1

    @staticmethod
    def unique_fields(field_list_1, field_list_2):
        '''
            Return a sorted list of the distinct fields of both lists.
            e.g. [[1,2],[8,9]] + [[8,9],[10,11]] = [[1,2],[8,9],[10,11]]

            The inputs are not modified.
        '''
        merged = sorted(field_list_1 + field_list_2)
        # groupby collapses runs of equal neighbours, which after sorting
        # leaves exactly one copy of every field.
        return [field for field, _ in itertools.groupby(merged)]

    @classmethod
    def calc_sim_by_fields(cls, field_list_1, field_list_2):
        '''
            Return the similarity of two lists of fields.

            Each list is turned into a vector with one entry per distinct
            field of both lists; the entry is the best `field_similarity`
            between that field and any field of the list. The result is the
            cosine similarity of the two vectors (1 - cosine distance).

            Returns 0.0 when either list is empty or None: an object without
            fields is treated as related to nothing.
        '''
        # Without this guard max() below raises ValueError on an empty list,
        # which would abort a whole matrix fill because of one object that
        # has no fields.
        if not field_list_1 or not field_list_2:
            return 0.0

        unique = cls.unique_fields(field_list_1, field_list_2)

        # Similarity vector of list of fields 1
        sim_list_1 = [
            max(cls.field_similarity(field, f1) for f1 in field_list_1)
            for field in unique
        ]
        # Similarity vector of list of fields 2
        sim_list_2 = [
            max(cls.field_similarity(field, f2) for f2 in field_list_2)
            for field in unique
        ]

        # scipy's `cosine` is a distance, so subtract it from 1.
        return 1 - cosine(sim_list_1, sim_list_2)


class RelationCalcByInteractions(RelationCalculator):
    '''
        Calculator that currently scores every pair of objects as 0.

        NOTE(review): the name suggests interaction-based scoring is
        planned here - confirm.
    '''

    @classmethod
    def calc_relation(cls, obj_a, obj_b):
        '''
            Return 0 regardless of the two objects passed in.
        '''
        no_relation = 0
        return no_relation


In [7]:
# Example of field_similarity usage: [1,2,3] is compared with fields that agree
# on three, two, one and zero levels, one result per output line.
print(
    *(
        RelationCalcByFields.field_similarity([1, 2, 3], other_field)
        for other_field in ([1, 2, 3], [1, 2, -1], [1, -1, -1], [7, 9, 10])
    ),
    sep='\n',
)

1
0.7
0.3
0


In [8]:
# Example of calc_sim_by_fields usage, one result per output line:
# identical lists, one coarser field, one missing field, nothing in common.
print(
    *(
        RelationCalcByFields.calc_sim_by_fields(fields_a, fields_b)
        for fields_a, fields_b in (
            ([[4, 5, 6], [7, 8, 9], [1, 2, 3]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
            ([[4, 5, -1], [7, 8, 9], [1, 2, 3]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
            ([[7, 8, 9], [1, 2, 3]], [[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
            ([[1, 2, 3]], [[4, 5, 6], [7, 8, 9]]),
        )
    ),
    sep='\n',
)

1.0
0.9742120343839542
0.816496580927726
0.0


In [9]:
# Score the first stored user against the first stored project.
# NOTE(review): presumably `.first()` yields None on an empty table, in which
# case reading `.fields` inside calc_relation would fail - confirm data exists.
RelationCalcByFields.calc_relation(User.objects.first(), Project.objects.first())

0.0

### Calculate similarity of user and project

In [10]:
import pandas as pd
import pickle
from abc import ABC, abstractmethod

from apps.kstorage.models import User, Project

class Relationship(ABC):
    '''
        Base class for relationships.

        Holds a matrix (`relations_df`) with one row per `index` label and
        one column per `columns` label, plus the calculator that subclasses
        use to fill it in `fill_relations`.
    '''

    def __init__(self, index, columns, calculator):
        '''
            Create an empty relations matrix.

            index      -- row labels of the matrix
            columns    -- column labels of the matrix
            calculator -- object used by subclasses to score a
                          (row, column) pair
        '''
        self.relations_df = pd.DataFrame(index=index, columns=columns)
        self.calculator = calculator

    @abstractmethod
    def fill_relations(self):
        '''
            Fill `relations_df`; must be implemented by subclasses.
        '''

    def get_relations(self):
        '''
            Return the relations matrix itself (not a copy).
        '''
        return self.relations_df

    def get_pickled_relations(self):
        '''
            Return the relations matrix serialized with pickle, as bytes.
        '''
        return pickle.dumps(self.relations_df)

    def get_picked_relations(self):
        '''
            Misspelled original name of `get_pickled_relations`, kept so
            existing callers keep working.
        '''
        return self.get_pickled_relations()

    def row_count(self):
        '''
            Return the number of rows of the relations matrix.
        '''
        return len(self.relations_df)

    def col_count(self):
        '''
            Return the number of columns of the relations matrix.
        '''
        return len(self.relations_df.columns)


class UserProjectRelationship(Relationship):
    '''
        Handle relations for users and projects.

        Rows of the matrix are user ids, columns are project ids.
    '''

    def __init__(self, index=None, columns=None,
                 calculator=RelationCalcByFields):
        '''
            index      -- row labels; defaults to the ids of all users
            columns    -- column labels; defaults to the ids of all projects
            calculator -- scores a (user, project) pair through
                          `calc_relation`
        '''
        # Build the default id lists per instance. A QuerySet used directly as
        # a default argument is created only once, when the class is defined,
        # and caches its rows after first evaluation, so later instances
        # would silently reuse a stale list of ids.
        if index is None:
            index = User.objects.values_list('id', flat=True)
        if columns is None:
            columns = Project.objects.values_list('id', flat=True)
        super().__init__(index, columns, calculator)
        self.users = User.objects.all()
        self.projects = Project.objects.all()

    def fill_relations(self):
        '''
            Score every (user, project) pair, store it in the matrix and
            return the matrix.
        '''
        for user in self.users:
            for project in self.projects:
                sim = self.calculator.calc_relation(user, project)
                self.relations_df.loc[user.id, project.id] = sim
        return self.relations_df

In [11]:
# Build the user x project matrix (rows: user ids, columns: project ids) and
# score every pair with the default field-based calculator.
u_p_sim = UserProjectRelationship()
u_p_sim.fill_relations()

Unnamed: 0,64,65,66,67,69,70,72,73
4,0,0,0.0,0.421639,0,0.236102,0.397525,0
14,0,0,0.421639,0.0,0,0.236102,0.0,0
13,0,0,0.0,0.0,0,0.326933,0.0,0
5,0,0,0.0,0.0,0,0.613935,0.0,0


In [12]:
# Rank all projects for a single user (id 13 here), highest score first.
# melt() turns the one-row frame into (variable = project id, value = score).
sorted_projects = (
    u_p_sim.get_relations()
    .loc[[13]]
    .melt()
    .sort_values(by='value', ascending=False)
)
sorted_projects

Unnamed: 0,variable,value
5,70,0.326933
0,64,0.0
1,65,0.0
2,66,0.0
3,67,0.0
4,69,0.0
6,72,0.0
7,73,0.0


In [13]:
# Ids of (at most) the 100 best-ranked projects, kept in ranked order.
sorted_projects.head(100)['variable'].to_list()

[70, 64, 65, 66, 67, 69, 72, 73]

## Based on Projects that a User interacts with in the past

### How to compare similarity of two projects P and Q
- Fields
- Tags
- Members

### Comparing Project fields

In [14]:
class ProjectRelationship(Relationship):
    '''
        Handle pairwise relations between projects.

        Both the rows and the columns of the matrix are project ids.
    '''

    def __init__(self, index=None, columns=None,
                 calculator=RelationCalcByFields):
        '''
            index      -- row labels; defaults to the ids of all projects
            columns    -- column labels; defaults to the ids of all projects
            calculator -- scores a (project, project) pair through
                          `calc_relation`
        '''
        # Build the default id lists per instance. A QuerySet used directly as
        # a default argument is created only once, when the class is defined,
        # and caches its rows after first evaluation, so later instances
        # would silently reuse a stale list of ids.
        if index is None:
            index = Project.objects.values_list('id', flat=True)
        if columns is None:
            columns = Project.objects.values_list('id', flat=True)
        super().__init__(index, columns, calculator)
        self.projects = Project.objects.all()
        # Not used by this class itself; kept because it is a public attribute.
        self.sim_calc = RelationCalcByFields()

    def fill_relations(self):
        '''
            Score every ordered pair of projects, store it in the matrix and
            return the matrix.
        '''
        for project_row in self.projects:
            for project_col in self.projects:
                sim = self.calculator.calc_relation(project_row, project_col)
                self.relations_df.loc[project_row.id, project_col.id] = sim
        return self.relations_df


In [15]:
# Build the project x project matrix (project ids on both axes) and score
# every pair with the default field-based calculator.
p_p_sim = ProjectRelationship()
p_p_sim.fill_relations()

Unnamed: 0,64,65,66,67,69,70,72,73
64,1.0,0,0.0,0,0.880471,0.0,0.0,0
65,0.0,1,0.0,0,0.0,0.0,0.0,0
66,0.0,0,1.0,0,0.0,0.349105,0.0,0
67,0.0,0,0.0,1,0.0,0.0,0.0,0
69,0.880471,0,0.0,0,1.0,0.0,0.0,0
70,0.0,0,0.349105,0,0.0,1.0,0.57735,0
72,0.0,0,0.0,0,0.0,0.57735,1.0,0
73,0.0,0,0.0,0,0.0,0.0,0.0,1


### Compare project with past user interactions

In [16]:
# Projects joined by the user in the first row of u_df (positional row 0,
# not the user with id 0).
joined_projects = u_df.iloc[0]['joined_projects']
joined_projects

[65, 64]

In [17]:
# Print every project row in full, each as its own Series.
# NOTE(review): this dumps the whole frame into the output; p_df.head() may be
# enough for inspection.
for i in range(len(p_df)):
    print(p_df.iloc[i])

id                                                  64
title                                Web project KMITL
project_status                                    None
fields            [[2422, -1, -1], [2422, 2423, 2425]]
tags                             [kmitl, web, project]
created_at            2020-03-22 10:19:12.782000+00:00
updated_at            2020-03-22 10:19:12.782000+00:00
Name: 64, dtype: object
id                                                      65
title                           This is the second project
project_status                                        None
fields            [[2520, 2521, 2522], [2520, 2521, 2525]]
tags                                       [second, hello]
created_at                2020-03-22 12:53:32.268000+00:00
updated_at                2020-03-22 12:53:32.268000+00:00
Name: 65, dtype: object
id                                                      66
title                          Drone for delivery in KMITL
project_status                      