# Setup Django

In [1]:
import os, sys
# PWD = os.getenv('PWD')
# os.chdir(PWD)
# sys.path.insert(0, os.getenv('PWD'))
# os.environ.setdefault("DJANGO_SETTINGS_MODULE", "local_settings.py")
# import django
# django.setup()

In [2]:
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Preprocessing

In [3]:
from apps.kstorage.models import User, Project
import pandas as pd
import numpy as np
import joblib

In [4]:
# Create user dataframe
u_df = pd.DataFrame(list(User.objects.all().values()), index=User.objects.values_list('id', flat=True))
u_df.head()

Unnamed: 0,id,email,role,faculty_id,expertises,skills,year
4,4,eit@gmail.com,student,3,"[[2838, 2859, 2863], [2673, -1, -1]]",[React],1
14,14,user1@gmail.com,student,3,"[[2585, -1, -1], [2469, -1, -1]]","[dsd, sdf, dsers]",3
13,13,ben@gmail.com,student,1,"[[2763, -1, -1]]","[Chinese, Excel]",1
5,5,earn@gmail.com,student,2,"[[2763, 2813, -1]]","[AstronomicalScience, English]",1


In [5]:
# Create project dataframe
p_df = pd.DataFrame(list(Project.objects.all().values()), index=Project.objects.values_list('id', flat=True))
p_df.head()

Unnamed: 0,id,title,project_status,categories,tags,created_at,updated_at
64,64,Web project KMITL,,"[[2422, -1, -1], [2422, 2423, 2425]]","[kmitl, web, project]",2020-03-22 10:19:12.782000+00:00,2020-03-22 10:19:12.782000+00:00
65,65,This is the second project,,"[[2520, 2521, 2522], [2520, 2521, 2525]]","[second, hello]",2020-03-22 12:53:32.268000+00:00,2020-03-22 12:53:32.268000+00:00
66,66,Drone for delivery in KMITL,,"[[2585, 2618, 2622], [2585, 2618, 2625]]","[drone, delivery]",2020-03-22 13:43:23.884000+00:00,2020-03-22 13:43:23.884000+00:00
67,67,This is a new project,,"[[2673, 2713, 2722], [2673, 2713, 2725]]",[new],2020-03-28 13:12:35.584000+00:00,2020-03-28 13:12:35.584000+00:00
69,69,Ipad pro XL,,"[[2422, -1, -1]]","[a, d, e]",2020-05-22 10:36:57.361000+00:00,2020-05-22 10:36:57.361000+00:00


# Recommend Project to User based on its contents
Recommend a project P to user U based on U's `fields`, `department`, and `skills` and P's `fields` and `tags`.

## Based on Project's and User's fields
Create an empty matrix of relation between users and projects. `index` is user id. `column` is project id.

### Similarity function
`field_similarity(f1, f2)` is a method that calculate similarity of 2 fields.

Similarity range from [0, 1]. **0** means not similar at all. **1** means exactly the same.

Example input: `field_a = [1,2,3]`, `field_b = [1,2-1]`

`field_a` is `3`. 
- Field `3` is in **group** `2` and **division** `1`.

`field_b` is `2`.
- Field `2` is **group** in **division** `1`.

In [6]:
import itertools
import math
from scipy.spatial.distance import cosine

class FieldSimCalculator:
    def field_similarity(self, f1, f2):
        if f1[0] == f2[0]:
            if f1[1] == f2[1]:
                if f1[2] == f2[2]:
                    return 1
                return 0.7
            return 0.3
        return 0

    def unique_fields(self, field_list_1, field_list_2):
        '''
            Return a list of unique fields
            e.g. [[1,2],[8,9]] + [[8,9],[10,11]] = [[1,2],[8,9],[10,11]]
        '''
        f = field_list_1 + field_list_2
        f.sort()
        return list(f for f,_ in itertools.groupby(f))
    
    def calc_sim_by_fields(self, field_list_1, field_list_2):
        unique = self.unique_fields(field_list_1, field_list_2)
        sim_list_1 = list()
        sim_list_2 = list()
        for field in unique:
            # Similarity vector of list of fields 1
            sim_temp = []
            for f1 in field_list_1:
                sim_temp.append(self.field_similarity(field, f1))
            sim_list_1.append(max(sim_temp))

            # Similarity vector of list of fields 2
            sim_temp = []
            for f2 in field_list_2:
                sim_temp.append(self.field_similarity(field, f2))
            sim_list_2.append(max(sim_temp))

        val_out = 1 - cosine(sim_list_1, sim_list_2)
        return val_out

In [7]:
f_sim_calc = FieldSimCalculator()
# Example of field_similarity usage
print(f_sim_calc.field_similarity([1,2,3], [1,2,3]))
print(f_sim_calc.field_similarity([1,2,3], [1,2,-1]))
print(f_sim_calc.field_similarity([1,2,3], [1,-1,-1]))
print(f_sim_calc.field_similarity([1,2,3], [7,9,10]))

1
0.7
0.3
0


`calc_sim_by_fields(fields_1, fields_2)` is a method that calculate similarity of 2 **list** of fields.

Similarity range from [0, 1]. **0** means not similar at all. **1** means exactly the same. It is calculated using **cosine similarity** of 2 vectors.

Example input: `fields_a = [[1,2,3], [4,5,-1]]` and `fields_b = [[1,-1,-1], [7,-1,-1]]`

`fields_a` contains 2 fields, `3` and `5`. 
- Field `3` is in **group** `2` and **division** `1`. 
- Field `5` is a **group** in **division** `4`.


** Inspired by [this answer on stackoverflow](https://stackoverflow.com/questions/1746501/can-someone-give-an-example-of-cosine-similarity-in-a-very-simple-graphical-wa)

In [8]:
f_sim_calc = FieldSimCalculator()

f_sim_calc.calc_sim_by_fields([[4,5,6], [7,8,9], [1,2,3]], [[1,2,3], [4,5,6], [7,8,9]])

f_sim_calc.calc_sim_by_fields([[4,5,-1], [7,8,9], [1,2,3]], [[1,2,3], [4,5,6], [7,8,9]])

f_sim_calc.calc_sim_by_fields([[7,8,9], [1,2,3]], [[1,2,3], [4,5,6], [7,8,9]])

f_sim_calc.calc_sim_by_fields([[1,2,3]], [[4,5,6], [7,8,9]])

0.0

### Calculate similarity of user and project

In [9]:
from recommender.settings import BASE_DIR
import joblib
import pickle

class UserProjectSimilarity:
    def __init__(self):
        self.users = User.objects.all()
        self.projects = Project.objects.all()
        self.sim_calc = FieldSimCalculator()
        self.user_project_df = self.create_sim_table()

    def create_sim_table(self):
        # Create empty similarity table
        user_ids = User.objects.values_list('id', flat=True)
        project_ids = Project.objects.values_list('id', flat=True)
        user_project_df = pd.DataFrame(index=user_ids, columns=project_ids)
        return user_project_df

    def fill_sim_table(self):
        for user in self.users:
            for project in self.projects:
                u_fields = user.expertises
                p_fields = project.categories
                sim = self.sim_calc.calc_sim_by_fields(u_fields, p_fields)
                self.user_project_df.loc[user.id, project.id] = sim
        return self.user_project_df
    
    def get_sim_table(self):
        return self.user_project_df
    
    def get_picked_sim_table(self):
        return pickle.dumps(self.user_project_df)
    
    def get_users_count(self):
        return len(self.user_project_df)
    
    def get_projects_count(self):
        return len(self.user_project_df.columns)



In [10]:
u_p_sim = UserProjectSimilarity()
u_p_sim.create_sim_table()

Unnamed: 0,64,65,66,67,69,70,72,73
4,,,,,,,,
14,,,,,,,,
13,,,,,,,,
5,,,,,,,,


In [11]:
u_p_sim.get_picked_sim_table()

b'\x80\x03cpandas.core.frame\nDataFrame\nq\x00)\x81q\x01}q\x02(X\x05\x00\x00\x00_dataq\x03cpandas.core.internals.managers\nBlockManager\nq\x04)\x81q\x05(]q\x06(cpandas.core.indexes.base\n_new_Index\nq\x07cpandas.core.indexes.numeric\nInt64Index\nq\x08}q\t(X\x04\x00\x00\x00dataq\ncnumpy.core.multiarray\n_reconstruct\nq\x0bcnumpy\nndarray\nq\x0cK\x00\x85q\rC\x01bq\x0e\x87q\x0fRq\x10(K\x01K\x08\x85q\x11cnumpy\ndtype\nq\x12X\x02\x00\x00\x00i8q\x13K\x00K\x01\x87q\x14Rq\x15(K\x03X\x01\x00\x00\x00<q\x16NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00tq\x17b\x89C@@\x00\x00\x00\x00\x00\x00\x00A\x00\x00\x00\x00\x00\x00\x00B\x00\x00\x00\x00\x00\x00\x00C\x00\x00\x00\x00\x00\x00\x00E\x00\x00\x00\x00\x00\x00\x00F\x00\x00\x00\x00\x00\x00\x00H\x00\x00\x00\x00\x00\x00\x00I\x00\x00\x00\x00\x00\x00\x00q\x18tq\x19bX\x04\x00\x00\x00nameq\x1aNu\x86q\x1bRq\x1ch\x07h\x08}q\x1d(h\nh\x0bh\x0cK\x00\x85q\x1eh\x0e\x87q\x1fRq (K\x01K\x04\x85q!h\x15\x89C \x04\x00\x00\x00\x00\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\r\x

In [12]:
# Retrieve a user, say 13.
sorted_projects = u_p_sim.get_sim_table().loc[[13]].melt().sort_values('value',ascending=False)
sorted_projects

Unnamed: 0,variable,value
0,64,
1,65,
2,66,
3,67,
4,69,
5,70,
6,72,
7,73,


In [13]:
sorted_projects.head(100)['variable'].to_list()

[64, 65, 66, 67, 69, 70, 72, 73]

In [27]:
from apps.ml.models import UserProjectRelation
obj = UserProjectRelation.objects.last()
print(obj)
# df = pickle.loads(obj.data_frame)
# df

None


## Based on Projects that a User interacts with in the past

### How to compare similarity of two projects P and Q
- Fields
- Tags
- Members