#### Spotlight

The Quickstart Example

In [1]:
import numpy as np

from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import mrr_score
from spotlight.factorization.implicit import ImplicitFactorizationModel

In [2]:
# Load the MovieLens 100K interactions and hold out a test split.
# A fixed RandomState makes the shuffle, and therefore the train/test
# membership, identical on every run of this cell.
dataset = get_movielens_dataset(variant='100K')

train, test = random_train_test_split(dataset,
                                      random_state=np.random.RandomState(42))

In [3]:
train

<Interactions dataset (944 users x 1683 items x 80000 interactions)>

Not helpful... need to go deeper:

In [4]:
train.user_ids

array([313, 731, 727, ..., 721, 724, 790], dtype=int32)

In [5]:
train.item_ids

array([ 588,    1,  465, ..., 1039,  304,  215], dtype=int32)

In [6]:
train.ratings

array([4., 2., 2., ..., 5., 4., 2.], dtype=float32)

And peek inside:

In [7]:
train.tocsr().todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 5., 3., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 5., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

The model:

In [8]:
# Fit an implicit-feedback factorization model with the BPR loss for three
# iterations, then score it on the held-out interactions.
# random_state seeds the model's own training randomness so that repeated
# fits on the same data give the same result.
model = ImplicitFactorizationModel(n_iter=3,
                                   loss='bpr',
                                   random_state=np.random.RandomState(42))
model.fit(train)

# mrr is indexable; the next cell looks at its first ten entries.
mrr = mrr_score(model, test)

In [9]:
mrr[:10]

array([0.03091495, 0.03727306, 0.02844995, 0.04047423, 0.00958493,
       0.01485202, 0.02034109, 0.09473128, 0.00718002, 0.0102306 ])

### GitHub Stars

Data retrieved from scraping GitHub:

In [10]:
import pandas as pd 

df = pd.read_csv('data/stars.csv')

In [11]:
df.sample(5)

Unnamed: 0,user,repo,description,language,stargazers
48890,AXeL-dev,lettier/gifcurry,"😎 The open-source, Haskell-built video editor ...",Haskell,556
50224,pushpendrapratap,MessageKit/MessageKit,A community-driven replacement for JSQMessages...,Swift,3975
10216,RobertasMurnikovas,lk-geimfari/awesomo,An extensive list of cool open source projects...,JavaScript,7090
39328,antogerva,hugovk/top-pypi-packages,A regular dump of the most-downloaded packages...,HTML,38
11873,stribny,openvenues/libpostal,A C library for parsing/normalizing street add...,C,2664


In [12]:
df['language'].value_counts()

Python              25702
JavaScript           6459
Jupyter Notebook     3777
Go                   2571
C++                  2544
                    ...  
KiCad                   1
Hack                    1
Pure Data               1
VCL                     1
Papyrus                 1
Name: language, Length: 178, dtype: int64

In [13]:
# Keep only rows whose language is Python, and leave out the two maxhumber
# repositories named below. Both conditions are applied in one pass.
is_python = df['language'] == 'Python'
is_excluded = df['repo'].isin(['maxhumber/gif', 'maxhumber/gazpacho'])
df = df[is_python & ~is_excluded]

In [14]:
df.shape

(25380, 5)

In [15]:
# Number of distinct repositories left after filtering.
# dropna=False keeps the count equal to len(unique()).
df['repo'].nunique(dropna=False)

12222

In [16]:
# Number of distinct users left after filtering.
# dropna=False keeps the count equal to len(unique()).
df['user'].nunique(dropna=False)

326

In [17]:
df.head(3)

Unnamed: 0,user,repo,description,language,stargazers
0,sbarman-mi9,as-ideas/ForwardTacotron,⏩ Generating speech in a single forward pass w...,Python,97
1,sbarman-mi9,abhishekkrthakur/bert-sentiment,,Python,21
4,sbarman-mi9,EmilyAlsentzer/clinicalBERT,repository for Publicly Available Clinical BER...,Python,160


In [19]:
from spotlight.interactions import Interactions

In [20]:
# won't work: Interactions cannot be built from the raw string columns.
# The expected TypeError is caught and printed so the failure is still shown
# but the notebook keeps running under Restart & Run All.
try:
    interactions = Interactions(df['user'], df['repo'])
except TypeError as err:
    print('TypeError:', err)

TypeError: must be str, not int

"Everything must be a number"

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

In [23]:
# Fit each encoder on its column and turn the string labels into numeric ids,
# one id per row of df. The fitted encoders are reused further down to look up
# a user's id (user_encoder.transform) and to list the repo labels
# (item_encoder.classes_).
users = user_encoder.fit_transform(df['user'])
items = item_encoder.fit_transform(df['repo'])

In [24]:
interactions = Interactions(users, items)

In [25]:
interactions

<Interactions dataset (326 users x 12222 items x 25380 interactions)>

Be a good Data Scientist:

In [26]:
import numpy as np
from spotlight.cross_validation import random_train_test_split

# Hold out 20% of the interactions for evaluation. The fixed seed keeps the
# split identical from run to run.
split_rng = np.random.RandomState(42)
train, test = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=split_rng,
)

In [27]:
# Implicit-feedback factorization model with the pointwise loss, trained for
# 20 iterations. random_state seeds the model's own training randomness, so
# the fit is repeatable in the same way the seeded train/test split is.
model = ImplicitFactorizationModel(loss='pointwise',
                                   n_iter=20,
                                   random_state=np.random.RandomState(42))

In [28]:
model.fit(train)

In [30]:
# NOTE(review): the recorded output below includes a `rating` column that no
# visible cell creates, and the execution counter skips In [29]. Presumably a
# deleted cell added it (every displayed value is 1). Restore that cell or
# refresh this output so the notebook reproduces from a fresh kernel.
df.head(3)

Unnamed: 0,user,repo,description,language,stargazers,rating
0,sbarman-mi9,as-ideas/ForwardTacotron,⏩ Generating speech in a single forward pass w...,Python,97,1
1,sbarman-mi9,abhishekkrthakur/bert-sentiment,,Python,21,1
4,sbarman-mi9,EmilyAlsentzer/clinicalBERT,repository for Publicly Available Clinical BER...,Python,160,1


Examining one user:

In [31]:
# All rows (starred repos) belonging to the user 'garrrychan'.
garrry = df.loc[df['user'] == 'garrrychan']

In [32]:
garrry['repo'].values

array(['mnielsen/neural-networks-and-deep-learning',
       'brendan-rius/jupyter-c-kernel', 'google-research/uda',
       'hhatto/autopep8', 'MaxHalford/prince',
       'scikit-learn-contrib/sklearn-pandas', 'modin-project/modin',
       'VikParuchuri/apartment-finder', 'uber-research/parallax',
       'prabhupant/python-ds', 'tensorflow/nmt',
       'huggingface/transformers', 'h5py/h5py', 'google-research/bert',
       'sloria/TextBlob', 'openai/gpt-2', 'matsui528/rii',
       'ResidentMario/missingno', 'lmcinnes/enstop',
       'phatpiglet/autocorrect', 'barrust/pyspellchecker',
       'seatgeek/fuzzywuzzy', 'maxhumber/chart', 'apache/airflow',
       'graphql-python/graphene', 'maxhumber/marc', 'spotify/luigi',
       'garrrychan/recipe_recommender_system',
       'CamDavidsonPilon/lifetimes', 'maciejkula/spotlight',
       'lyst/lightfm', 'practical-recommender-systems/moviegeek',
       'uwescience/TrafficCruising-DSSG2017'], dtype=object)

Looking at the user_id:

In [33]:
u = user_encoder.transform(['garrrychan'])

And all of the items:

In [35]:
item_encoder.classes_

array(['00111000/Imports-in-Python', '05bit/peewee-async',
       '0Kee-Team/WatchAD', ..., 'zzw922cn/Automatic_Speech_Recognition',
       'zzzDavid/ICDAR-2019-SROIE', 'zzzeek/sqlalchemy'], dtype=object)

In [36]:
# Score every encoded repo for the selected user. The item ids run from 0 to
# the number of encoder classes minus one, one id per entry of classes_.
all_item_ids = np.arange(len(item_encoder.classes_))
preds = model.predict(u, all_item_ids)

In [37]:
# Pair each repo label with its predicted score and show the table ordered
# from the highest score to the lowest.
ranking = pd.DataFrame({'repo': item_encoder.classes_, 'pred': preds})
ranking.sort_values('pred', ascending=False)

Unnamed: 0,repo,pred
11142,tiangolo/fastapi,9.747992
12129,zedr/clean-code-python,8.961095
8148,modin-project/modin,8.233457
4833,encode/httpx,8.140592
10121,satwikkansal/wtfpython,7.830858
...,...,...
7474,luopeixiang/named_entity_recognition,-13.510491
5434,getkeops/keops,-13.959396
2840,arogozhnikov/einops,-14.234956
7211,lazyprogrammer/machine_learning_examples,-14.273504


Evaluating the model:

In [38]:
from spotlight.evaluation import precision_recall_score

In [39]:
precision, recall = precision_recall_score(model, test, train, k=10)

In [40]:
precision.mean()

0.02712418300653595

In [41]:
recall.mean()

0.024246424185871543

Serialize:

In [42]:
import torch 

# Write the fitted model object to 'model.spot' in the current working
# directory so it can be reloaded below without retraining.
torch.save(model, 'model.spot')

In [43]:
del model

In [44]:
# Restore the model written by torch.save above and rebind `model` to it.
# torch.load unpickles the file, so only load model files you created or trust.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which presumably rejects a whole-object checkpoint like this one. Confirm
# against the installed version.
model = torch.load('model.spot')

Predict on another random user:

In [48]:
u = user_encoder.transform(['RandomOS'])

In [49]:
model.predict(u, np.arange(len(item_encoder.classes_)))

array([ -7.839106  ,  -7.163939  , -11.378734  , ...,  -5.098578  ,
        -5.776942  ,   0.34575325], dtype=float32)

In [54]:
pd.DataFrame({
    'repo': item_encoder.classes_,
    'pred': model.predict(u, np.arange(len(item_encoder.classes_)))
}).sort_values('pred', ascending=False).head(20)

Unnamed: 0,repo,pred
9446,python/cpython,13.89078
5951,huge-success/sanic,12.354092
11292,tqdm/tqdm,11.5965
12073,ytdl-org/youtube-dl,11.352339
2364,ageitgey/face_recognition,10.884702
10921,taoufik07/responder,10.590864
4836,encode/starlette,10.553424
5580,google/python-fire,10.094642
8633,nvbn/thefuck,9.899976
1593,PySimpleGUI/PySimpleGUI,9.815762


Actual likes:

In [53]:
# Repos this user actually starred, selected in a single .loc call
# (row mask plus column label) rather than two chained lookups.
df.loc[df['user'] == 'RandomOS', 'repo']

58885               tortoise/tortoise-orm
58887                 mingrammer/diagrams
58905                       spulec/uncurl
58913         mozilla-iot/webthing-python
58922                       spotify/luigi
58924                    python-trio/trio
58925                       ranger/ranger
58928              googlefonts/noto-emoji
58937                Synss/python-mbedtls
58940                    pipxproject/pipx
58941                       linkedin/shiv
58942                      pantsbuild/pex
58946        PythonCharmers/python-future
58947     alan-turing-institute/CleverCSV
58948                 emeryberger/scalene
58949                 aouinizied/nfstream
58950               ionelmc/python-hunter
58951                      wolever/pip2pi
58956          fossasia/open-event-server
58994                python-poetry/poetry
58997                     thumbor/thumbor
59004                       holoviz/panel
59019             0xInfection/Awesome-WAF
59020    swisskyrepo/PayloadsAllTh