#### Spotlight

The Quickstart Example

In [1]:
import numpy as np

from spotlight.cross_validation import random_train_test_split
from spotlight.datasets.movielens import get_movielens_dataset
from spotlight.evaluation import mrr_score
from spotlight.factorization.implicit import ImplicitFactorizationModel

In [2]:
# Load the MovieLens 100K interactions bundled with Spotlight.
dataset = get_movielens_dataset(variant='100K')

# Seed the split: without a random_state the train/test partition changes on
# every run, so none of the outputs below could be reproduced.
train, test = random_train_test_split(dataset,
                                      random_state=np.random.RandomState(42))

In [3]:
# Show the repr of the training split (users x items x interactions summary).
train

<Interactions dataset (944 users x 1683 items x 80000 interactions)>

Not helpful... need to go deeper:

In [4]:
# User id array stored on the training split.
train.user_ids

array([374, 782, 585, ..., 650, 257, 632], dtype=int32)

In [5]:
# Item id array stored on the training split.
train.item_ids

array([111, 325,  60, ...,  50, 116, 588], dtype=int32)

In [6]:
# Ratings array stored on the training split.
train.ratings

array([2., 2., 4., ..., 5., 3., 2.], dtype=float32)

And peek inside:

In [7]:
# Convert to CSR and densify purely to eyeball the entries. This is fine for a
# dataset this small (944 users x 1683 items); avoid densifying large datasets.
train.tocsr().todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 4., 0., ..., 0., 0., 0.],
        ...,
        [0., 5., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

The model:

In [8]:
# Implicit-feedback factorization model with BPR loss, three training iterations.
# A fixed random_state keeps training repeatable across kernel restarts.
model = ImplicitFactorizationModel(n_iter=3,
                                   loss='bpr',
                                   random_state=np.random.RandomState(42))
model.fit(train)

# Mean reciprocal rank scores computed against the held-out test split.
mrr = mrr_score(model, test)

In [9]:
# Peek at the first ten MRR values instead of dumping the whole array.
mrr[:10]

array([0.01659277, 0.04875664, 0.01378118, 0.18311409, 0.0117817 ,
       0.01385401, 0.01251839, 0.00914449, 0.0027933 , 0.01162295])

### GitHub Stars

Data retrieved from scraping GitHub:

In [10]:
import pandas as pd

# Scraped star data; the path is relative to the notebook's working directory,
# so the kernel must be started next to the `data/` folder.
df = pd.read_csv('data/stars.csv')

In [11]:
# Seeded so the five displayed rows are the same on every run.
df.sample(5, random_state=42)

Unnamed: 0,user,repo,description,language,stargazers
403,edujtm,Rigellute/spotify-tui,Spotify for the terminal written in Rust 🚀,Rust,4620
17311,bm371613,paritytech/wasmi,Wasm interpreter in Rust,Rust,489
15929,Marc-g-Z,Teino1978-Corp/publify,A self hosted Web publishing platform on Rails...,Ruby,3
55144,denisfitz57,lemmy/BlockingQueue,"Tutorial ""Weeks of debugging can save you hour...",TLA,274
44064,enzoftware,dangcuuson/graphql-schema-typescript,Generate TypeScript from GraphQL's schema type...,TypeScript,145


In [12]:
# How many starred repos there are per language (most common first).
df['language'].value_counts()

Python              25702
JavaScript           6459
Jupyter Notebook     3777
Go                   2571
C++                  2544
                    ...  
Ada                     1
Squirrel                1
Bro                     1
Inno Setup              1
Boo                     1
Name: language, Length: 178, dtype: int64

In [13]:
# Keep only Python repositories and drop the two maxhumber repos, using a
# single combined boolean mask instead of two successive filters.
df = df.loc[
    df['language'].eq('Python')
    & ~df['repo'].isin(['maxhumber/gif', 'maxhumber/gazpacho'])
]

In [14]:
# (rows, columns) remaining after the filter above.
df.shape

(25380, 5)

In [15]:
# Number of distinct repositories (dropna=False so missing values count too).
df['repo'].nunique(dropna=False)

12222

In [16]:
# Number of distinct users (dropna=False so missing values count too).
df['user'].nunique(dropna=False)

326

In [17]:
# First rows of the filtered frame; note that users and repos are strings.
df.head(3)

Unnamed: 0,user,repo,description,language,stargazers
0,sbarman-mi9,as-ideas/ForwardTacotron,⏩ Generating speech in a single forward pass w...,Python,97
1,sbarman-mi9,abhishekkrthakur/bert-sentiment,,Python,21
4,sbarman-mi9,EmilyAlsentzer/clinicalBERT,repository for Publicly Available Clinical BER...,Python,160


In [18]:
from spotlight.interactions import Interactions

In [19]:
# Deliberate failure: Interactions cannot be built from raw string labels.
# The TypeError is caught and printed so the point is still demonstrated
# without stopping a Restart & Run All.
try:
    interactions = Interactions(df['user'], df['repo'])
except TypeError as err:
    print('TypeError:', err)

TypeError: must be str, not int

"Everything must be a number"

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
# One encoder per id space, so users and repos each get their own integer
# codes and can be decoded independently later.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

In [22]:
# Fit each encoder on its column and replace the string labels with integer
# codes, since the string-based Interactions call above failed.
users = user_encoder.fit_transform(df['user'])
items = item_encoder.fit_transform(df['repo'])

In [23]:
# Build the dataset from the encoded ids; no ratings are passed, only the
# (user, repo) star pairs.
interactions = Interactions(users, items)

In [24]:
# Confirm the user/item/interaction counts match the frame inspected earlier.
interactions

<Interactions dataset (326 users x 12222 items x 25380 interactions)>

Be a good Data Scientist:

In [25]:
import numpy as np
from spotlight.cross_validation import random_train_test_split

# Hold out 20% of the interactions for evaluation; the seeded RandomState
# keeps the partition identical between runs.
train, test = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=np.random.RandomState(42),
)

In [26]:
# Pointwise-loss implicit factorization model, 20 training iterations.
# Seeded like the train/test split so the scores and rankings shown below
# can be reproduced on a fresh kernel.
model = ImplicitFactorizationModel(loss='pointwise',
                                   n_iter=20,
                                   random_state=np.random.RandomState(42))

In [27]:
# Train on the training split only; `test` stays untouched for evaluation.
model.fit(train)

In [28]:
# Reminder of the raw string labels before looking up an individual user.
df.head(3)

Unnamed: 0,user,repo,description,language,stargazers
0,sbarman-mi9,as-ideas/ForwardTacotron,⏩ Generating speech in a single forward pass w...,Python,97
1,sbarman-mi9,abhishekkrthakur/bert-sentiment,,Python,21
4,sbarman-mi9,EmilyAlsentzer/clinicalBERT,repository for Publicly Available Clinical BER...,Python,160


Examining one user:

In [29]:
# All rows (starred repos) belonging to the user 'garrrychan'.
garrry = df.loc[df['user'].eq('garrrychan')]

In [30]:
# The repositories this user has actually starred, as a plain array.
garrry['repo'].values

array(['mnielsen/neural-networks-and-deep-learning',
       'brendan-rius/jupyter-c-kernel', 'google-research/uda',
       'hhatto/autopep8', 'MaxHalford/prince',
       'scikit-learn-contrib/sklearn-pandas', 'modin-project/modin',
       'VikParuchuri/apartment-finder', 'uber-research/parallax',
       'prabhupant/python-ds', 'tensorflow/nmt',
       'huggingface/transformers', 'h5py/h5py', 'google-research/bert',
       'sloria/TextBlob', 'openai/gpt-2', 'matsui528/rii',
       'ResidentMario/missingno', 'lmcinnes/enstop',
       'phatpiglet/autocorrect', 'barrust/pyspellchecker',
       'seatgeek/fuzzywuzzy', 'maxhumber/chart', 'apache/airflow',
       'graphql-python/graphene', 'maxhumber/marc', 'spotify/luigi',
       'garrrychan/recipe_recommender_system',
       'CamDavidsonPilon/lifetimes', 'maciejkula/spotlight',
       'lyst/lightfm', 'practical-recommender-systems/moviegeek',
       'uwescience/TrafficCruising-DSSG2017'], dtype=object)

Looking at the user_id:

In [31]:
# Map the username to the integer id used when building the interactions.
# A one-element list is passed, so `u` holds a single encoded id.
u = user_encoder.transform(['garrrychan'])

And all of the items:

In [32]:
# Repo names known to the encoder; position i is the label for item id i.
item_encoder.classes_

array(['00111000/Imports-in-Python', '05bit/peewee-async',
       '0Kee-Team/WatchAD', ..., 'zzw922cn/Automatic_Speech_Recognition',
       'zzzDavid/ICDAR-2019-SROIE', 'zzzeek/sqlalchemy'], dtype=object)

In [33]:
# Score every encoded item id (0 .. number of repos - 1) for this one user.
preds = model.predict(u, np.arange(len(item_encoder.classes_)))

In [34]:
# Pair each repo name with its predicted score and list the best matches first.
pd.DataFrame(
    dict(repo=item_encoder.classes_, pred=preds)
).sort_values(by='pred', ascending=False)

Unnamed: 0,repo,pred
9238,psf/black,12.126504
1982,TheAlgorithms/Python,8.802712
5764,hanxiao/bert-as-service,7.714744
8633,nvbn/thefuck,7.585787
11599,vinta/awesome-python,7.435246
...,...,...
1719,RobertoPrevato/BlackSheep,-12.567848
4403,dgorissen/pycel,-12.689927
294,CCExtractor/vardbg,-12.829387
9908,rougier/numpy-tutorial,-12.925920


Evaluating the model:

In [35]:
from spotlight.evaluation import precision_recall_score

In [36]:
# Precision and recall over the top k=10 recommendations on the test split.
# NOTE(review): `train` is passed as the third argument, presumably so that
# already-seen interactions are excluded from the ranking; confirm in the
# Spotlight docs.
precision, recall = precision_recall_score(model, test, train, k=10)

In [37]:
# Average precision@10 across the evaluated users.
precision.mean()

0.029738562091503266

In [38]:
# Average recall@10 across the evaluated users.
recall.mean()

0.02262662447683623

Serialize:

In [39]:
import torch

# Persist the whole fitted model object to disk.
# NOTE(review): saving the object (rather than a state dict) presumably
# pickles it, so loading needs a compatible Spotlight/PyTorch install; confirm.
torch.save(model, 'model.spot')

In [40]:
# Drop the in-memory model so the next cell proves the file alone is enough.
del model

In [41]:
# Restore the model from the file written above.
# NOTE(review): only load files you created yourself, since loading a full
# object can execute arbitrary code. Recent PyTorch releases may also require
# an explicit weights_only=False for this kind of file; confirm for your version.
model = torch.load('model.spot')

Predict on another random user:

In [42]:
# Re-point `u` at a different user; this overwrites the id looked up earlier.
u = user_encoder.transform(['RandomOS'])

In [43]:
# Raw scores from the reloaded model for every encoded item id.
model.predict(u, np.arange(len(item_encoder.classes_)))

array([-9.134271 , -5.797849 , -8.294138 , ..., -6.563952 , -7.0342917,
       -3.649625 ], dtype=float32)

In [44]:
# Top 20 repos for this user: attach names to the scores, sort by score
# descending, and keep the head.
pd.DataFrame(
    dict(
        repo=item_encoder.classes_,
        pred=model.predict(u, np.arange(len(item_encoder.classes_))),
    )
).sort_values(by='pred', ascending=False).head(20)

Unnamed: 0,repo,pred
9238,psf/black,14.884586
10766,streamlit/streamlit,13.455529
11142,tiangolo/fastapi,13.105063
7383,locustio/locust,12.341149
3758,chubin/cheat.sh,12.321002
4833,encode/httpx,12.286151
9239,psf/requests,12.265904
11292,tqdm/tqdm,12.18388
10176,scikit-learn/scikit-learn,11.949236
6623,joke2k/faker,11.515172


Actual likes:

In [45]:
# Repos this user really starred, for comparison with the predictions above.
df.loc[df['user'] == 'RandomOS', 'repo']

58885               tortoise/tortoise-orm
58887                 mingrammer/diagrams
58905                       spulec/uncurl
58913         mozilla-iot/webthing-python
58922                       spotify/luigi
58924                    python-trio/trio
58925                       ranger/ranger
58928              googlefonts/noto-emoji
58937                Synss/python-mbedtls
58940                    pipxproject/pipx
58941                       linkedin/shiv
58942                      pantsbuild/pex
58946        PythonCharmers/python-future
58947     alan-turing-institute/CleverCSV
58948                 emeryberger/scalene
58949                 aouinizied/nfstream
58950               ionelmc/python-hunter
58951                      wolever/pip2pi
58956          fossasia/open-event-server
58994                python-poetry/poetry
58997                     thumbor/thumbor
59004                       holoviz/panel
59019             0xInfection/Awesome-WAF
59020    swisskyrepo/PayloadsAllTh

### But what if the user is brand new?

In [46]:
# Cold-start demonstration: the encoder only knows the labels it was fitted
# on, so a brand-new username is rejected. The ValueError is caught and
# printed so the notebook still runs end to end.
try:
    user_encoder.transform(['maxhumber'])
except ValueError as err:
    print('ValueError:', err)

ValueError: y contains previously unseen labels: ['maxhumber']

In [47]:
# Inventing an id does not help either: 1993 is an arbitrary value larger than
# the 326 users the model was trained on, so predict() rejects it. The
# ValueError is caught and printed so the notebook still runs end to end.
try:
    model.predict(1993, np.arange(len(item_encoder.classes_)))
except ValueError as err:
    print('ValueError:', err)

ValueError: Maximum user id greater than number of users in model.