### The Data

In [1]:
import pandas as pd

In [2]:
# Load stars.csv; the path is relative to the directory the notebook runs from.
df = pd.read_csv('../data/stars.csv')

In [3]:
# Preview five random rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

Unnamed: 0,description,language,repo,stargazers,user
8868,关于Detour App规则配置的简单介绍,,iamldj/Detour,287,neilyoung2008
6307,Bootstrap components built with React,JavaScript,react-bootstrap/react-bootstrap,16447,pushpendrapratap
18371,A maintainable Python code checker for Sublime...,Python,patrys/PythonChecker,27,berlotto
7202,A highly customizable calendar library for And...,Kotlin,kizitonwose/CalendarView,1046,edujtm
19527,,Makefile,crccheck/docker-postgis,2,dankeemahill


### The Cleanup

In [4]:
# Drop one specific repository from the data before modelling.
df = df[df['repo'] != 'maxhumber/gazpacho']
# Keep only rows whose language is Python or Jupyter Notebook.
df = df[df.language.isin(['Python', 'Jupyter Notebook'])]
# Number of rows per repository, kept as a Series and filtered directly.
# Wrapping it in a DataFrame and selecting popular['repo'] depends on how
# value_counts() names its result, which differs between pandas versions
# ('repo' before 2.0, 'count' from 2.0 on) and raises KeyError on current pandas.
popular = df['repo'].value_counts()
# Keep repositories that appear in at least 5 rows.
select_repos = popular[popular >= 5].index.tolist()
df = df[df['repo'].isin(select_repos)]

In [5]:
# Preview five random rows of the filtered frame (unseeded, so rows differ between runs).
df.sample(5)

Unnamed: 0,description,language,repo,stargazers,user
6500,:bookmark: Browser-independent bookmark manager,Python,jarun/Buku,3022,jrezzende
19758,Bandit is a tool designed to find common secur...,Python,PyCQA/bandit,1731,viseshrp
15788,DeepFaceLab is a tool that utilizes machine le...,Python,iperov/DeepFaceLab,10246,mazharul-miraz
6310,Deepfakes Software For All,Python,deepfakes/faceswap,26495,pushpendrapratap
18599,"💿 Free software that works great, and also hap...",Jupyter Notebook,mahmoud/awesome-python-applications,9260,wrongpoison


### The Preparation

In [6]:
# Collapse to one row per user: the repos in each user's group are joined
# into a single comma-separated string, then the Series is turned back into a frame.
df = df.groupby(['user'])['repo'].apply(','.join).to_frame()

In [7]:
# Preview five random users with their comma-joined repo strings (unseeded sample).
df.sample(5)

Unnamed: 0_level_0,repo
user,Unnamed: 1_level_1
javad94,"tqdm/tqdm,alttch/rapidtables,r0x0r/pywebview,p..."
greed2411,"pytorch/captum,jarun/Buku,streamlit/streamlit,..."
sseemayer,"vaexio/vaex,huge-success/sanic,hugapi/hug,tqdm..."
angelparras,"OWASP/CheatSheetSeries,30-seconds/30-seconds-o..."
dankeemahill,"codelucas/newspaper,bisguzar/twitter-scraper,j..."


### The Model

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors


class NNRecommender:
    """Nearest-neighbour recommender over comma-separated item strings.

    Each document is one string of items joined by commas. ``fit`` vectorizes
    the documents with a ``CountVectorizer`` and indexes the result with
    ``NearestNeighbors``; ``predict`` returns, for every query document, the
    items contained in its neighbouring fitted documents.

    Note: the default ``tokenizer`` is a lambda, which the standard ``pickle``
    module cannot serialize. Persist instances with a serializer that supports
    lambdas (e.g. ``dill``) or pass a named, importable function instead.
    """

    def __init__(self, n_neighbors=10, max_features=1000, tokenizer=lambda x: x.split(',')):
        """Create the (unfitted) vectorizer and neighbour index.

        Args:
            n_neighbors: forwarded to ``NearestNeighbors(n_neighbors=...)``.
            max_features: forwarded to ``CountVectorizer(max_features=...)``.
            tokenizer: callable turning one document string into a list of
                tokens; the default splits on commas.
        """
        self.cv = CountVectorizer(tokenizer=tokenizer, max_features=max_features)
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, X):
        """Fit the vectorizer and the neighbour index on ``X``.

        Args:
            X: sequence of comma-separated strings (e.g. a pandas Series or a
                list). It is kept on ``self.X`` so ``predict`` can look the
                neighbouring documents up again.

        Returns:
            self, to allow chaining.
        """
        self.X = X
        X = self.cv.fit_transform(X)
        self.nn.fit(X)
        return self

    def predict(self, X):
        """Return one list of recommended items per query document in ``X``.

        Args:
            X: iterable of comma-separated strings.

        Returns:
            A list with one entry per query. Each entry is a list of the
            distinct items found in the query's neighbouring fitted documents,
            in first-seen order following the order ``kneighbors`` returns
            the neighbours.
        """
        # list() gives plain positional access for any fitted container
        # (pandas Series regardless of its index labels, list, tuple, ...).
        fitted_docs = list(self.X)
        Xt = self.cv.transform(X)
        _, neighbors = self.nn.kneighbors(Xt)
        points = []
        for row in neighbors:
            repos = []
            for idx in row:
                repos.extend(fitted_docs[int(idx)].split(','))
            # dict.fromkeys removes duplicates but keeps insertion order, so the
            # result is deterministic; a set would yield an arbitrary order that
            # can change from one interpreter session to the next.
            points.append(list(dict.fromkeys(repos)))
        return points

In [9]:
# fit() returns the recommender itself, so construction and fitting can be chained.
model = NNRecommender().fit(df['repo'])
# Recommendations for the fitted row at position 14.
model.predict(df['repo'])[14]

['scikit-learn/scikit-learn',
 'TheAlgorithms/Python',
 'plasticityai/supersqlite',
 'minimaxir/big-list-of-naughty-strings',
 'donnemartin/interactive-coding-challenges',
 'donnemartin/system-design-primer',
 'deepfakes/faceswap',
 'fastai/fastai',
 'sloria/TextBlob',
 'mherrmann/fbs',
 'shengqiangzhang/examples-of-web-crawlers',
 'ytdl-org/youtube-dl',
 'PySimpleGUI/PySimpleGUI']

### Under the Hood

In [10]:
# Same vectorizer settings as NNRecommender's defaults: split on commas, keep 1000 features.
cv = CountVectorizer(tokenizer=lambda x: x.split(','), max_features=1000)
X = cv.fit_transform(df['repo'])
# Slice the sparse matrix first so only the five displayed rows are densified,
# instead of materializing the whole matrix just to look at its head.
X[:5].todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
# Unfitted neighbour index; n_neighbors=10 is the same value NNRecommender uses by default.
nn = NearestNeighbors(n_neighbors=10)

In [12]:
# Fit on the count matrix, then query without arguments: kneighbors() then
# looks up neighbours for the fitted rows themselves.
dist, ind = nn.fit(X).kneighbors()
# Neighbour row positions for the first five fitted rows.
ind[:5]

array([[106,  14,  99,  48,  65,   3,  10,  19,   2,  40],
       [ 14,  66,  99,  10,  19,  65,  48,   3,  40,  12],
       [ 10,  19,   3,  99,  48,  14,  65,  40,  66,  92],
       [ 48,  10,  99,  19,  14,  65,   2,  40,  92,  72],
       [ 28,  10,  48,  72,  20,  19,  99,  14,  65,   3]])

#### Pickle 

In [13]:
import pickle

In [14]:
# The stdlib pickler cannot serialize the lambda that NNRecommender uses as its
# default tokenizer. Display the error instead of raising it so the notebook
# still runs top to bottom, and serialize in memory first so a failed attempt
# does not leave a truncated model.pkl on disk.
try:
    payload = pickle.dumps(model)
except (pickle.PicklingError, AttributeError) as exc:
    print(type(exc).__name__ + ':', exc)
else:
    with open('model.pkl', 'wb') as f:
        f.write(payload)

PicklingError: Can't pickle <function NNRecommender.<lambda> at 0x11b292f28>: attribute lookup NNRecommender.<lambda> on __main__ failed

In [15]:
import dill # pip install dill

# dill can serialize the lambda tokenizer; write the serialized model to disk.
with open('model.pkl', 'wb') as f:
    f.write(dill.dumps(model))

# Drop the in-memory model so the object used below really comes from the file.
del model

# Read the bytes back and rebuild the model from them.
with open('model.pkl', 'rb') as f:
    model = dill.loads(f.read())

In [16]:
# Same query as before saving: the reloaded model should reproduce the earlier result.
model.predict(df['repo'])[14]

['scikit-learn/scikit-learn',
 'TheAlgorithms/Python',
 'plasticityai/supersqlite',
 'minimaxir/big-list-of-naughty-strings',
 'donnemartin/interactive-coding-challenges',
 'donnemartin/system-design-primer',
 'deepfakes/faceswap',
 'fastai/fastai',
 'sloria/TextBlob',
 'mherrmann/fbs',
 'shengqiangzhang/examples-of-web-crawlers',
 'ytdl-org/youtube-dl',
 'PySimpleGUI/PySimpleGUI']

In [17]:
# Query with a new comma-separated repo string; predict takes an iterable of such
# strings and returns one list of repos per string.
model.predict(['streamlit/streamlit,huggingface/transformers,encode/httpx,aws/chalice,maxhumber/chart'])

[['scikit-learn/scikit-learn',
  'TheAlgorithms/Python',
  'plasticityai/supersqlite',
  'minimaxir/big-list-of-naughty-strings',
  'Avik-Jain/100-Days-Of-ML-Code',
  'donnemartin/system-design-primer',
  'vinta/awesome-python',
  'fastai/fastai',
  'sloria/TextBlob',
  'mherrmann/fbs',
  'shengqiangzhang/examples-of-web-crawlers',
  'ytdl-org/youtube-dl',
  'PySimpleGUI/PySimpleGUI']]