### The Data

In [1]:
import pandas as pd

In [2]:
# Read the stars dataset from a path relative to the notebook's working directory
df = pd.read_csv('data/stars.csv')

In [3]:
# Peek at five randomly drawn rows (no random_state, so the rows change on every run)
df.sample(n=5)

Unnamed: 0,description,language,repo,stargazers,user
9481,2019年最新总结，阿里，腾讯，百度，美团，头条等技术面试题目，以及答案，专家出题人分析汇总。,Python,0voice/interview_internal_reference,20555,zhoujz10
1757,Your Project with Great Documentation.,Python,timothycrosley/portray,473,03b8
4660,A cute widget of Switch Button for you to cre...,Java,kyleduo/SwitchButton,4152,fahmihidayah
4913,The Most Powerful Swipe Layout!,Java,daimajia/AndroidSwipeLayout,11483,fahmihidayah
23419,Yet another pong (it seems like I can't do any...,JavaScript,bendem/PongJS,1,Curlybear


### The Cleanup

In [4]:
# Drop the 'maxhumber/gazpacho' rows, then keep only repos that occur in at
# least 3 of the remaining rows.
df = df[df['repo'] != 'maxhumber/gazpacho']
repo_counts = df['repo'].value_counts()
# Still exposed as a one-column frame for anything that reads `popular`.
popular = pd.DataFrame(repo_counts)
# Filter on the Series rather than on popular['repo']: the column label that
# value_counts() produces differs between pandas versions ('repo' before 2.0,
# 'count' from 2.0 on), so the column lookup raises KeyError on current pandas.
select_repos = repo_counts[repo_counts >= 3].index.tolist()
df = df[df['repo'].isin(select_repos)]

In [5]:
# Five random rows of the filtered frame (unseeded, so the rows change on every run)
df.sample(n=5)

Unnamed: 0,description,language,repo,stargazers,user
18172,:cherry_blossom: A command-line fuzzy finder,Go,junegunn/fzf,24475,AllanLRH
13826,"Simple, fast, safe, compiled language for deve...",V,vlang/v,11672,ayarotsky
21057,All Algorithms implemented in Python,Jupyter Notebook,TheAlgorithms/Python,59309,Brombult
23101,🧠 Leon is your open-source personal assistant.,JavaScript,leon-ai/leon,5927,malfario
5566,"Collection of awesome Python types, stubs, plu...",,typeddjango/awesome-python-typing,267,mikeckennedy


### The Preparation

In [6]:
# Collapse to one row per user: that user's repos joined into a single
# comma-separated string, held in a one-column DataFrame indexed by user.
df = (
    df.groupby(['user'])['repo']
    .apply(','.join)
    .to_frame()
)

In [7]:
# Five random users with their joined repo strings (unseeded, so the rows change on every run)
df.sample(n=5)

Unnamed: 0_level_0,repo
user,Unnamed: 1_level_1
mazharul-miraz,"taniarascia/takenote,instantpage/instant.page,..."
thejustin,"Rigellute/spotify-tui,github/CodeSearchNet,psi..."
javad94,"microsoft/cascadia-code,matplotlib/matplotlib,..."
AndreWohnsland,"plasticityai/supersqlite,mherrmann/fbs"
MagicXY,"firmai/industry-machine-learning,thunlp/GNNPap..."


### The Model

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors


class NNRecommender:
    """Nearest-neighbour recommender over comma-separated item strings.

    Each document is one string of items joined by commas (for example all
    repos belonging to one user). Documents are converted to item-count
    vectors with ``CountVectorizer`` and indexed with ``NearestNeighbors``.
    ``predict`` returns the items contained in the training documents that
    lie closest to each query document.

    Parameters
    ----------
    n_neighbors : int, default=10
        Number of neighbouring training documents consulted per query.
    max_features : int, default=1000
        Passed to ``CountVectorizer`` as ``max_features``.
    tokenizer : callable, default=split on ``','``
        Splits one document into tokens. The default is a lambda, which the
        standard ``pickle`` module cannot serialise; pass a module-level
        function instead if the model has to be saved with ``pickle``.
        Note that ``predict`` always splits the stored training documents
        on ``','`` regardless of this argument.
    """

    def __init__(self, n_neighbors=10, max_features=1000, tokenizer=lambda x: x.split(',')):
        self.cv = CountVectorizer(tokenizer=tokenizer, max_features=max_features)
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)

    def fit(self, X):
        """Fit the vectorizer and the neighbour index on the training documents.

        Parameters
        ----------
        X : sequence of str
            Training documents. The sequence itself is kept on ``self.X`` so
            that ``predict`` can map neighbour positions back to the original
            strings; it therefore has to be re-iterable (a list or a pandas
            Series, not a one-shot generator).

        Returns
        -------
        NNRecommender
            ``self``, to allow call chaining.
        """
        self.X = X
        X = self.cv.fit_transform(X)
        self.nn.fit(X)
        return self

    def predict(self, X):
        """Return the items of the nearest training documents for each query.

        Parameters
        ----------
        X : iterable of str
            Query documents, one comma-separated string each.

        Returns
        -------
        list of list of str
            One list per query holding the distinct items of its neighbouring
            training documents, in first-seen order while walking the
            neighbours in the order ``kneighbors`` returns them.
        """
        Xt = self.cv.transform(X)
        _, neighbors = self.nn.kneighbors(Xt)
        # Materialise once as a positional lookup table: works the same for a
        # pandas Series (whatever its index) and for a plain list or array.
        docs = list(self.X)
        points = []
        for row in neighbors:
            repos = []
            for idx in row:
                repos.extend(docs[int(idx)].split(','))
            # dict.fromkeys drops duplicates but keeps insertion order, so the
            # result is deterministic instead of depending on set hashing.
            points.append(list(dict.fromkeys(repos)))
        return points

In [9]:
# fit() hands back the estimator, so building and training can be chained
model = NNRecommender().fit(df['repo'])
# Show the recommendations for the user at positional index 14
model.predict(df['repo'])[14]

['shengqiangzhang/examples-of-web-crawlers',
 'plasticityai/supersqlite',
 'andkret/Cookbook',
 'burnash/gspread',
 'sloria/TextBlob',
 'mherrmann/fbs',
 'facebook/react',
 'fastai/fastai',
 'mrdoob/three.js',
 'github/gitignore',
 'froala/design-blocks',
 'kubernetes/kubernetes',
 'wistbean/learn_python3_spider',
 'torvalds/linux',
 'firmai/industry-machine-learning',
 'Avik-Jain/100-Days-Of-ML-Code',
 'TheAlgorithms/Python',
 'vinta/awesome-python',
 'getify/You-Dont-Know-JS']

### Under the Hood

In [10]:
# Redo the vectorizing step by hand, with the same settings NNRecommender
# uses by default (comma tokenizer, max_features=1000).
cv = CountVectorizer(tokenizer=lambda x: x.split(','), max_features=1000)
X = cv.fit_transform(df['repo'])
# Slice the sparse matrix first so that only the five displayed rows are
# converted to a dense matrix, not the whole user-by-feature table.
X[:5].todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
# Unfitted neighbour index; 10 is the same value NNRecommender uses as its default n_neighbors
nn = NearestNeighbors(n_neighbors=10)

In [12]:
# Fit on the count matrix, then call kneighbors() with no query so it
# reports the neighbours of the fitted rows themselves.
dist, ind = nn.fit(X).kneighbors()
# Neighbour positions for the first five rows
ind[:5]

array([[ 11,  93,  14,   3,  68,  22,  83, 111,   2,  51],
       [ 11,  22,   3,  14,  69,  93,  83,  68,   2,  51],
       [ 11,  93,  14,   3,  68,  22,  83,  69,  51,   4],
       [ 11,  14,  93,  22,   2,  68,  83,  69,  51,  12],
       [ 11,   3,  14,  93,   2,  83,  68,  22,  69,  51]])

#### Pickle 

In [13]:
import pickle

In [14]:
# Demonstration: the standard pickle module cannot serialise the model because
# its default tokenizer is a lambda. The error is caught and printed so the
# message is still shown but a full top-to-bottom run is not interrupted.
try:
    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f)
except pickle.PicklingError as err:
    print(f'{type(err).__name__}: {err}')

PicklingError: Can't pickle <function NNRecommender.<lambda> at 0x11f9120d0>: attribute lookup NNRecommender.<lambda> on __main__ failed

In [15]:
import dill # pip install dill

# Serialise with dill instead of pickle and write the bytes to disk
with open('model.pkl', 'wb') as out_file:
    out_file.write(dill.dumps(model))

# Drop the in-memory object so the name below can only come from the file
del model

with open('model.pkl', 'rb') as in_file:
    model = dill.loads(in_file.read())

In [16]:
# Same query as before the save/load round trip, to compare the reloaded model's output
model.predict(df['repo'])[14]

['shengqiangzhang/examples-of-web-crawlers',
 'plasticityai/supersqlite',
 'andkret/Cookbook',
 'burnash/gspread',
 'sloria/TextBlob',
 'mherrmann/fbs',
 'facebook/react',
 'fastai/fastai',
 'mrdoob/three.js',
 'github/gitignore',
 'froala/design-blocks',
 'kubernetes/kubernetes',
 'wistbean/learn_python3_spider',
 'torvalds/linux',
 'firmai/industry-machine-learning',
 'Avik-Jain/100-Days-Of-ML-Code',
 'TheAlgorithms/Python',
 'vinta/awesome-python',
 'getify/You-Dont-Know-JS']

In [17]:
# Hand-written query: predict() takes an iterable of documents, so the single
# comma-separated repo string is wrapped in a list (one result list comes back).
model.predict(['streamlit/streamlit,huggingface/transformers,encode/httpx,aws/chalice,maxhumber/chart'])

[['shengqiangzhang/examples-of-web-crawlers',
  'plasticityai/supersqlite',
  'andkret/Cookbook',
  'burnash/gspread',
  'sloria/TextBlob',
  'mherrmann/fbs',
  'facebook/react',
  'fastai/fastai',
  'mrdoob/three.js',
  'github/gitignore',
  'froala/design-blocks',
  'kubernetes/kubernetes',
  'wistbean/learn_python3_spider',
  'torvalds/linux',
  'firmai/industry-machine-learning',
  'Avik-Jain/100-Days-Of-ML-Code',
  'TheAlgorithms/Python',
  'vinta/awesome-python',
  'getify/You-Dont-Know-JS']]