### The Data

In [1]:
import pandas as pd

In [2]:
# Load the stars table from a path relative to the notebook's working directory.
# Presumably one row per (user, starred repo) pair -- confirm against the data source.
df = pd.read_csv('../data/stars.csv')

In [4]:
# Preview five random rows of the raw table (no random_state, so the rows differ on each run).
df.sample(5)

Unnamed: 0,description,language,repo,stargazers,user
23037,Kakoune Language Server Protocol Client,Rust,ul/kak-lsp,164,malfario
7940,"Simple, secure & standards compliant web I/O f...",C++,uNetworking/uWebSockets,10926,mpanczyk
22540,Abusing Certificate Transparency logs for gett...,Python,UnaPibaGeek/ctfr,1236,andrewwxy
3952,Modin: Speed up your Pandas workflows by chang...,Python,modin-project/modin,2786,garciparedes
2365,A color picker bottom sheet 🌈,Kotlin,msasikanth/ColorSheet,117,edityomurti


### The Cleanup

In [5]:
# Exclude one specific repo from the data (the reason is not recorded in this notebook).
df = df[df['repo'] != 'maxhumber/gazpacho']
# Keep only repos whose language is Python or Jupyter Notebook.
df = df[df['language'].isin(['Python', 'Jupyter Notebook'])]
# Rows per repo, as a frame with an explicit 'repo' count column and an
# unnamed index. Since pandas 2.0, value_counts() names its result 'count',
# so pd.DataFrame(value_counts())['repo'] raises KeyError there; naming the
# column explicitly gives the same frame on old and new pandas.
popular = df['repo'].value_counts().rename_axis(None).to_frame('repo')
# Keep repos that occur in at least 5 rows of the filtered table.
select_repos = popular[popular['repo'] >= 5].index.tolist()
df = df[df['repo'].isin(select_repos)]

In [7]:
# Spot-check five random rows after the cleanup filters (unseeded sample).
df.sample(5)

Unnamed: 0,description,language,repo,stargazers,user
13147,👨‍💼Python Wrapper for the Linkedin API,Python,tomquirk/linkedin-api,155,erdimollahuseyin
23621,Google Search Scraper,Python,s0md3v/goop,435,avinash-mishra
16509,"Hypothesis is a powerful, flexible, and easy t...",Python,HypothesisWorks/hypothesis,3836,ilyagerner
22037,Create HTML profiling reports from pandas Data...,Python,pandas-profiling/pandas-profiling,3496,pavlokurochka
11008,"💿 Free software that works great, and also hap...",Jupyter Notebook,mahmoud/awesome-python-applications,9260,BevisGoh


In [8]:
# Size of the filtered table as (rows, columns).
df.shape

(2229, 5)

In [9]:
# Target layout to aim for: a 0/1 indicator matrix like this toy 4x4 example
# (presumably rows = users and columns = repos -- see the vectoriser step later on).
pd.DataFrame([
    [0, 0, 1, 0],
    [0, 1, 1, 0],
    [1, 0, 0, 1],
    [0, 0, 0, 1]
])

Unnamed: 0,0,1,2,3
0,0,0,1,0
1,0,1,1,0
2,1,0,0,1
3,0,0,0,1


### The Preparation

In [10]:
# Collapse to one row per user: that user's repos joined into a single
# comma-separated string, kept as a one-column ('repo') frame indexed by user.
df = df.groupby('user')['repo'].apply(','.join).to_frame()

In [11]:
# Five random users with their comma-joined repo strings (unseeded sample).
df.sample(5)

Unnamed: 0_level_0,repo
user,Unnamed: 1_level_1
andreztz,"pudo/dataset,tawesoft/pyrrhic,jankrepl/pychubb..."
robcowie,"pirate/ArchiveBox,robinhood/faust,psf/requests..."
victordomingos,"ytdl-org/youtube-dl,encode/httpx,MasoniteFrame..."
03b8,"python/cpython,hugapi/hug,HypothesisWorks/hypo..."
BudSun,"30-seconds/30-seconds-of-python,jackfrued/Pyth..."


In [12]:
# Full repo string for a single user, looked up by index label.
df.loc['Curlybear'].values.tolist()

['vinta/awesome-python,CorentinJ/Real-Time-Voice-Cloning,danijar/handout,ageitgey/face_recognition,jofpin/trape,grapheneX/grapheneX,gto76/python-cheatsheet,trekhleb/homemade-machine-learning,manrajgrover/halo,chris104957/maildown,psf/black,stewartmcgown/uds,TheAlgorithms/Python,cool-RR/PySnooper,Bogdanp/molten,nicolargo/glances,mkaz/termgraph,donnemartin/system-design-primer,chubin/cheat.sh,psf/requests-html,Zulko/moviepy,jarun/Buku,pypa/pipenv,soimort/you-get,public-apis/public-apis,HelloZeroNet/ZeroNet']

### The Model

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors


class NNRecommender:
    """Nearest-neighbour recommender over delimited item strings.

    Each training document is one string of items (by default separated by
    commas). Documents are vectorised with ``CountVectorizer`` and indexed
    with ``NearestNeighbors``. A query is answered with the items found in
    its nearest training documents that the query does not already contain.

    Parameters
    ----------
    n_neighbors : int, default=10
        Number of training documents consulted per query.
    max_features : int, default=1000
        Passed through to ``CountVectorizer(max_features=...)``.
    tokenizer : callable, default=split on ","
        Maps a document string to its list of items. It is used both for
        vectorising and for splitting documents inside ``predict``.
    """

    def __init__(self, n_neighbors=10, max_features=1000, tokenizer=lambda x: x.split(",")):
        # NOTE: the lambda default ends up stored on the vectoriser, so the
        # stdlib ``pickle`` cannot serialise a model built with it. Pass a
        # module-level function as ``tokenizer`` (or serialise with dill).
        self.cv = CountVectorizer(tokenizer=tokenizer, max_features=max_features)
        self.nn = NearestNeighbors(n_neighbors=n_neighbors)

    def _items(self, doc):
        """Split one document into items with the vectoriser's tokenizer."""
        tokenize = self.cv.tokenizer
        # Fall back to the historical comma split when no tokenizer was given.
        return tokenize(doc) if tokenize is not None else doc.split(",")

    def fit(self, X):
        """Vectorise the training documents and build the neighbour index.

        ``X`` must support ``.iloc`` positional access (e.g. a pandas Series
        of strings); it is kept on ``self.X`` so ``predict`` can read the
        neighbours' items back. Returns ``self``.
        """
        self.X = X
        X = self.cv.fit_transform(X)
        self.nn.fit(X)
        return self

    def predict(self, X):
        """Recommend items for each query document in ``X``.

        ``X`` is an iterable of document strings. Returns one list per query,
        holding the de-duplicated items of the neighbouring training
        documents that are not in the query, in the order the neighbours are
        returned by ``kneighbors`` (first neighbour's items first).
        """
        recommendations = []
        for Xi in X:
            # Split the query once; the set makes each membership test O(1).
            owned = set(self._items(Xi))
            Xt = self.cv.transform([Xi])
            neighbors = self.nn.kneighbors(Xt, return_distance=False)
            # Dict keys keep first-seen order, so the result is deterministic
            # and follows neighbour order instead of arbitrary set order.
            candidates = {}
            for n in neighbors[0]:
                for item in self._items(self.X.iloc[int(n)]):
                    if item not in owned:
                        candidates.setdefault(item, None)
            recommendations.append(list(candidates))
        return recommendations

In [14]:
# Fit with the default settings (10 neighbours, 1000 max features) on the per-user repo strings.
model = NNRecommender()
model.fit(df['repo'])

<__main__.NNRecommender at 0x121db4bd0>

In [15]:
# Repo string of the user at position 42. The index holds user names, so use
# .iloc rather than relying on the deprecated integer-key positional fallback.
df['repo'].iloc[42]

'CorentinJ/Real-Time-Voice-Cloning,s0md3v/goop,psf/requests,slundberg/shap,facebookresearch/pytext,practicalAI/practicalAI,Zulko/moviepy,python/mypy,ParthS007/background,keon/algorithms,google/python-fire,eriklindernoren/ML-From-Scratch,minimaxir/big-list-of-naughty-strings,pudo/dataset,nvbn/thefuck,dbcli/pgcli,faif/python-patterns,chriskiehl/Gooey'

In [16]:
# Recommend for the user at position 42. predict iterates over its argument,
# hence the one-element list; .iloc because the index holds user names.
model.predict([df['repo'].iloc[42]])

[['sherlock-project/sherlock',
  'weskerfoot/DeleteFB',
  'dae/anki',
  'deepfakes/faceswap',
  'scikit-learn/scikit-learn',
  'ageron/handson-ml2',
  'fastai/fastai',
  'Miserlou/Zappa',
  'smacke/subsync',
  'sloria/TextBlob',
  'donnemartin/interactive-coding-challenges',
  'shengqiangzhang/examples-of-web-crawlers']]

### Under the Hood

In [17]:
# First user's row: the single comma-joined 'repo' string, wrapped in a list.
df.iloc[0].values.tolist()

['python/cpython,hugapi/hug,HypothesisWorks/hypothesis,timothycrosley/portray,timothycrosley/hypothesis-auto,tiangolo/fastapi,bocadilloproject/bocadillo,tartiflette/tartiflette,encode/httpx,pytest-dev/pytest,tonybaloney/wily,sdispater/poetry,mahmoud/glom,psf/black,chubin/wttr.in']

In [18]:
# Same vectoriser settings as NNRecommender's defaults, built by hand so the
# intermediate matrix can be inspected.
cv = CountVectorizer(tokenizer=lambda x: x.split(','), max_features=1000)
X = cv.fit_transform(df['repo'])
# Slice first so only the five displayed rows are densified, not the whole matrix.
X[:5].todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 1, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [19]:
# Stand-alone neighbour index using the same n_neighbors as NNRecommender's default.
nn = NearestNeighbors(n_neighbors=10)

In [20]:
nn.fit(X)
# No query argument: neighbours are looked up for the fitted rows themselves.
# Going by the output, a row is not listed as its own neighbour (row 0's list has no 0).
ind = nn.kneighbors(return_distance=False)
# Each row holds the positions of that row's 10 neighbours in X / df.
ind[:5]

array([[106,  14,  99,  48,  65,   3,  10,  19,   2,  40],
       [ 14,  66,  99,  10,  19,  65,  48,   3,  40,  12],
       [ 10,  19,   3,  99,  48,  14,  65,  40,  66,  92],
       [ 48,  10,  99,  19,  14,  65,   2,  40,  92,  72],
       [ 28,  10,  48,  72,  20,  19,  99,  14,  65,   3]])

In [21]:
# Row 106 is the first neighbour listed for row 0 above; compare its repos with row 0's.
df.iloc[106].values

array(['timothycrosley/hypothesis-auto,timothycrosley/portray,sdispater/poetry,wyounas/homer,tiangolo/fastapi,psf/black,marshmallow-code/marshmallow,falconry/falcon,cookiecutter/cookiecutter'],
      dtype=object)

#### Pickle 

In [22]:
import pickle

In [23]:
# Expected to fail: the model holds the lambda default tokenizer, which the
# stdlib pickle cannot serialise. Catch and show the error so the notebook
# still runs top to bottom instead of halting on this cell.
try:
    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f)
except pickle.PicklingError as exc:
    print(f"{type(exc).__name__}: {exc}")

PicklingError: Can't pickle <function NNRecommender.<lambda> at 0x1a23647320>: attribute lookup NNRecommender.<lambda> on __main__ failed

In [24]:
import dill # pip install dill

# dill is used here in place of pickle to write the model (lambda tokenizer included).
with open('model.pkl', 'wb') as f:
    dill.dump(model, f)

# Drop the in-memory instance so the load below is a real round trip.
del model

# NOTE: like pickle.load, dill.load can execute code embedded in the file;
# only load files you produced yourself or otherwise trust.
with open('model.pkl', 'rb') as f:
    model = dill.load(f)

In [25]:
# Query the reloaded model with a hand-written repo list instead of a row from df.
model.predict(['streamlit/streamlit,huggingface/transformers,encode/httpx,aws/chalice,maxhumber/chart'])

[['mherrmann/fbs',
  'scikit-learn/scikit-learn',
  'minimaxir/big-list-of-naughty-strings',
  'sloria/TextBlob',
  'ytdl-org/youtube-dl',
  'plasticityai/supersqlite',
  'vinta/awesome-python',
  'Avik-Jain/100-Days-Of-ML-Code',
  'PySimpleGUI/PySimpleGUI',
  'TheAlgorithms/Python',
  'donnemartin/system-design-primer',
  'fastai/fastai',
  'shengqiangzhang/examples-of-web-crawlers']]