In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('cleaned_users.csv')

In [3]:
data.head()

Unnamed: 0,username,content,most_common_skills
0,esin,calathea shell dris docker remote image size d...,"['shell', 'go', 'dockerfile', 'html', 'docker'..."
1,jeffersonsimaogoncalves,animacao icone app javascript,"['animacao', 'icone', 'app', 'javascript']"
2,pedrualves,abc abc android angular translated rustic simp...,"['javascript', 'html', 'css', 'project', 'angu..."
3,kroitor,asciichart nice looking lightweight console as...,"['c', 'python', 'javascript', 'html', 'docs', ..."
4,vivekweb2013,audio recorder free simple audio recorder app ...,"['android', 'java', 'javascript', 'go', 'audio..."


In [4]:
data.shape

(561, 3)

In [5]:
corpus = data.content.tolist()

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

scaler = MinMaxScaler(feature_range=(0, 5))

_ = vectorizer.fit(corpus)

vectors = vectorizer.transform(corpus).toarray()

scaled_features = scaler.fit_transform(vectors)


In [6]:
text = "front end applications build frontend application using react typescript back end applications infrastructure build sdks backend solana contracts maintain chain infrastructure code review ensure code quality software reliability automated test implementations ui ux testing processes"

vec = vectorizer.transform([text])
scaled_vec = scaler.transform(vec.toarray())

knn = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='euclidean')

_ = knn.fit(vectors)
dists, idx = knn.kneighbors(scaled_vec)

In [8]:
data.iloc[idx[0]]

Unnamed: 0,username,content,most_common_skills
482,ortonomy,audioaccess framer module framer js enables mi...,"['javascript', 'html', 'css', 'framer', 'game'..."
527,nash90,backend backend ecommerce system using hapi jw...,"['backend', 'using', 'html', 'python', 'app', ..."
183,moksh-mahajan,academic helper kotlin java bloc examples repo...,"['c', 'kotlin', 'html', 'flutter', 'app', 'dar..."
203,EleoXDA,affirmations app simple app shows affirmative ...,"['app', 'kotlin', 'html', 'ruby', 'javascript'..."
164,ritvij14,bloggy backend back end bloggy personal blog w...,"['html', 'kotlin', 'c', 'flutter', 'dart', 'sw..."


In [9]:
data.iloc[123].most_common_skills

"['python', 'css', 'javascript', 'flask', 'api', 'html', 'aws', 'reports', 'cloudera', 'core']"

In [14]:
data.iloc[idx[0]].content.iloc[0]

'audioaccess framer module framer js enables microphone input recording framer prototypes coffeescript javascript html css fling design assets designs fling work fling server fling server node js postgraphql backend flingapp front end javascript plpgsql flingapp front javascript vue css html flingapp frontend flingapp react js front end managing organisation book freelancers javascript css html react javascript html css mafiaserver node js based server create manage game mafia including real time chat javascript mediacontainer framer layer extension framer provides ready made video player layer playback controls coffeescript ortonomy github io freelance front end dev hire product manager board game addict css javascript html random password part cocoa programming course swift sided core ruby html simon game browser based version classic 70s 80s hand held memory game css javascript html ruby simon ios swift 3 version simon ios swift surge boilerplate swiftlesson4 antoine swift lesson 4 

In [11]:
text

'front end applications build frontend application using react typescript back end applications infrastructure build sdks backend solana contracts maintain chain infrastructure code review ensure code quality software reliability automated test implementations ui ux testing processes'

In [12]:
np.sum(scaled_vec[0])

127.29751180716862

In [13]:
len(vectorizer.vocabulary_)

5000

In [17]:
import pickle

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
    
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("model.pkl", "wb") as f:
    pickle.dump(knn, f)