In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import re
import nltk
from nltk.corpus import stopwords
import string

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.neighbors import KNeighborsClassifier
# from sklearn.neural_network import MLPClassifier

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [5]:
def cleanResume(resumeText):

    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

In [6]:
def getData():

  resumeDataSet = pd.read_csv('/content/drive/My Drive/new_dataset_of_resume_skills.csv')

  resumeDataSet['cleaned_resume_skills'] = ''
  resumeDataSet['cleaned_resume_skills'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))
  
  return resumeDataSet

In [7]:
def encoding(resumeDataSet):
  
  le = LabelEncoder()    
  resumeDataSet['Category'] = le.fit_transform(resumeDataSet['Category'])
  return le

In [8]:
def vectorizing(requiredText):

  word_vectorizer = TfidfVectorizer( sublinear_tf=True, stop_words='english')
  word_vectorizer.fit(requiredText)
  WordFeatures = word_vectorizer.transform(requiredText)

  return word_vectorizer, WordFeatures

In [35]:
def trainModel(X,Y):

  x_train,x_test,y_train,y_test = train_test_split(X,Y,random_state=4,test_size=0.25)

  model = KNeighborsClassifier(n_neighbors=10).fit(x_train, y_train)
  prediction = model.predict(x_test)
  print('Accuracy of KNN :- {:.2f}'.format(model.score(x_test, y_test)))
  return model


In [38]:
def topKNeighbours(y, model, testData, le,k=2):

  top_k = set()     # Set of unique Neighbours
  i = 1             # Predicted value will always be added to the set

  predictedValue = model.predict(testData)
  invrValue = le.inverse_transform(predictedValue)[0]
  # print(invrValue)
  top_k.add(invrValue)
  
  dist, ind = model.kneighbors(testData)
  # print(dist[0])
  # print(ind[0])

  for i in range(k-1) :
      top_k.add(y[ind[0][i]])
  
  return top_k


In [40]:
if __name__ == "__main__":

  # Get data
  resumeDataSet = getData()  

  # Y == Categories without encoding
  y = resumeDataSet['Category']

  # Cleaned Resume values
  reqText = resumeDataSet['cleaned_resume_skills'].values

  # Encoding the Categories
  le = encoding(resumeDataSet)

  # The encoded required category values
  reqTarget = resumeDataSet['Category'].values

  # For TF-IDF
  wordVec, wordFeatures = vectorizing(reqText)

  # Returning the trained model
  knn = trainModel(wordFeatures,reqTarget)

  testData = ['''Big Data, Nosql, MongoDB, Database, Redis''']
  cleanTestData = [x.lower() for x in testData]
 
  # Transforming the test data to vector form
  wordFeatures_testData = wordVec.transform(cleanTestData)

  # Top K categories
  topK = topKNeighbours(y,knn,wordFeatures_testData,le)
  print(topK)
  

Accuracy of KNN :- 0.74
{'Database', 'SAP Developer'}
