In [None]:
# What is Resume Screening?
### Hiring the right talent is a challenge for all businesses. This challenge is magnified by the high volume of applicants if the business is labour-intensive, growing, and facing high attrition rates.
### An example of such a business is an IT services organization that struggles to keep up with growing markets. In a typical service organization, professionals with a variety of technical skills and business domain expertise are hired and assigned to projects to resolve customer issues. The task of selecting the best talent among many applicants is known as Resume Screening.

### Typically, large companies do not have enough time to open each CV, so they use machine learning algorithms for the Resume Screening task.

In [None]:
# Core dependencies: numerical/data libraries, plotting, and the scikit-learn
# estimators and metrics imported below.
try:
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import warnings
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn import metrics
    from sklearn.metrics import accuracy_score
    from pandas.plotting import scatter_matrix
    from sklearn.neighbors import KNeighborsClassifier
except ModuleNotFoundError:
    # Fallback: install the core packages when any import above is missing.
    # NOTE(review): the imports are not retried after the install, so this cell
    # must be re-run before the names above exist. seaborn, nltk and wordcloud
    # (imported in later cells) are not part of this install list.
    !pip install -U numpy pandas matplotlib scikit-learn

In [None]:
resumeDataSet = pd.read_csv('../input/resume-screening/Resume Screening.csv' ,encoding='utf-8')

In [None]:
resumeDataSet['cleaned_resume'] = ''

In [None]:
resumeDataSet.head()

In [None]:
print ("Displaying the distinct categories of resume -")
print (resumeDataSet['Category'].unique())

In [None]:
print ("Displaying the distinct categories of resume and the number of records belonging to each category -")
print (resumeDataSet['Category'].value_counts())

In [None]:
import seaborn as sns

In [None]:
# Horizontal count plot: one bar per resume category.
plt.figure(figsize=(15, 15))
plt.xticks(rotation=90)
sns.countplot(data=resumeDataSet, y="Category")

In [None]:
from matplotlib.gridspec import GridSpec

In [None]:
targetCounts=resumeDataSet["Category"].value_counts()

In [None]:
targetLabels=resumeDataSet["Category"].unique()

In [None]:
# Large canvas with a 2x2 grid; the pie chart is drawn in the top-right cell.
plt.figure(1, figsize=(25, 25))
the_grid = GridSpec(2, 2)

# Three colours sampled at evenly spaced points of the 'coolwarm' colormap.
cmap = plt.get_cmap('coolwarm')
colors = list(map(cmap, np.linspace(0, 1, 3)))
plt.subplot(the_grid[0, 1], aspect=1, title='CATEGORY DISTRIBUTION')

# Share of each category, annotated with its percentage.
source_pie = plt.pie(
    targetCounts,
    labels=targetLabels,
    autopct='%1.1f%%',
    shadow=True,
    colors=colors,
)
plt.show()

In [None]:
import re
def cleanResume(resumeText):
    """Normalize raw resume text for tokenization and vectorization.

    Steps, in order: strip URLs, standalone ``RT``/``cc`` tokens, hashtags and
    @-mentions; replace ASCII punctuation and non-ASCII characters with
    spaces; collapse every whitespace run into a single space.

    Parameters
    ----------
    resumeText : str
        Raw resume text. Any non-string value (e.g. ``None`` or the NaN that
        pandas uses for an empty CSV cell) is treated as an empty resume.

    Returns
    -------
    str
        The cleaned text. Leading/trailing whitespace is collapsed to a
        single space, not stripped.
    """
    # Missing cells arrive as float NaN / None; re.sub would raise TypeError.
    if not isinstance(resumeText, str):
        return ''

    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # Word boundaries keep ordinary words intact: without them "success"
    # became "su ess" and "REPORT" became "REPO ".
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # replace non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

In [None]:
# Run every raw resume through cleanResume and store the result per row.
resumeDataSet['cleaned_resume'] = resumeDataSet['Resume'].apply(cleanResume)

In [None]:
import nltk
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud

In [None]:
oneSetOfStopWords = set(stopwords.words('english')+['``',"''"])

In [None]:
totalWords =[]

In [None]:
Sentences = resumeDataSet['Resume'].values

In [None]:
cleanedSentences = ""

In [None]:
# Collect word statistics from the first 160 resumes (limit kept from the
# original cell). Slicing instead of indexing with range(0, 160) means a
# dataset with fewer rows no longer raises IndexError.
# Both accumulators are rebuilt here so re-running this cell does not
# double-count words.
totalWords = []
cleanedChunks = []
for rawResume in Sentences[:160]:
    cleanedText = cleanResume(rawResume)
    cleanedChunks.append(cleanedText)
    requiredWords = nltk.word_tokenize(cleanedText)
    for word in requiredWords:
        if word not in oneSetOfStopWords and word not in string.punctuation:
            totalWords.append(word)

# Join with a space so the last word of one resume cannot fuse with the first
# word of the next; a single join also avoids quadratic string concatenation.
cleanedSentences = ' '.join(cleanedChunks)

In [None]:
wordfreqdist = nltk.FreqDist(totalWords)

In [None]:
mostcommon = wordfreqdist.most_common(50)

In [None]:
mostcommon

In [None]:
wc = WordCloud().generate(cleanedSentences)

In [None]:
plt.figure(figsize=(15,15))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Columns to label-encode; only the target column 'Category' is listed.
var_mod = ['Category']
le = LabelEncoder()

In [None]:
# Replace each listed column with its integer label encoding.
# NOTE: this overwrites the column in place, so the original values of
# 'Category' are no longer available in resumeDataSet after this cell.
for i in var_mod:
    resumeDataSet[i] = le.fit_transform(resumeDataSet[i])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [None]:
requiredText = resumeDataSet['cleaned_resume'].values

In [None]:
requiredTarget = resumeDataSet['Category'].values

In [None]:
requiredTarget = resumeDataSet['Category'].values

In [None]:
WordFeatures = word_vectorizer.transform(requiredText)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(WordFeatures,requiredTarget,random_state=0, test_size=0.2)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
clf = OneVsRestClassifier(KNeighborsClassifier())

In [None]:
clf.fit(X_train, y_train)

In [None]:
prediction = clf.predict(X_test)

In [None]:
print('Accuracy of KNeighbors Classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))

In [None]:
print('Accuracy of KNeighbors Classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))

In [None]:
print("\n Classification report for classifier %s:\n%s\n" % (clf, metrics.classification_report(y_test, prediction)))