# Resume selector

Each resume is stored in the `resume_text` column. Resumes are reviewed for eligibility for data scientist positions. If a resume is selected, the applicant is invited for an interview.

### Install required libraries

In [1]:
!pip install nltk gensim wordcloud

#nltk: library for text preprocessing and linguistic analysis.
#Use cases: Tokenization, Stopword removal, Lemmatization/Stemming, Part-of-speech tagging, Named Entity Recognition

#Gensim: topic modeling and semantic similarity
#Use cases:
# - Building TF-IDF or word2vec models to capture context
# - Finding semantic similarity between resumes
# - Topic modeling with LDA (Latent Dirichlet Allocation)
# - Vectorizing text for machine learning models

#wordcloud: visualize the most frequent or important words in a corpus
# Use cases:
# - Quickly spotting dominant keywords in resumes
# - Creating visual summaries of skills, experiences
# - Exploratory data analysis (EDA) on text data

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.metrics import classification_report, confusion_matrix

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

### update notebook style

In [2]:
!pip install jupyterthemes

Collecting jupyterthemes
  Downloading jupyterthemes-0.20.0-py2.py3-none-any.whl.metadata (1.0 kB)
Collecting lesscpy>=0.11.2 (from jupyterthemes)
  Downloading lesscpy-0.15.1-py2.py3-none-any.whl.metadata (6.0 kB)
Collecting jedi>=0.16 (from ipython>=5.4.1->jupyterthemes)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jupyterthemes-0.20.0-py2.py3-none-any.whl (7.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lesscpy-0.15.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.7/46.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m65.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lesscpy, jedi, jupyterthemes
Successfully installed jedi-0.19.2 jupyterthemes-0.20.0 les

In [None]:
from jupyterthemes import jtplot

# Style subsequent plots with the 'monokai' theme: notebook-sized context,
# axis ticks shown, grid hidden.
jtplot.style(
    theme='monokai',
    context='notebook',
    ticks=True,
    grid=False,
)

### load dataset

In [None]:
# Read the raw resume data; the file is decoded as latin-1.
resume_df = pd.read_csv(
    'resume.csv',
    encoding='latin-1',
)
resume_df

In [None]:
# Keep only the text and label columns. .copy() makes this an independent frame, so
# the column assignments in later cells do not trigger SettingWithCopyWarning.
resume_df = resume_df[['resume_text','class']].copy()

In [None]:
# Show the first and the last row of the frame.
resume_df.iloc[[0, -1]]

### EDA

In [None]:
# Summary of the frame: columns, non-null counts and dtypes.
resume_df.info()

In [None]:
# Number of missing values per column.
resume_df.isna().sum()

In [None]:
# Class balance: how many resumes carry each label.
resume_df['class'].value_counts()

In [None]:
# Encode the target: 'flagged' -> 1, every other label -> 0.
# The guard makes the cell idempotent: without it, re-running the cell would compare the
# already-encoded integers against 'flagged' and silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(resume_df['class']):
    resume_df['class'] = (resume_df['class'] == 'flagged').astype('int64')
resume_df

In [None]:
# Split the frame by label and report the size of each class.
resume_df_0 = resume_df.loc[resume_df['class'].eq(0)]
resume_df_1 = resume_df.loc[resume_df['class'].eq(1)]
print('The number of 0 class', len(resume_df_0))
print('The number of 1 class', len(resume_df_1))

### Data cleaning

In [None]:
# Remove carriage-return characters from the resume text.
# Missing resumes are replaced by an empty string first: the previous per-row
# x.replace(...) raised AttributeError on NaN, and the tokenizer used later also
# expects a string.
resume_df['resume_text'] = (
    resume_df['resume_text']
    .fillna('')
    .str.replace('\r', '', regex=False)
)
resume_df

In [None]:
# Fetch the NLTK resources used by this notebook: the 'punkt' tokenizer data and the
# 'stopwords' corpus (read below via stopwords.words('english')).
nltk.download('punkt')
nltk.download("stopwords")

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop words plus a handful of extra noise tokens.
stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 'reply', 'use', 'email', 'com',
]

In [None]:
def preprocess(text):
    """Tokenise ``text`` and return the retained tokens joined by single spaces.

    Tokens are produced by ``gensim.utils.simple_preprocess``. A token is dropped
    when it is in gensim's ``STOPWORDS``, is two characters long or shorter, or is
    in the notebook-level ``stop_words`` list.
    """
    tokens = gensim.utils.simple_preprocess(text)
    kept = [
        tok
        for tok in tokens
        if not (
            tok in gensim.parsing.preprocessing.STOPWORDS
            or len(tok) <= 2
            or tok in stop_words
        )
    ]
    return ' '.join(kept)

In [None]:
# Store the cleaned, space-joined tokens of every resume in a new 'cleaned' column.
resume_df['cleaned'] = resume_df['resume_text'].map(preprocess)

In [None]:
# Cleaned text of the row labelled 0, for comparison with the raw text.
print(resume_df['cleaned'][0])

In [None]:
# Raw text of the same row (label 0) before token filtering.
print(resume_df['resume_text'][0])

### Visualization

In [None]:
# Pass the column by keyword: since seaborn 0.12 the only positional parameter of
# countplot is `data`, so a bare Series is no longer interpreted as `x`.
ax = sns.countplot(x='class', data=resume_df)
ax.set(title='Resume count per class', xlabel='class (1 = flagged)', ylabel='count');

In [None]:
# Build the corpus by joining every cleaned resume of class 1.
# str(Series) must not be used here: it yields pandas' printed representation
# (index labels, shortened rows, "Name: cleaned, dtype: object"), not the resume text.
flagged_text = ' '.join(resume_df.loc[resume_df['class'] == 1, 'cleaned'])

plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000, width= 1600, height= 800, stopwords = stop_words).generate(flagged_text)
plt.imshow(wc, interpolation = 'bilinear')
plt.axis('off')
plt.title('Most frequent words - class 1 resumes');

In [None]:
# Build the corpus by joining every cleaned resume of class 0.
# str(Series) must not be used here: it yields pandas' printed representation
# (index labels, shortened rows, "Name: cleaned, dtype: object"), not the resume text.
not_flagged_text = ' '.join(resume_df.loc[resume_df['class'] == 0, 'cleaned'])

plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000, width= 1600, height= 800, stopwords = stop_words).generate(not_flagged_text)
plt.imshow(wc, interpolation = 'bilinear')
plt.axis('off')
plt.title('Most frequent words - class 0 resumes');

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus to illustrate what CountVectorizer produces before using it on resumes.
sample_data = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample_data)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is the
# supported replacement and returns the learned vocabulary as an array.
print(vectorizer.get_feature_names_out())

In [None]:
# Dense view of the fitted matrix for the toy corpus.
print(X.toarray())

In [None]:
# Fit a fresh vocabulary on the cleaned resumes and transform them in one step.
# NOTE: despite its name, `countvectorizer` holds the matrix returned by
# fit_transform, not the CountVectorizer object (that is `vectorizer`).
vectorizer = CountVectorizer()
countvectorizer = vectorizer.fit_transform(resume_df['cleaned'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is the
# supported replacement and returns the learned vocabulary as an array.
print(vectorizer.get_feature_names_out())

In [None]:
# Dense view of the resume matrix. toarray() materialises the full array, which can be
# memory-heavy for a large vocabulary.
print(countvectorizer.toarray())

### Train model

In [None]:
# Feature matrix: the vectorised resumes (this rebinds X, used earlier for the toy corpus).
X = countvectorizer

In [None]:
# Target vector: the 'class' column of the resume frame.
y = resume_df['class']

In [None]:
# Sanity check: X and y should have the same number of rows.
print(X.shape, y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# random_state fixes the shuffle so the split (and every metric below) is reproducible;
# stratify=y keeps the class ratio the same in the train and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42, stratify = y
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, fitted on the training partition.
NB_classifier = MultinomialNB()
NB_classifier.fit(x_train, y_train)

In [None]:
# Do NOT draw a new train/test split here. NB_classifier was fitted on the split created
# earlier; re-splitting after the fit would place rows the model was trained on into
# x_test and inflate every test metric (data leakage).
# The existing split is reused; this cell only reports its sizes.
print('train:', x_train.shape, 'test:', x_test.shape)

In [None]:
# Confusion matrix on the training partition (rows: true class, columns: predicted class).
y_predict_train = NB_classifier.predict(x_train)
cm = confusion_matrix(y_train, y_predict_train)
# fmt='d' prints the integer counts as-is; the default format switches to scientific
# notation for larger counts.
ax = sns.heatmap(cm, annot = True, fmt = 'd')
ax.set(title = 'Confusion matrix - training set', xlabel = 'Predicted class', ylabel = 'True class');

In [None]:
# Confusion matrix on the test partition (rows: true class, columns: predicted class).
y_predict_test = NB_classifier.predict(x_test)
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the integer counts as-is; the default format switches to scientific
# notation for larger counts.
ax = sns.heatmap(cm, annot = True, fmt = 'd')
ax.set(title = 'Confusion matrix - test set', xlabel = 'Predicted class', ylabel = 'True class');

In [None]:
# Per-class metrics for the test-set predictions.
print(classification_report(y_test, y_predict_test))