In [1]:
import numpy as np
import pandas as pd

In [2]:
data =pd.read_csv('UpdatedResumeDataSet.csv')

In [3]:
data

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."
...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...
958,Testing,â Willingness to accept the challenges. â ...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne..."
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...


In [4]:
data.isna().sum()

Category    0
Resume      0
dtype: int64

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
data.Category.value_counts()

Java Developer               84
Testing                      70
DevOps Engineer              55
Python Developer             48
Web Designing                45
HR                           44
Hadoop                       42
Blockchain                   40
ETL Developer                40
Operations Manager           40
Data Science                 40
Sales                        40
Mechanical Engineer          40
Arts                         36
Database                     33
Electrical Engineering       30
Health and fitness           30
PMO                          30
Business Analyst             28
DotNet Developer             28
Automation Testing           26
Network Security Engineer    25
SAP Developer                24
Civil Engineer               24
Advocate                     20
Name: Category, dtype: int64

In [7]:
from imblearn.over_sampling import SMOTE

In [8]:
smote = SMOTE(sampling_strategy='minority')

In [9]:
stop_words = stopwords.words('english')
lematizer = WordNetLemmatizer()

In [10]:
pattern = r'[^A-Za-z0-9 ]+'
def preprocessing(text):
    #removing unwanted characters
    text = re.sub(pattern, '', text)
    #Tockenize
    tokens = nltk.word_tokenize(text)
    #removing stop word
    tokens = [token for token in tokens if token not in stop_words]
    #lematise
    tokens = [lematizer.lemmatize(token) for token in tokens]
    # join tokens back into a string
    text = ' '.join(tokens)
    return text

In [11]:
data['Resume'] = data['Resume'].apply(preprocessing)
data

Unnamed: 0,Category,Resume
0,Data Science,Skills Programming Languages Python panda nump...
1,Data Science,Education Details May 2013 May 2017 BE UITRGPV...
2,Data Science,Areas Interest Deep Learning Control System De...
3,Data Science,Skills R Python SAP HANA Tableau SAP HANA SQL ...
4,Data Science,Education Details MCA YMCAUST Faridabad Haryan...
...,...,...
957,Testing,Computer Skills Proficient MS office Word Basi...
958,Testing,Willingness accept challenge Positive thinking...
959,Testing,PERSONAL SKILLS Quick learner Eagerness learn ...
960,Testing,COMPUTER SKILLS SOFTWARE KNOWLEDGE MSPower Poi...


In [12]:
data['Resume'] = data['Resume'].apply(str.lower)
data['Resume']

0      skills programming languages python panda nump...
1      education details may 2013 may 2017 be uitrgpv...
2      areas interest deep learning control system de...
3      skills r python sap hana tableau sap hana sql ...
4      education details mca ymcaust faridabad haryan...
                             ...                        
957    computer skills proficient ms office word basi...
958    willingness accept challenge positive thinking...
959    personal skills quick learner eagerness learn ...
960    computer skills software knowledge mspower poi...
961    skill set os windows xp788110 database mysql s...
Name: Resume, Length: 962, dtype: object

In [13]:
data.Category.unique()

array(['Data Science', 'HR', 'Advocate', 'Arts', 'Web Designing',
       'Mechanical Engineer', 'Sales', 'Health and fitness',
       'Civil Engineer', 'Java Developer', 'Business Analyst',
       'SAP Developer', 'Automation Testing', 'Electrical Engineering',
       'Operations Manager', 'Python Developer', 'DevOps Engineer',
       'Network Security Engineer', 'PMO', 'Database', 'Hadoop',
       'ETL Developer', 'DotNet Developer', 'Blockchain', 'Testing'],
      dtype=object)

In [14]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
#train_test_split is to split and train the data. KNeighborsClassifier is for K nearest neighbor.
from sklearn.neighbors import KNeighborsClassifier
#LogisticRegression is the module used to implement logistic regression.
from sklearn.linear_model import LogisticRegression
#If the hyperplane classifies the dataset linearly then the algorithm we call it as SVC.T
#he algorithm that separates the dataset by non-linear approach then we call it as SVM.
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [15]:
X = data['Resume'].values
enc=LabelEncoder()
y= data.Category.values
y = enc.fit_transform(y)

In [16]:
X[:1]

array(['skills programming languages python panda numpy scipy scikitlearn matplotlib sql java javascriptjquery machine learning regression svm nave bayes knn random forest decision trees boosting technique cluster analysis word embedding sentiment analysis natural language processing dimensionality reduction topic modelling lda nmf pca neural nets database visualizations mysql sqlserver cassandra hbase elasticsearch d3js dcjs plotly kibana matplotlib ggplot tableau others regular expression html css angular 6 logstash kafka python flask git docker computer vision open cv understanding deep learningeducation details data science assurance associate data science assurance associate ernst young llpskill details javascript exprience 24 monthsjquery exprience 24 monthspython exprience 24 monthscompany details company ernst young llpdescription fraud investigations dispute services assurancetechnology assisted reviewtar technology assisted review assist accelerating review process run analyt

In [17]:
y[:5]

array([6, 6, 6, 6, 6])

In [18]:
word_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
word_vectorizer.fit(X)
WordFeatures= word_vectorizer.transform(X)

In [19]:
from imblearn.over_sampling import SMOTE

In [21]:
X,y= smote.fit_resample(WordFeatures,y)

In [22]:
X_train,X_test,y_train,y_test= train_test_split(X, y, random_state=42,stratify=y)

In [23]:
print(X_train.shape)
print(X_test.shape)

(769, 9539)
(257, 9539)


In [25]:
models = {
    'K-Nearest Neighbors' : KNeighborsClassifier(),
    'Logistic Regression' : LogisticRegression(),
    'Support Vector Machine' : SVC(),
    'Random Forest' : RandomForestClassifier()    
}

In [26]:
model_list=[]
for model in models.values():
    model_list.append(OneVsRestClassifier(model))
model_list

[OneVsRestClassifier(estimator=KNeighborsClassifier()),
 OneVsRestClassifier(estimator=LogisticRegression()),
 OneVsRestClassifier(estimator=SVC()),
 OneVsRestClassifier(estimator=RandomForestClassifier())]

In [27]:
print("The 4 Estimator we used:")
print("1.KNeighborsClassifier\n2.LogisticRegresssion\n3.SVC\n4.RandomForestClassifier\n")
for i in model_list:
    i.fit(X_train, y_train)
    print(f'{i} trained')

print("*"*60)
print("all models trained")

The 4 Estimator we used:
1.KNeighborsClassifier
2.LogisticRegresssion
3.SVC
4.RandomForestClassifier

OneVsRestClassifier(estimator=KNeighborsClassifier()) trained
OneVsRestClassifier(estimator=LogisticRegression()) trained
OneVsRestClassifier(estimator=SVC()) trained
OneVsRestClassifier(estimator=RandomForestClassifier()) trained
************************************************************
all models trained


In [28]:
for count, value in enumerate(model_list):
    print(f"Accuracy of {value} on training set :", model_list[count].score(X_train, y_train))
    print(f"Accuracy of {value} on test set :", model_list[count].score(X_test, y_test))
    print("*"*100)
    print("\n")

Accuracy of OneVsRestClassifier(estimator=KNeighborsClassifier()) on training set : 0.9492847854356307
Accuracy of OneVsRestClassifier(estimator=KNeighborsClassifier()) on test set : 0.8638132295719845
****************************************************************************************************


Accuracy of OneVsRestClassifier(estimator=LogisticRegression()) on training set : 1.0
Accuracy of OneVsRestClassifier(estimator=LogisticRegression()) on test set : 0.9922178988326849
****************************************************************************************************


Accuracy of OneVsRestClassifier(estimator=SVC()) on training set : 1.0
Accuracy of OneVsRestClassifier(estimator=SVC()) on test set : 1.0
****************************************************************************************************


Accuracy of OneVsRestClassifier(estimator=RandomForestClassifier()) on training set : 1.0
Accuracy of OneVsRestClassifier(estimator=RandomForestClassifier()) on test se

In [29]:
from sklearn.metrics import confusion_matrix as CM
from sklearn.metrics import classification_report

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [30]:
for count, value in enumerate(model_list):
    print(f'{value} classification report')
    print("-"*80)
    print(classification_report(y_test, model_list[count].predict(X_test)))
    print("*"*100)
    print(" ")

OneVsRestClassifier(estimator=KNeighborsClassifier()) classification report
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.38      1.00      0.55        21
           1       1.00      1.00      1.00         9
           2       1.00      0.33      0.50         6
           3       1.00      1.00      1.00        10
           4       1.00      0.86      0.92         7
           5       1.00      0.17      0.29         6
           6       1.00      0.60      0.75        10
           7       1.00      0.62      0.77         8
           8       1.00      1.00      1.00        14
           9       1.00      0.29      0.44         7
          10       1.00      1.00      1.00        10
          11       1.00      1.00      1.00         8
          12       1.00      0.64      0.78        11
          13       1.00      1.00      1.00        11
          14       1.00      0.6

In [33]:
svc = model_list[2]

In [35]:
enc.inverse_transform(svc.predict(word_vectorizer.transform(np.array(['i am a data scientist i had learn python pandas numpy machine learning deep learning i have a grat experience in machine learning deep learning and i had do many project like nlp image classification etc']))))

array(['Data Science'], dtype=object)

In [37]:
import pickle

In [38]:
pickle.dump(svc,open('svc_model','wb'))

In [39]:
pickle.dump(word_vectorizer,open('word_vectorizer','wb'))

In [40]:
pickle.dump(enc,open('Label_encoder','wb'))