<a href="https://colab.research.google.com/github/Akash-Rayhan/Resume-Filtering-System/blob/main/notebooks/Random_Forest_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import nltk
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
dataframe = pd.read_csv('/content/drive/MyDrive/Datasets/resume_dataset/Resume/Resume.csv')
dataframe.drop(columns = ['ID', 'Resume_html'], inplace = True)
dataframe.head()

Unnamed: 0,Resume_str,Category
0,HR ADMINISTRATOR/MARKETING ASSOCIATE\...,HR
1,"HR SPECIALIST, US HR OPERATIONS ...",HR
2,HR DIRECTOR Summary Over 2...,HR
3,HR SPECIALIST Summary Dedica...,HR
4,HR MANAGER Skill Highlights ...,HR


In [22]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()
def clean_text(text):
    '''Make text lowercase,remove extra whitespaces, remove links,remove punctuation
    and remove words containing numbers.'''
    text = str(text).lower()
    text = re.sub('\s+', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def preprocess_data(text):
    # Clean puntuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stemm all the words in the sentence
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split(' '))

    return text

In [23]:
dataframe['Resume_str'] = dataframe['Resume_str'].apply(preprocess_data)

In [24]:
corpus = set() # As we are storing unique words, set doesnot store duplicate data

def build_corpus(text):
  """ This function creates vocabulary collecting unique words from this dataset
  """
  tokens = word_tokenize(text)
  for token in tokens:
    corpus.add(token)

dataframe['Resume_str'].apply(build_corpus)

0       None
1       None
2       None
3       None
4       None
        ... 
2479    None
2480    None
2481    None
2482    None
2483    None
Name: Resume_str, Length: 2484, dtype: object

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer


features = dataframe['Resume_str'].values
targets = dataframe['Category'].values

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    vocabulary=corpus)
WordFeatures = word_vectorizer.fit_transform(features)



In [27]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(WordFeatures,targets,random_state=42, test_size=0.2,
                                                 shuffle=True, stratify=targets)

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

RFC = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6],
    'criterion' :['gini', 'entropy']
}

grid = GridSearchCV(estimator=RFC, param_grid=param_grid, cv= 5, scoring='accuracy', return_train_score=False, verbose=1, n_jobs=-1, refit=True)
grid_search = grid.fit(X_train, y_train)

print(grid_search.best_params_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 200}


In [29]:
print('Accuracy of RandomForest Classifier on training set: {:.2f}'.format(grid.score(X_train, y_train)))
print('Accuracy of RandomForest Classifier on test set:     {:.2f}'.format(grid.score(X_test, y_test)))

Accuracy of RandomForest Classifier on training set: 0.72
Accuracy of RandomForest Classifier on test set:     0.57


In [30]:
from sklearn import metrics
predictions = grid.predict(X_test)
print("\n Classification report for classifier %s:\n%s\n" % (grid, metrics.classification_report(y_test, predictions)))


 Classification report for classifier GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6],
                         'max_features': ['sqrt', 'log2'],
                         'n_estimators': [50, 100, 200]},
             scoring='accuracy', verbose=1):
                        precision    recall  f1-score   support

            ACCOUNTANT       0.43      0.96      0.59        24
              ADVOCATE       0.85      0.71      0.77        24
           AGRICULTURE       0.00      0.00      0.00        13
               APPAREL       1.00      0.05      0.10        19
                  ARTS       0.00      0.00      0.00        21
            AUTOMOBILE       0.00      0.00      0.00         7
              AVIATION       0.62      0.62      0.62        24
               BANKING       0.83      0.65      0.73        23
                   BPO       0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
