In [4]:
class FiniteStateAcceptor:
    """A small finite-state acceptor that reads a word one symbol at a time.

    Transitions are stored per source state as a list of ``(to_state, symbol)``
    pairs in insertion order. When several transitions from the same state
    carry the same symbol, the one added first is followed.
    """

    def __init__(self):
        self.states = set()
        self.initial_state = None
        self.accepting_states = set()
        # from_state -> [(to_state, symbol), ...] in the order they were added
        self.transitions = {}

    def add_state(self, state):
        """Register ``state`` in the set of known states."""
        self.states.add(state)

    def set_initial_state(self, state):
        """Choose the state in which ``accept`` starts reading."""
        self.initial_state = state

    def add_accepting_state(self, state):
        """Mark ``state`` as accepting."""
        self.accepting_states.add(state)

    def add_transition(self, from_state, to_state, symbol):
        """Add an edge ``from_state --symbol--> to_state``."""
        transition = (to_state, symbol)
        self.transitions.setdefault(from_state, []).append(transition)

    def accept(self, word):
        """Return True if reading ``word`` from the initial state ends in an accepting state.

        Returns False as soon as the current state has no outgoing transition
        for the next symbol.
        """
        current_state = self.initial_state
        for symbol in word:
            # Single pass over the outgoing edges; the first match is taken.
            for to_state, label in self.transitions.get(current_state, ()):
                if label == symbol:
                    current_state = to_state
                    break
            else:
                # No edge for this symbol from the current state.
                return False
        return current_state in self.accepting_states

In [93]:
# Create a finite state acceptor for the word "javascript"
# A word of n letters needs n + 1 states: q0 (nothing read yet) up to qn
# (whole word read). Reusing states for the repeated 'a' would create a loop
# and make the acceptor also match strings such as "jascript".
js_target = 'javascript'
js_state_names = [f'q{i}' for i in range(len(js_target) + 1)]

js_fsa = FiniteStateAcceptor()
for state_name in js_state_names:
    js_fsa.add_state(state_name)

js_fsa.set_initial_state(js_state_names[0])
js_fsa.add_accepting_state(js_state_names[-1])

# One transition per letter: q(i) --letter--> q(i+1)
for position, letter in enumerate(js_target):
    js_fsa.add_transition(js_state_names[position], js_state_names[position + 1], letter)

# Guard against the acceptor matching anything other than the exact word
assert not js_fsa.accept('jascript')
assert not js_fsa.accept('javavascript')

word,wrong_word = 'javascript','jawascript'
if js_fsa.accept(word):
    print(f'The word "{word}" is accepted.')
else:
    print(f'The word "{word}" is not accepted.')

if js_fsa.accept(wrong_word):
    print(f'The word "{wrong_word}" is accepted.')
else:
    print(f'The word "{wrong_word}" is not accepted.')

The word "javascript" is accepted.
The word "jawascript" is not accepted.


In [53]:
# Create a finite state acceptor for the word "html"
html_fsa = FiniteStateAcceptor()
html_fsa.add_state('q0') # start: nothing read yet
html_fsa.add_state('q1') # after 'h'
html_fsa.add_state('q2') # after 'ht'
html_fsa.add_state('q3') # after 'htm'
html_fsa.add_state('q4') # after 'html' (accepting state, must be registered too)

html_fsa.set_initial_state('q0')
html_fsa.add_accepting_state('q4')

html_fsa.add_transition('q0', 'q1', 'h')
html_fsa.add_transition('q1', 'q2', 't')
html_fsa.add_transition('q2', 'q3', 'm')
html_fsa.add_transition('q3', 'q4', 'l')

word = 'html'
if html_fsa.accept(word):
    print(f'The word "{word}" is accepted.')
else:
    print(f'The word "{word}" is not accepted.')

The word "html" is accepted.


In [54]:
# Create a finite state acceptor for the word "css"
css_fsa = FiniteStateAcceptor()
css_fsa.add_state('q0') # start: nothing read yet
css_fsa.add_state('q1') # after 'c'
css_fsa.add_state('q2') # after 'cs'
css_fsa.add_state('q3') # after 'css' (accepting state, must be registered too)

css_fsa.set_initial_state('q0')
css_fsa.add_accepting_state('q3')

css_fsa.add_transition('q0', 'q1', 'c')
css_fsa.add_transition('q1', 'q2', 's')
css_fsa.add_transition('q2', 'q3', 's')

word = 'css'
if css_fsa.accept(word):
    print(f'The word "{word}" is accepted.')
else:
    print(f'The word "{word}" is not accepted.')

The word "css" is accepted.


In [55]:
# Create a finite state acceptor for the word "sql"
sql_fsa = FiniteStateAcceptor()
sql_fsa.add_state('q0') # start: nothing read yet
sql_fsa.add_state('q1') # after 's'
sql_fsa.add_state('q2') # after 'sq'
sql_fsa.add_state('q3') # after 'sql' (accepting state, must be registered too)

sql_fsa.set_initial_state('q0')
sql_fsa.add_accepting_state('q3')

sql_fsa.add_transition('q0', 'q1', 's')
sql_fsa.add_transition('q1', 'q2', 'q')
sql_fsa.add_transition('q2', 'q3', 'l')

word = 'sql'
if sql_fsa.accept(word):
    print(f'The word "{word}" is accepted.')
else:
    print(f'The word "{word}" is not accepted.')

The word "sql" is accepted.


In [68]:
def preprocess_resume(resume_text):
    """Normalise raw resume text into a space-separated string of lemmas.

    The text is lower-cased, every character that is not a letter, digit or
    whitespace is replaced by a space, and whitespace runs are collapsed to a
    single space. The result is tokenised, English stopwords are dropped, and
    the remaining tokens are lemmatised before being joined with spaces.
    """
    lowered = resume_text.lower()

    # Punctuation is replaced by a space (not deleted) so neighbouring words stay apart.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)
    single_spaced = re.sub(r'\s+', ' ', alnum_only)

    words = word_tokenize(single_spaced)

    english_stopwords = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Filter and lemmatise in one pass; stopwords are checked before lemmatisation.
    lemmas = [
        lemmatizer.lemmatize(candidate)
        for candidate in words
        if candidate not in english_stopwords
    ]

    return ' '.join(lemmas)

In [69]:
import csv
import re

# pandas is imported here because this cell uses `pd` and no import of it is
# visible in the notebook; without it a fresh-kernel run fails with NameError.
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Load the CSV file into a DataFrame
df = pd.read_csv('../csv/UpdatedResumeDataSet.csv')

In [70]:
# Preprocess
# Run every raw resume through preprocess_resume and store the result in a new column
df['Resume_cleaned'] = df['Resume'].apply(preprocess_resume)

In [72]:
# Display the frame to inspect the freshly added 'Resume_cleaned' column
df

Unnamed: 0,Category,Resume,Resume_cleaned
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may 2013 may 2017 b e uit rgp...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...
...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...


In [78]:
# Flag resumes whose cleaned text contains "javascript" anywhere (case-insensitive substring search)
df['regex_js_present'] = df['Resume_cleaned'].map(
    lambda text: re.search(r'javascript', text, re.IGNORECASE) is not None
)
df

Unnamed: 0,Category,Resume,Resume_cleaned,regex_js_present,fsa_js_present
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,True,True
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may 2013 may 2017 b e uit rgp...,False,False
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,False,False
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...,False,False
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...,False,False
...,...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...,False,False
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...,False,False
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...,False,False
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...,False,False


In [79]:
# Flag resumes in which at least one whitespace-separated token is accepted by js_fsa
df['fsa_js_present'] = df['Resume_cleaned'].apply(
    lambda text: any(js_fsa.accept(token) for token in text.split())
)
df

Unnamed: 0,Category,Resume,Resume_cleaned,regex_js_present,fsa_js_present
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,True,True
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may 2013 may 2017 b e uit rgp...,False,False
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,False,False
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...,False,False
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...,False,False
...,...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...,False,False
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...,False,False
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...,False,False
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...,False,False


In [80]:
# Flag resumes whose cleaned text contains "html" anywhere (case-insensitive substring search)
df['regex_html_present'] = df['Resume_cleaned'].map(
    lambda text: re.search(r'html', text, re.IGNORECASE) is not None
)
df

Unnamed: 0,Category,Resume,Resume_cleaned,regex_js_present,fsa_js_present,regex_html_present
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,True,True,True
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may 2013 may 2017 b e uit rgp...,False,False,False
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,False,False,False
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...,False,False,False
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...,False,False,False
...,...,...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...,False,False,False
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...,False,False,False
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...,False,False,False
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...,False,False,False


In [81]:
# Flag resumes in which at least one whitespace-separated token is accepted by html_fsa
df['fsa_html_present'] = df['Resume_cleaned'].apply(
    lambda text: any(html_fsa.accept(token) for token in text.split())
)
df

Unnamed: 0,Category,Resume,Resume_cleaned,regex_js_present,fsa_js_present,regex_html_present,fsa_html_present
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,True,True,True,True
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may 2013 may 2017 b e uit rgp...,False,False,False,False
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,False,False,False,False
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...,False,False,False,False
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...,False,False,False,False
...,...,...,...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...,False,False,False,False
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...,False,False,False,False
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...,False,False,False,False
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...,False,False,False,False


In [82]:
# Flag resumes whose cleaned text contains "css" anywhere (case-insensitive substring search)
df['regex_css_present'] = df['Resume_cleaned'].map(
    lambda text: re.search(r'css', text, re.IGNORECASE) is not None
)
df

Unnamed: 0,Category,Resume,Resume_cleaned,regex_js_present,fsa_js_present,regex_html_present,fsa_html_present,regex_css_present
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,True,True,True,True,False
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may 2013 may 2017 b e uit rgp...,False,False,False,False,False
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,False,False,False,False,False
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...,False,False,False,False,False
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...,False,False,False,False,False
...,...,...,...,...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...,False,False,False,False,False
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...,False,False,False,False,False
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...,False,False,False,False,False
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...,False,False,False,False,False


In [83]:
# Flag resumes in which at least one whitespace-separated token is accepted by css_fsa
df['fsa_css_present'] = df['Resume_cleaned'].apply(
    lambda text: any(css_fsa.accept(token) for token in text.split())
)
df

Unnamed: 0,Category,Resume,Resume_cleaned,regex_js_present,fsa_js_present,regex_html_present,fsa_html_present,regex_css_present,fsa_css_present
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,True,True,True,True,False,False
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may 2013 may 2017 b e uit rgp...,False,False,False,False,False,False
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,False,False,False,False,False,False
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...,False,False,False,False,False,False
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...,False,False,False,False,False,False
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...,False,False,False,False,False,False
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...,False,False,False,False,False,False
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...,False,False,False,False,False,False


In [84]:
# Flag resumes whose cleaned text contains "sql" anywhere (case-insensitive substring search)
df['regex_sql_present'] = df['Resume_cleaned'].map(
    lambda text: re.search(r'sql', text, re.IGNORECASE) is not None
)
df

Unnamed: 0,Category,Resume,Resume_cleaned,regex_js_present,fsa_js_present,regex_html_present,fsa_html_present,regex_css_present,fsa_css_present,regex_sql_present
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,True,True,True,True,False,False,True
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may 2013 may 2017 b e uit rgp...,False,False,False,False,False,False,False
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,False,False,False,False,False,False,True
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...,False,False,False,False,False,False,True
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...,False,False,False,False,False,False,False
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...,False,False,False,False,False,False,False
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...,False,False,False,False,False,False,False
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...,False,False,False,False,False,False,False


In [85]:
# Flag resumes in which at least one whitespace-separated token is accepted by sql_fsa
df['fsa_sql_present'] = df['Resume_cleaned'].apply(
    lambda text: any(sql_fsa.accept(token) for token in text.split())
)
df

Unnamed: 0,Category,Resume,Resume_cleaned,regex_js_present,fsa_js_present,regex_html_present,fsa_html_present,regex_css_present,fsa_css_present,regex_sql_present,fsa_sql_present
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming language python panda numpy ...,True,True,True,True,False,False,True,True
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education detail may 2013 may 2017 b e uit rgp...,False,False,False,False,False,False,False,False
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area interest deep learning control system des...,False,False,False,False,False,False,True,True
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill r python sap hana tableau sap hana sql s...,False,False,False,False,False,False,True,True
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education detail mca ymcaust faridabad haryana...,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
957,Testing,Computer Skills: â¢ Proficient in MS office (...,computer skill proficient m office word basic ...,False,False,False,False,False,False,False,False
958,Testing,â Willingness to accept the challenges. â ...,willingness accept challenge positive thinking...,False,False,False,False,False,False,False,False
959,Testing,"PERSONAL SKILLS â¢ Quick learner, â¢ Eagerne...",personal skill quick learner eagerness learn n...,False,False,False,False,False,False,False,False
960,Testing,COMPUTER SKILLS & SOFTWARE KNOWLEDGE MS-Power ...,computer skill software knowledge m power poin...,False,False,False,False,False,False,False,False


In [88]:
# Score the FSA detector against the regex detector for "javascript".
# The regex column is treated as the expected label, the FSA column as the prediction.
regex_js_present = df['regex_js_present']
fsa_js_present = df['fsa_js_present']

# Initialize confusion-matrix counts
true_positive = 0
false_positive = 0
false_negative = 0
true_negative = 0

# Both columns come from the same frame, so pairing them row by row is safe.
for expected_output, predicted_output in zip(regex_js_present, fsa_js_present):
    if predicted_output == expected_output:
        if predicted_output:
            true_positive += 1
        else:
            true_negative += 1
    else:
        if predicted_output:
            false_positive += 1
        else:
            false_negative += 1

correct_predictions = true_positive + true_negative

# Calculate accuracy, precision, recall, and F1-score.
# Each ratio falls back to 0.0 when its denominator is zero (e.g. no positives at all).
total_rows = len(regex_js_present)
predicted_positive = true_positive + false_positive
actual_positive = true_positive + false_negative

accuracy = correct_predictions / total_rows if total_rows else 0.0
precision = true_positive / predicted_positive if predicted_positive else 0.0
recall = true_positive / actual_positive if actual_positive else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

# Print the metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1_score:.2f}')

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00


In [89]:
# Score the FSA detector against the regex detector for "html".
# The regex column is treated as the expected label, the FSA column as the prediction.
regex_html_present = df['regex_html_present']
fsa_html_present = df['fsa_html_present']

# Initialize confusion-matrix counts
true_positive = 0
false_positive = 0
false_negative = 0
true_negative = 0

# Both columns come from the same frame, so pairing them row by row is safe.
for expected_output, predicted_output in zip(regex_html_present, fsa_html_present):
    if predicted_output == expected_output:
        if predicted_output:
            true_positive += 1
        else:
            true_negative += 1
    else:
        if predicted_output:
            false_positive += 1
        else:
            false_negative += 1

correct_predictions = true_positive + true_negative

# Calculate accuracy, precision, recall, and F1-score.
# The row count comes from this cell's own column, not from another cell's variable.
# Each ratio falls back to 0.0 when its denominator is zero (e.g. no positives at all).
total_rows = len(regex_html_present)
predicted_positive = true_positive + false_positive
actual_positive = true_positive + false_negative

accuracy = correct_predictions / total_rows if total_rows else 0.0
precision = true_positive / predicted_positive if predicted_positive else 0.0
recall = true_positive / actual_positive if actual_positive else 0.0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0

# Print the metrics
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1_score:.2f}')

Accuracy: 1.00
Precision: 1.00
Recall: 1.00
F1-score: 1.00


In [86]:
# Write the annotated frame, including every column added above, to disk.
# NOTE(review): to_csv also writes the row index as the first column by default;
# confirm whether index=False is wanted for downstream readers.
df.to_csv('../csv/resume_parsed.csv')