# preprocessing

### lemma tagger

In [None]:
import nltk
from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected: ``J`` -> adjective,
    ``N`` -> noun, ``V`` -> verb, ``R`` -> adverb. Any other tag falls
    back to noun.
    """
    _, full_tag = nltk.pos_tag([word])[0]
    initial = full_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun.
    return wordnet.NOUN

### file processing

In [None]:
from pathlib import Path
import re
def get_content_as_string(document_path):
    """Read a text file and reduce it to lowercase ASCII words.

    Args:
        document_path: Path (str or path-like) of the text file to read.
            The file is decoded with the default encoding of
            ``Path.read_text()``.

    Returns:
        The file content in lowercase, where every run of characters other
        than ``A-Z``/``a-z`` (newlines, punctuation, digits, underscores,
        non-ASCII letters) has been collapsed into a single space. Leading
        and trailing spaces are not stripped.
    """
    txt = Path(document_path).read_text()
    # A single pass is sufficient: newlines and non-word characters are
    # themselves non-alphabetic, so separate passes for them add nothing.
    # The raw string avoids invalid-escape warnings in the pattern.
    txt = re.sub(r'[^A-Za-z]+', ' ', txt)
    return txt.lower()

### nltk tokenizer

In [None]:
import nltk
def tokenize_document(text_file):
    """Split ``text_file`` (a text string) into tokens with ``nltk.word_tokenize``."""
    return nltk.word_tokenize(text_file)

def tag_tokens(tokens):
    """Return the result of ``nltk.pos_tag`` applied to ``tokens``."""
    return nltk.pos_tag(tokens)

### tf_idf_lemmatizer

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords as stp

# Shared lemmatizer and the default TfidfVectorizer analyzer, created once
# and reused by every stemmed_words() call.
lemmatizer = WordNetLemmatizer()
analyzer = TfidfVectorizer().build_analyzer()


def stemmed_words(doc):
    """Yield the lemmatized, stop-word-free tokens of ``doc``.

    Intended to be passed as the ``analyzer`` callable of a scikit-learn
    vectorizer. Despite the name, tokens are lemmatized (WordNet), not
    stemmed.

    Args:
        doc: Text of one document.

    Returns:
        A generator of lemmas: each token produced by the default
        ``TfidfVectorizer`` analyzer that is not an English stop word,
        lemmatized using the POS returned by ``get_wordnet_pos``.
    """
    # Build the stop-word set once per call rather than once per token;
    # stp.words() returns a list, so the set also makes lookups O(1).
    stop_words = set(stp.words('english'))
    return (
        lemmatizer.lemmatize(w, get_wordnet_pos(w))
        for w in analyzer(doc)
        if w not in stop_words
    )


# Text_processing

### cv_cosine_similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as stp
from sklearn.feature_extraction.text import CountVectorizer


def get_binay_cosine_similarity(compare_doc,doc_corpus):
    """Score each corpus document against ``compare_doc`` using binary term vectors.

    The vocabulary is learnt from ``compare_doc`` alone (via
    ``stemmed_words``), so only terms that occur in it contribute to the
    score. The learnt feature names are printed as a side effect.

    Args:
        compare_doc: Text of the reference document.
        doc_corpus: Iterable of document texts to compare with it.

    Returns:
        A list of cosine similarities, one per entry of ``doc_corpus`` and
        in the same order. Empty when ``doc_corpus`` is empty.
    """
    count_vect = CountVectorizer(binary=True, analyzer=stemmed_words)
    req_vector = count_vect.fit_transform([compare_doc])

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases that lack it.
    if hasattr(count_vect, 'get_feature_names_out'):
        feature_names = list(count_vect.get_feature_names_out())
    else:
        feature_names = count_vect.get_feature_names()
    print('Features are:' , feature_names)

    corpus = list(doc_corpus)
    if not corpus:
        # cosine_similarity rejects a matrix with zero rows.
        return []

    # Keep the matrices sparse: .todense() yields np.matrix, which recent
    # scikit-learn versions refuse, and one call scores every document.
    resume_vectors = count_vect.transform(corpus)
    return cosine_similarity(req_vector, resume_vectors)[0].tolist()


### tf_idf_cosine_similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as stp

def get_tf_idf_cosine_similarity(compare_doc,doc_corpus):
    """Score each corpus document against ``compare_doc`` using TF-IDF vectors.

    The vectorizer is fitted on ``compare_doc`` alone (tokenised with
    ``stemmed_words``), so only terms that occur in it contribute to the
    score.

    Args:
        compare_doc: Text of the reference document.
        doc_corpus: Iterable of document texts to compare with it.

    Returns:
        A list of cosine similarities, one per entry of ``doc_corpus`` and
        in the same order. Empty when ``doc_corpus`` is empty.
    """
    tf_idf_vect = TfidfVectorizer(analyzer=stemmed_words)
    req_vector = tf_idf_vect.fit_transform([compare_doc])

    corpus = list(doc_corpus)
    if not corpus:
        # cosine_similarity rejects a matrix with zero rows.
        return []

    # Keep the matrices sparse: .todense() yields np.matrix, which recent
    # scikit-learn versions refuse, and one call scores every document.
    resume_vectors = tf_idf_vect.transform(corpus)
    return cosine_similarity(req_vector, resume_vectors)[0].tolist()

# processing 

### resume_matcher

In [None]:
# Tagger model needed by nltk.pos_tag (called in get_wordnet_pos and tag_tokens).
nltk.download('averaged_perceptron_tagger')

In [None]:
# WordNet corpus needed by WordNetLemmatizer (used in stemmed_words).
nltk.download('wordnet')

In [None]:
# Punkt tokenizer models needed by nltk.word_tokenize (called in tokenize_document).
nltk.download('punkt')
# Stop-word lists needed by stp.words('english') in stemmed_words; without
# this corpus that call raises LookupError on a fresh NLTK installation.
nltk.download('stopwords')

In [None]:
import os
def process_files(req_document,resume_docs):
    """Rank resume files by TF-IDF cosine similarity to a requirement file.

    Args:
        req_document: Path of the requirement / job-description text file.
        resume_docs: Sequence of paths of the resume text files.

    Returns:
        A list of ``[file_name, percentage]`` lists sorted from the most to
        the least similar resume, where ``file_name`` is the base name of
        the resume path and ``percentage`` is the similarity formatted with
        ``"{:.0%}"``. The raw similarity list is printed as a side effect.
    """
    requirement_text = get_content_as_string(req_document)
    resume_texts = [get_content_as_string(path) for path in resume_docs]

    scores = get_tf_idf_cosine_similarity(requirement_text, resume_texts)
    print(scores)

    # Highest similarity first; only the score takes part in the ordering.
    ranked = sorted(
        zip(scores, resume_docs),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return [
        [os.path.basename(path), "{:.0%}".format(score)]
        for score, path in ranked
    ]
    


if __name__ == "__main__":
    # Requirement document that every resume is scored against.
    req_document = '/home/ashok/Desktop/GEP_Techathon/jobdescription/req.txt'
    # Resume files to rank.
    resume_docs = [
        '/home/ashok/Desktop/GEP_Techathon/resume/resume1.txt',
        '/home/ashok/Desktop/GEP_Techathon/resume/resume2.txt',
        '/home/ashok/Desktop/GEP_Techathon/resume/resume3.txt',
        '/home/ashok/Desktop/GEP_Techathon/resume/resume4.txt',
        '/home/ashok/Desktop/GEP_Techathon/resume/resume5.txt',
    ]
    final_doc_rating_list = process_files(req_document, resume_docs)

In [None]:
# Notebook cell output: the ranked [file name, match percentage] pairs returned by process_files.
final_doc_rating_list

# Name Extraction from Resume

In [None]:
import spacy
from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# A name is approximated as two consecutive proper nouns. The pattern is
# registered once here: calling matcher.add() inside extract_name() would
# append another copy of it to this shared matcher on every call.
matcher.add('NAME', [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]])


def extract_name(resume_text):
    """Return the first pair of consecutive proper nouns in ``resume_text``.

    Args:
        resume_text: Plain text of a resume.

    Returns:
        The text of the first match reported by the matcher, or ``None``
        when the text contains no two adjacent proper-noun tokens.
    """
    nlp_text = nlp(resume_text)
    matches = matcher(nlp_text)
    if not matches:
        return None
    _, start, end = matches[0]
    return nlp_text[start:end].text

In [None]:
# NOTE(review): no code visible in this file uses this resource; presumably
# it was fetched for nltk.ne_chunk-based name extraction. Confirm it is still needed.
nltk.download('maxent_ne_chunker')

In [None]:
# NOTE(review): no code visible in this file uses this corpus; presumably it
# accompanies the 'maxent_ne_chunker' download. Confirm it is still needed.
nltk.download('words')

In [None]:
from pathlib import Path
def get_resume_txt(document_path):
    """Read the file at ``document_path`` and return it as a single line.

    Every newline character is replaced by one space; nothing else in the
    text (case, punctuation, digits) is altered.
    """
    raw_text = Path(document_path).read_text()
    return ' '.join(raw_text.split('\n'))

In [None]:
# Resume files whose candidate names are extracted below.
resume_docs = [
    '/home/ashok/Desktop/GEP_Techathon/resume/resume1.txt',
    '/home/ashok/Desktop/GEP_Techathon/resume/resume2.txt',
    '/home/ashok/Desktop/GEP_Techathon/resume/resume3.txt',
    '/home/ashok/Desktop/GEP_Techathon/resume/resume4.txt',
    '/home/ashok/Desktop/GEP_Techathon/resume/resume5.txt',
]

# Print what extract_name returns for each resume.
for doc in resume_docs:
    resume_txt = get_resume_txt(doc)
    name = extract_name(resume_txt)
    print(name)

# Email Extraction from Resume

In [None]:
# Matched case-insensitively: the character classes list lowercase letters
# only, so without the flag an address such as "John.Doe@example.com" would
# be truncated to "oe@example.com" and "a@Example.COM" would be missed.
EMAIL_REG = re.compile(r'[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+', re.IGNORECASE)


def extract_emails(resume_text):
    """Return every e-mail-like substring of ``resume_text``.

    Args:
        resume_text: Text to search.

    Returns:
        A list of the matches of ``EMAIL_REG`` in order of appearance, with
        their original letter case; empty when nothing matches.
    """
    return EMAIL_REG.findall(resume_text)

In [None]:
# Print the first e-mail address found in each resume.
for doc in resume_docs:
    resume_txt = get_resume_txt(doc)
    email = extract_emails(resume_txt)
    # A resume without any address yields an empty list; print None for it
    # instead of letting email[0] raise IndexError and abort the loop.
    print(email[0] if email else None)