In [1]:

from bs4 import BeautifulSoup
import os,os.path
import gensim,re 
import matplotlib.pyplot as plt
import operator
import pandas as pd
import math
# Word tokenizer: matches maximal runs of word characters.
re_c = re.compile(r'\w+')
import spacy
import numpy as np
from spacy.vectors import Vectors
# Newly allocated vector table with shape (10000, 300).
vectors = Vectors(shape=(10000, 300))
nlp = spacy.load('en_core_web_sm')
# NOTE(review): this replaces the loaded pipeline's vector table with the new
# table above, which nothing in this cell fills in -- confirm that the
# similarity() calls further down are meant to run against it.
nlp.vocab.vectors = vectors
import logging
# Emit INFO and above, each record prefixed with its timestamp and level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
def get_sentences():
    """Split the paragraphs in ``para.txt`` into sentences, one per line.

    Every line of ``para.txt`` that starts with ``<p>`` (ignoring surrounding
    whitespace) is parsed as HTML; the text of its ``<p>`` element is run
    through the module-level spaCy pipeline ``nlp`` and each detected sentence
    is written on its own line to ``sentences.txt`` (which is overwritten).
    Lines that cannot be parsed are logged as warnings and skipped.
    """
    # Both handles share one `with` so they are closed even if an unexpected
    # error escapes the loop.
    with open('sentences.txt', 'w', encoding="utf8") as sent_file, \
            open("para.txt", encoding="utf8") as fobj:
        for line in fobj:
            stripped = line.strip()
            if not stripped.startswith('<p>'):
                continue
            try:
                soup = BeautifulSoup(stripped, "lxml")
                doc = nlp(soup.p.text)
            except Exception:
                # The offending line goes in as a %-style argument; passing it
                # as the message with an extra positional argument makes the
                # logging module fail while formatting the record.
                logging.warning("%r can't be parsed.", line)
                continue
            for each in doc.sents:
                sent_file.write(each.text + '\n')

In [None]:
# get_sentences()

In [3]:
class MySentences():
    """Re-iterable stream of tokenised sentences read from a text file.

    The file is re-opened on every iteration, so one instance can be walked
    any number of times.

    Parameters
    ----------
    path : str, optional
        File holding one sentence per line. Defaults to the sample corpus
        used by this notebook.
    """

    def __init__(self, path='sample_bitcoin.stackexchange_sentences.txt'):
        self.path = path

    def __iter__(self):
        """Yield each line of the file as a list of alphabetic tokens."""
        with open(self.path, 'r', encoding='utf-8') as fin:
            for line in fin:
                # Every character that is not an ASCII letter acts as a separator.
                review_text = re.sub("[^a-zA-Z]", " ", line)
                yield review_text.split()

In [4]:
# Corpus handle; the underlying file is only opened when `data` is iterated.
data = MySentences()

In [5]:
data

<__main__.MySentences at 0x265c0b02588>

In [None]:
from collections import defaultdict

# Histogram: number of tokens in a sentence -> how many sentences have that length.
d = defaultdict(int)
for each in data:
    sentence_length = len(each)
    d[sentence_length] += 1

In [None]:
type(d)

In [None]:

# Keep only the ten most frequent sentence lengths (this rebinds `d`),
# ordered from most to least common.
d = dict(sorted(d.items(), key=operator.itemgetter(1), reverse=True)[:10])
positions = range(len(d))
# Materialise the dict views: matplotlib expects sequences / array-likes and
# not every release accepts dict_keys / dict_values objects directly.
plt.bar(positions, list(d.values()), align='center')
plt.xticks(positions, list(d.keys()))
plt.ylabel("Number of sentences")
plt.xlabel("Number of words in that sentence")
plt.savefig('context.jpg')
plt.show()

In [None]:
# model = gensim.models.Word2Vec(data,workers=4, size=300, min_count = 1, window = 15, sample = 1e-3)

In [None]:
# model_name = "model"
# model.save(model_name)

In [None]:
# Load the Word2Vec model stored under the name 'model'. The training and
# saving cells above are commented out, so this file must already exist.
model = gensim.models.Word2Vec.load('model')

In [None]:
# When true, the cells below print their intermediate results.
flag_print = True
# NOTE(review): not referenced anywhere else in the visible notebook.
flag_clear = True
# A section's best keyword similarity must exceed this for a CV line to
# switch the current section.
threshold = 0.5

In [None]:
# Seed keywords for each CV section. A line of a CV is attributed to the
# section whose keywords it most resembles.
similar_to = {
    'edu' : ['education', 'study', 'academics', 'institute', 'school', 'college'],
    # The comma after 'role' matters: without it Python joins the adjacent
    # literals into the single keyword 'roleproject'.
    'exp' : ['job', 'internship', 'training', 'research', 'career', 'profession', 'role',
             'project', 'responsibility', 'description', 'work experience', 'workshop', 'conference'],
    'skill' : ['skill', 'languages', 'technology', 'framework', 'tools', 'database'],
    'extra' : ['introduction', 'intro', 'achievement', 'hobby', 'links', 'additional', 
               'personal', 'award', 'objective', 'miscellaneous', 'interest']
}

list_of_sections = similar_to.keys()
for section in list_of_sections:
    # Replace every keyword by the lemma of its first token, so that only the
    # first word of a multi-word entry such as 'work experience' is kept.
    similar_to[section] = [nlp(word)[0].lemma_ for word in similar_to[section]]

    if flag_print:
        print(section, similar_to[section])

In [None]:
def modify(word):
    """Normalise a raw word before it is matched against section keywords.

    Removes every character listed in ``symbols``, lower-cases what remains
    and lemmatises it with the module-level spaCy pipeline ``nlp``.

    Returns the lemma of the first token, or ``None`` when nothing is left
    after stripping, when that token is a stop word, or when processing
    fails for any reason.
    """
    # Raw literal: the backslash is itself one of the characters to strip,
    # and "\:" is not a valid escape sequence in a normal string.
    symbols = r'''~'`!@#$%^&*)(_+-=}{][|\:;",./<>?'''
    try:
        mod_word = ''.join(char.lower() for char in word if char not in symbols)

        # Nothing survived the stripping: skip the pipeline call entirely.
        if not mod_word:
            return None

        docx = nlp(mod_word)
        if docx[0].is_stop:
            return None
        return docx[0].lemma_
    except Exception:
        # Best effort by design: an unusable word is simply ignored.
        return None
    
if flag_print:
    # Spot-check modify() on punctuation, a stop word, mixed case and a hyphenated word.
    test_words = ['Hello!!', '.,<>', 'India', 'of', '..freedoM..', 'e-mail']
    for word in test_words:
        cleaned = modify(word)
        print(word, '--returned-->', cleaned)

In [None]:
def is_empty(line):
    """Return True when ``line`` contains no alphabetic character at all."""
    return not any(c.isalpha() for c in line)
      
if flag_print:
    test_words = ['.', '<.>', 'Speak', 'out', '"Eric"', 'freemail...']

    for word in test_words:
        # Print each sample next to is_empty()'s verdict for it.
        print(word, '--returned-->', is_empty(word))

In [None]:
# Attribute every line of every file under ./CVs to one of the sections in
# `list_of_sections` and collect the lemmatised text per section.
dict_of_data_series = {}
flag_print = False  # silences the per-file dump below, overriding the earlier setting
for file_name in os.listdir(os.getcwd()+'/CVs'):
    if flag_print:
        print('\n')
        print('*'*25)
        print(file_name)
        print('*'*25)
    # Until a line matches some section, text is filed under 'extra'.
    previous_section = 'extra'
    curr_data_series = pd.Series([""]*len(list_of_sections), index=list_of_sections)
    # `with` releases the handle even when a line raises part-way through the file.
    with open('CVs/'+file_name, 'r', encoding='latin-1') as main_file_handler:
        for line in main_file_handler:
            if (len(line.strip()) == 0 or is_empty(line)):
                continue
            # Reduce the line to the normalised words that modify() keeps.
            list_of_imp_words_in_line = [
                modified_word
                for modified_word in map(modify, re_c.findall(line))
                if modified_word
            ]
            doc = nlp(' '.join(list_of_imp_words_in_line))
            # Per section: best similarity between any keyword and any token of the line.
            section_value = {section: 0.0 for section in list_of_sections}
            section_value[None] = 0.0  # baseline the first candidate is compared against
            for token in doc:
                for section in list_of_sections:
                    for word in similar_to[section]:
                        word_token = doc.vocab[word]
                        section_value[section] = max(section_value[section], float(word_token.similarity(token)))
            # The highest-scoring section above `threshold` wins; None means
            # no section qualified and the current one stays in force.
            most_likely_section = None
            for section in list_of_sections:
                if (section_value[most_likely_section] < section_value[section] and section_value[section] > threshold):
                    most_likely_section = section
            if most_likely_section is not None:
                previous_section = most_likely_section
            try:
                docx = nlp(line)
            except Exception:
                # Best effort: a line the pipeline cannot process is dropped.
                # `Exception` (not a bare except) keeps a kernel interrupt working.
                continue
            # Append the line's non-stop-word lemmas to the current section.
            mod_line = ''.join(token.lemma_ + ' ' for token in docx if not token.is_stop)
            curr_data_series[previous_section] += mod_line
    dict_of_data_series[file_name] = curr_data_series
    if flag_print:
        print(curr_data_series)
# One column per CV file, one row per section.
data_frame = pd.DataFrame(dict_of_data_series)
# NOTE: a later cell reads 'prc_data.csv'; run the to_csv line below whenever
# the CVs have been reprocessed.
# data_frame.to_csv('prc_data.csv', sep='\t')
# data_frame.head()

In [None]:
def get_closest(word, n, w2v_model=None):
    """Return ``word`` together with its ``n`` nearest neighbours.

    Parameters
    ----------
    word : str
        Query word; lower-cased before the lookup.
    n : int
        Number of neighbours to request.
    w2v_model : optional
        Object to query: either something exposing its vectors as ``wv`` or
        an object that offers ``most_similar`` itself. Defaults to the
        module-level ``model``.

    Returns
    -------
    (list, list)
        Parallel lists of words and similarity scores. The query word always
        comes first with score 1; if the lookup raises ``KeyError`` (word not
        known to the model) it is the only entry.
    """
    word = word.lower()
    words = [word]
    similar_vals = [1]
    source = model if w2v_model is None else w2v_model
    # Query the keyed vectors when the object exposes them; calling
    # most_similar on the model object itself is deprecated in gensim.
    keyed_vectors = getattr(source, 'wv', source)
    try:
        similar_list = keyed_vectors.most_similar(positive=[word], topn=n)
    except KeyError:
        # Unknown word: fall back to the word on its own. Any other failure
        # is a real error and is allowed to surface.
        return words, similar_vals
    for neighbour, similarity in similar_list:
        words.append(neighbour)
        similar_vals.append(similarity)
    return words, similar_vals

In [None]:
# Read the tab-separated CV table and use its 'Unnamed: 0' column as the index.
cvs = pd.read_csv('prc_data.csv', sep='\t').set_index('Unnamed: 0')
# Free-text description that the CVs are ranked against.
prc_description = "python"

In [None]:
# Weight of every query word and of its nearest neighbours; weights of a word
# reached more than once are summed.
word_value = {}
similar_words_needed = 2
for word in prc_description.split():
    similar_words, similarity = get_closest(word, similar_words_needed)
    for similar_word, weight in zip(similar_words, similarity):
        word_value[similar_word] = word_value.get(similar_word, 0) + weight
        print(similar_word, word_value[similar_word])

In [None]:
no_of_cv = 150

# Collect the distinct words of each CV's 'skill' and 'exp' text once, rather
# than re-splitting both cells for every query word.
skill_row = cvs.loc['skill']
exp_row = cvs.loc['exp']
cv_words = []
for i in range(no_of_cv):
    words = set()
    # Positions past the last column simply contribute no words.
    if i < len(skill_row):
        for cell in (skill_row.iloc[i], exp_row.iloc[i]):
            # An empty section is read back from the CSV as NaN, not as a
            # string; treat it as "no words" so the other section still counts.
            if isinstance(cell, str):
                words.update(cell.split())
    cv_words.append(words)

count = {}
idf = {}
for word in word_value.keys():
    # Number of CVs mentioning the word, floored at 1 so the ratio below is defined.
    count[word] = sum(1 for words in cv_words if word in words) or 1
    idf[word] = math.log(no_of_cv/count[word])
print(count)
print(idf)

In [None]:
# Score of a CV = sum over query words of weight * term frequency * idf,
# where the term frequency is counted over its 'skill' and 'exp' text.
score = {}
skill_row = cvs.loc['skill']
exp_row = cvs.loc['exp']
for i in range(no_of_cv):
    score[i] = 0
    # Positions past the last column keep a score of 0.
    if i >= len(skill_row):
        continue
    words = []
    for cell in (skill_row.iloc[i], exp_row.iloc[i]):
        # An empty section is read back from the CSV as NaN, not as a string;
        # skip it instead of abandoning the whole CV.
        if isinstance(cell, str):
            words.extend(cell.split())
    for word in word_value.keys():
        tf = words.count(word)
        score[i] += word_value[word]*tf*idf[word]

In [None]:
# Print the CVs from highest to lowest score.
cv_names = list(cvs)  # column labels, i.e. the CV file names
sorted_list = sorted(((score[i], i) for i in range(no_of_cv)), reverse=True)
for s, i in sorted_list:
    # `no_of_cv` may exceed the number of CVs actually present in the table.
    if i >= len(cv_names):
        continue
    if cv_names[i] != '.DS_Store':
        print(cv_names[i], ':', s)