# In [None]:
import pandas as pd
import os
import docx2txt
from pdfminer.high_level import extract_text
from sentence_transformers import SentenceTransformer, util
import torch
import timeit
import openai
from flask import Flask, request, render_template

# Flask application object; the route decorator further down registers its view on it.
app = Flask(__name__)

# File extensions (lower-case, without the leading dot) accepted for upload.
ALLOWED_EXTENSIONS = {'pdf', 'docx'}


def allowed_file(filename):
    """Return True when *filename* ends in one of ALLOWED_EXTENSIONS.

    Only the text after the last dot is examined, and the comparison is
    case-insensitive. A name that contains no dot at all is rejected.
    """
    _stem, dot, suffix = filename.rpartition('.')
    if not dot:
        return False
    return suffix.lower() in ALLOWED_EXTENSIONS

def get_file_extension(file_path):
    """Return the extension of *file_path*, including the leading dot.

    The original letter case is kept; the result is an empty string when the
    path has no extension.
    """
    return os.path.splitext(file_path)[1]

def docx_to_text(docx_path):
    """Return the text that docx2txt extracts from the file at *docx_path*."""
    return docx2txt.process(docx_path)

def pdf_to_text(pdf_path):
    """Return the text that pdfminer's extract_text reads from *pdf_path*."""
    return extract_text(pdf_path)

def pdf_docx_to_text(file_path):
    """Extract plain text from a PDF or DOCX file.

    The converter is chosen from the file extension. The extension is
    compared case-insensitively so that, for example, ``resume.PDF`` is read
    as a PDF instead of being handed to the DOCX converter. Any extension
    other than ``.pdf`` is treated as DOCX.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text returned by the matching converter.
    """
    # allowed_file() accepts upper-case extensions, so normalise before comparing.
    extension = get_file_extension(file_path).lower()
    if extension == '.pdf':
        return pdf_to_text(file_path)
    return docx_to_text(file_path)

# Take the key from the OPENAI_API_KEY environment variable so a real secret
# never has to be written into this file; the placeholder is used only when
# the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "Your-openai-api-key")
def extract_resume(resume_text):
    """Ask the OpenAI completion endpoint to summarise a resume.

    The fixed instruction below is prepended to *resume_text* and sent to
    ``openai.Completion.create`` with the ``text-davinci-003`` engine.

    Args:
        resume_text: Full resume text (a string).

    Returns:
        The response object from ``openai.Completion.create``, unmodified.

    NOTE(review): this looks like the legacy Completion API; confirm it is
    supported by the installed openai package version.
    """
    instruction = "Extract education, key skills, experiences from this resume and limit the number of words to 150: "
    completion_args = {
        "engine": "text-davinci-003",
        "prompt": instruction + resume_text,
        "temperature": 0.0,
        "max_tokens": 400,
        "top_p": 1,
        "frequency_penalty": 1.0,
        "presence_penalty": 1.2,
        "stop": None,
        "n": 1,
    }
    return openai.Completion.create(**completion_args)

def similarity_score(sentence_model, education, skills_experiences, df_match_job):
    """Score every job posting against a resume with sentence embeddings.

    For each row of *df_match_job* the 'Education' and 'Key Requirements'
    cell values are joined into one text, embedded, and compared by cosine
    similarity with the embedding of the resume text
    (*education* + *skills_experiences*).

    Args:
        sentence_model: Model name or path passed to ``SentenceTransformer``.
        education: Education text taken from the resume.
        skills_experiences: Skills/experience text taken from the resume.
        df_match_job: DataFrame with 'Title', 'FullDescription', 'Education'
            and 'Key Requirements' columns.

    Returns:
        A DataFrame with columns 'Title', 'Description' and 'Final Score',
        one row per job in the input order. 'Title' and 'Description' hold
        one-row Series taken from the input (unchanged from the previous
        behaviour, because the caller in this file parses their string form);
        'Final Score' is a float.
    """
    columns = ['Title', 'Description', 'Final Score']
    if len(df_match_job) == 0:
        return pd.DataFrame([], columns=columns)

    def _as_text(value):
        # Missing cells arrive as NaN (a float that is not equal to itself).
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    # Embed the raw cell values. Stringifying a Series instead would leak the
    # row index, the "Name: ..., dtype: ..." footer and any display-width
    # truncation into the text being embedded.
    job_texts = [
        f"{_as_text(job_education)} {_as_text(job_requirements)}".strip()
        for job_education, job_requirements in zip(
            df_match_job['Education'], df_match_job['Key Requirements']
        )
    ]
    resume_text = f"{education} {skills_experiences}".strip()

    model = SentenceTransformer(sentence_model)
    # The resume embedding is the same for every job: compute it once, and
    # encode all job texts in a single batch.
    resume_embedding = model.encode(resume_text)
    job_embeddings = model.encode(job_texts)

    # cos_sim yields an (n_jobs, 1) tensor; flatten it to one float per job.
    scores = util.cos_sim(job_embeddings, resume_embedding).reshape(-1).tolist()

    rows = [
        [df_match_job.iloc[[i]]['Title'], df_match_job.iloc[[i]]['FullDescription'], score]
        for i, score in enumerate(scores)
    ]
    return pd.DataFrame(rows, columns=columns)

def _parse_fraction(raw_value):
    """Convert the submitted form value to a finite float, or None if invalid."""
    import math

    try:
        value = float(raw_value)
    except (TypeError, ValueError):
        return None
    return value if math.isfinite(value) else None


def _split_resume_sections(section):
    """Split the extracted resume summary into (education, skills_experiences).

    The summary is expected to contain an 'Education' heading followed by a
    'Key Skills' or 'Skills' heading. When that layout is not found, the whole
    summary is returned as the skills text so matching can still proceed.
    """
    education_marker = 'Education'
    # Prefer the longer heading so its 'Key ' prefix does not end up in the
    # education text.
    skills_marker = 'Key Skills' if 'Key Skills' in section else 'Skills'
    education_at = section.find(education_marker)
    skills_at = section.find(skills_marker)
    if education_at == -1 or skills_at == -1 or skills_at < education_at:
        return '', section.strip()

    def _body(text):
        # Drop the ':' that usually follows a heading, plus surrounding blanks.
        return text.strip().lstrip(':').strip()

    education = _body(section[education_at + len(education_marker):skills_at])
    skills_experiences = _body(section[skills_at + len(skills_marker):])
    return education, skills_experiences


def _cell_text(cell):
    """Return the plain text of a result cell (a scalar or a one-row Series)."""
    if isinstance(cell, pd.Series):
        cell = cell.iloc[0] if len(cell) else ''
    return str(cell).strip()


@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Render the upload page and, on POST, match the uploaded resume to jobs.

    A POST must carry a 'file' part (PDF or DOCX) and a 'number' form field
    giving the fraction of the job list to show. The resume is saved under
    ``file_save/``, converted to text, summarised by ``extract_resume`` and
    scored against ``df_match_job.csv``; the best-scoring jobs are listed.

    Returns:
        The rendered ``upload.html`` template with ``output`` set to either
        the result listing or a message describing why nothing was processed.
    """
    if request.method != 'POST':
        return render_template("upload.html", output='')

    # Validate the request up front and return early, so an earlier message is
    # never overwritten and a missing field never raises.
    if 'file' not in request.files:
        return render_template("upload.html", output='No file part in the request')

    file = request.files['file']
    # Browsers submit an empty filename when no file was selected.
    if not file.filename:
        return render_template("upload.html", output='No selected file')

    # Keep only the final path component: the filename is client-supplied and
    # must not be able to point outside the upload folder.
    safe_name = os.path.basename(file.filename.replace('\\', '/'))
    if not allowed_file(safe_name):
        message = 'Invalid file type. Allowed file types are: {}'.format(
            ', '.join(sorted(ALLOWED_EXTENSIONS))
        )
        return render_template("upload.html", output=message)

    percentage_value = _parse_fraction(request.form.get('number'))
    if percentage_value is None:
        return render_template(
            "upload.html", output='Invalid number: please enter a numeric value'
        )

    os.makedirs('file_save', exist_ok=True)
    file_path = os.path.join('file_save', safe_name)
    file.save(file_path)

    resume_text = pdf_docx_to_text(file_path)
    df_match_job = pd.read_csv('df_match_job.csv')

    # Summarise the resume, then split the summary into its two parts.
    response = extract_resume(resume_text)
    section = response.choices[0].text.strip()
    education, skills_experiences = _split_resume_sections(section)

    # Scoring may stringify pandas objects; lift the column-width limit only
    # for that call instead of changing the global pandas option.
    with pd.option_context('display.max_colwidth', None):
        result = similarity_score(
            'sentence-transformers/paraphrase-MiniLM-L3-v2',
            education,
            skills_experiences,
            df_match_job,
        )

    # Rank all jobs first, then keep the requested share of the best matches.
    top_n = max(0, min(len(result), int(len(df_match_job) * percentage_value)))
    best = result.sort_values(by=['Final Score'], ascending=False).head(top_n)

    entries = [
        'Title: ' + _cell_text(title) + '\n'
        + 'Description:\n' + _cell_text(description)
        + '\n\n'
        for title, description in zip(best['Title'], best['Description'])
    ]
    message = 'File uploaded successfully, see the results below:\n\n\n' + ''.join(entries)
    return render_template("upload.html", output=message)


# Start Flask's built-in server, with debug mode on, only when run as a script.
if __name__ == '__main__':
    app.run(debug=True)