# Data Wrangling


## Wrong output cases:
- Output is invalid when a person held several positions at the same company (examples: FaikC, OmarD) **`DONE`**
- Output is invalid when the information for one entry is split across two pages (example: NedimC) **`DONE`**

These are tricky cases, but I am going to figure them out.

In [34]:
import spacy
import pandas as pd
import re
import en_core_web_sm
import slate3k as slate

In [35]:
def pdf_to_text(name):
    """
    Returns extracted text from pdf as list of strings.
    Each element of list is one page of pdf document.

    Parameters:
    name (str): File name without the `.pdf` extension. The file is read
        from the `Resumes/LinkedIn/` directory.

    Returns:
    The object built by `slate.PDF` from the opened file.
    """
    # The `with` block closes the file on exit, so no explicit close() call
    # is needed.
    with open('Resumes/LinkedIn/' + name + '.pdf', 'rb') as f:
        extracted_text = slate.PDF(f)

    return extracted_text

### Read pdf and concatenate pages in one string

In [36]:
nlp = en_core_web_sm.load() #load small spacy model

In [37]:
def concat_pages(filename):
    """
    Concatenate pages in one string, and returns that string.

    Parameters:
    filename (str): Name passed through to `pdf_to_text`.

    Returns:
    str: The pages returned by `pdf_to_text`, joined in order with no
        separator. Empty string when there are no pages.
    """
    # str.join builds the result in one pass instead of copying the growing
    # string on every `+=`.
    return "".join(pdf_to_text(filename))

In [38]:
import cv2
import pytesseract
# Tell pytesseract where the Tesseract executable lives.
# NOTE(review): hard-coded Windows install path -- adjust on other machines.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# OCR the resume image and run the recognised text through the spaCy pipeline.
# NOTE(review): cv2.imread is documented to return None instead of raising
# when the file cannot be read -- confirm the path if OCR fails here.
img = cv2.imread('Resumes/AmirH.png')
text = pytesseract.image_to_string(img)
doc = nlp(text)

In [39]:
#doc = nlp(concat_pages("ResadZ"))

**Here is the text extracted from the resume (in this run, the OCR output of the PNG image)**

In [40]:
text = doc.text

In [41]:
text

'feneled\n\nDee\n\neee\n\nTenens)\noe\nSerbian (Full Prof\n\nPeres\n\net ied\n\nrr See\n\n(tio)\n\ni Ce eat\nSer eo)\n\n \n\nAmir Hadzié\nYouTuber With 390.000+ Followers. Professional Web Developer.\nInfluencer of The Year 2017\n\nSummary\n\nFirst Bosnian YouTube Creator with 280.000+ subscribers on my\nchannel, and 50+ million views.\n\nProfessional Web Developer since 2009. | work with modern web\ntechnologies to achieve simple but powerful solutions. Microsoft\nCertified IT Professional and Solutions Developer (MCITP & MCSD).\n\nExperience\n\nYouTube\nEntertainer\nMarch 2010 - Present\n\nEntertainment\n\nAmilma Digital\nFounder & CEO\nMay 2009 - October 2016 (7 years 6 months)\n\nDASTO semtel\nNetwork Administrator\nMay 2011 - September 2011 (5 months)\n\nIntemet provider support and management. Brand development, web site\ntraffic growth, web site Ul and advertising revenue. Developed brand strategy\nand statistics systems.\n\nEducation\n\nMicrosoft\nMicrosoft Certified IT Profess

In [42]:
def clean_string(text):
    r"""
    Remove the page footers that look like this:
        Page 2 of 3\n\n\xa0\n\xa0\n\xa0\n\x0c

    A footer does not contain any valuable information, and sometimes it
    causes a wrong answer in the output. Everything else is kept, including
    the text after the last footer; text without any footer is returned
    unchanged.

    Parameters:
    text (str): Text of a resume

    Returns:
    str: The text with every footer removed
    """
    # \d+ (instead of a single \d) also covers documents with ten or more pages.
    expression = r"Page \d+ of \d+\n\n\xa0\n\xa0\n\xa0\n\x0c"

    return re.sub(expression, "", text)

In [43]:
# Strip the page footers. Every offset computed below (get_companies, get_info)
# indexes into this cleaned `text`, so it must not be rebound afterwards.
text = clean_string(text)

**Here I find the start and end positions of the date keywords**

In [44]:
def get_companies(text, expression):
    """
    Function finds the start and the end of every match of a pattern.

    Parameters:
    text (str): Text to search
    expression (str): Regular expression to look for

    Returns:
    list[list[int]]: One [start, end] pair per non-overlapping match, in
        document order, so that text[start:end] is the matched substring.
        Empty list when nothing matches.
    """
    # finditer reports offsets relative to the full text, so there is no need
    # to re-search a shrinking copy of the text and add the offsets up by hand
    # (which is quadratic and hides the preceding characters from the pattern).
    return [[match.start(), match.end()] for match in re.finditer(expression, text)]

In [45]:
# [start, end] offsets of every date line that ends in "- Present": a blank
# line, then a line whose date ends in 1-4 digits, then "\xa0-\xa0Present\xa0".
# NOTE(review): the pattern requires non-breaking spaces (\xa0) around the
# dash, while the OCR text shown above uses plain spaces -- presumably it
# targets the text extracted from the PDFs; confirm.
present_keywords = get_companies(text, r"\n\n.+\d{1,4}\xa0-\xa0Present\xa0\n")

## Get company and title

In [46]:
def get_info(start):
    """
    This function returns company and title.

    It reads the module-level `text`. Entries are separated by blank lines
    (two consecutive newlines): the entry that ends at `start` is taken as
    the title, and the entry right before it as the company.

    Parameters:
    start (int): Start of a word (offset into `text`, normally the start of
        a date match)

    Returns:
    list[str, str]: [Name of the company, Title in that company], both
        stripped of surrounding whitespace. None when fewer than two
        blank-line separators precede `start`.
    """
    separator = "\n\n"

    # A negative end offset would make rfind count from the end of the text.
    if start < 0:
        return None

    # Last separator that lies completely before `start`: the title begins there.
    title_start = text.rfind(separator, 0, start)
    if title_start == -1:
        return None

    # The separator before that one marks the beginning of the company.
    company_start = text.rfind(separator, 0, title_start)
    if company_start == -1:
        return None

    company = text[company_start:title_start].strip()
    title = text[title_start:start].strip()
    return [company, title]
                            

In [47]:
# One [company, title] pair for every "Present" date, in document order.
companies = [get_info(key[0]) for key in present_keywords]

In [48]:
# Text of every "Present" date match, without the surrounding whitespace.
present_dates = [text[begin:finish].strip() for begin, finish in present_keywords]

# NOTE: per the original author, the list above is only needed for the
# dataframes and can be deleted together with them.

df = pd.DataFrame(
    {
        "Company": [pair[0] for pair in companies],
        "Title": [pair[1] for pair in companies],
        "Date": present_dates,
    }
)

In [49]:
df

Unnamed: 0,Company,Title,Date


## Get past companies

In [67]:
# [start, end] offsets of every finished position's date line: a blank line,
# a start date ending in 1-4 digits, "\xa0-\xa0", an end date ending in 1-4
# digits, "\xa0" and the rest of the line (e.g. the bracketed duration).
past_companies = get_companies(text, r'\n\n.+\d{1,4}\xa0-\xa0.+\d{1,4}\xa0.+\n')

In [68]:
# Append the [company, title] pair found above every past date line.
companies.extend(get_info(span[0]) for span in past_companies)

In [69]:
# Text of every past date match; appended in place so that `present_dates`
# lines up with `companies` (present positions first, then past ones).
past_dates = [text[begin:finish].strip() for begin, finish in past_companies]
present_dates.extend(past_dates)

# NOTE: per the original author, the lists above are only needed for the
# dataframe below and can be deleted together with it.

df = pd.DataFrame(
    {
        "Company": [pair[0] for pair in companies],
        "Title": [pair[1] for pair in companies],
        "Date": present_dates,
    }
)

In [70]:
df

Unnamed: 0,Company,Title,Date


# Solving specific cases

In [54]:
# Split the [company, title] pairs into two parallel lists.
# Order matters: `titles` has to be read before `companies` is rebound to the
# flat list of names.
# NOTE(review): this cell is not idempotent -- running it a second time would
# index into the company name strings themselves.
titles = [row[1] for row in companies]
companies = [row[0] for row in companies]

In [55]:
def find_keywords(text):
    r"""
    Collect the standalone duration lines of a resume, i.e. parts like:
        \n\n2 years 11 months\n\n

    A candidate is a line that starts with a digit and has a blank line
    before and after it. The regex alone is too permissive, so the spaCy
    lemmas decide: reading the tokens in order, the first lemma among
    "less", "-", "year" and "month" settles the candidate. "less" or "-"
    discards it (e.g. "less than a year", "2016-2020 (4 years)"), "year" or
    "month" keeps it. A candidate with none of these lemmas is discarded.

    Parameters:
    text (string): Text of a pdf

    Returns:
    list: Kept candidates in document order, each still including its
        surrounding newlines.
    """
    rejected_lemmas = ("less", "-")
    accepted_lemmas = ("year", "month")
    decisive_lemmas = rejected_lemmas + accepted_lemmas

    kept = []
    for candidate in re.findall(r'\n\n\d.+\n\n', text):
        # Lemma of the first token that settles the candidate, if any.
        verdict = next(
            (token.lemma_ for token in nlp(candidate) if token.lemma_ in decisive_lemmas),
            None,
        )
        if verdict in accepted_lemmas:
            kept.append(candidate)

    return kept

In [56]:
matched_words = find_keywords(text)

In [57]:
matched_words

[]

In [58]:
def get_dates_below(matched_words):
    """
    As you can see in the text, after those keywords there are some dates. If you add first N of those
    dates below you should get value which is equal to the keyword. Example:

    Keyword: 2 years 11 months
    Dates below:
    - August 2013 - April 2014 (9 months)
    - June 2011 - August 2013 (2 years 3 months)
    - March 2010 - June 2011 (1 year 4 months)
    - June 2008 - May 2009 (1 year)
    ...

    If you add first two dates, you can realize that the person in that period worked in the same company.

    Reads the module-level `text` and `past_companies`. Each keyword is
    located as literal text, starting after the previously located keyword,
    so a keyword that occurs several times in `matched_words` resolves to
    its successive occurrences and the three returned lists stay aligned
    with `matched_words`. A keyword that does not occur in `text` at all is
    skipped and contributes no entry.

    Parameters:
    matched_words (list): keywords which met the rules

    Returns:
    array (list): for every keyword, the date strings below it
    pom (list): for every keyword, the [start, end] of each date below it
    position_of_keyword (list): [start, end] of every keyword in `text`
    """
    array = []
    pom = []
    position_of_keyword = []
    cursor = 0  # offset in `text` just past the previously located keyword

    for matched_word in matched_words:
        # Plain substring search: the keyword is literal text, not a regex,
        # so characters such as "(" or "." must not be interpreted.
        start = text.find(matched_word, cursor)
        if start == -1:
            # Keyword listed out of document order: retry from the top.
            start = text.find(matched_word)
        if start == -1:
            continue

        stop = start + len(matched_word)
        cursor = stop
        position_of_keyword.append([start, stop])

        # Dates "below" the keyword are the past date matches ending after it.
        below = [[key[0], key[1]] for key in past_companies if stop < key[1]]
        pom.append(below)
        array.append([text[first:last] for first, last in below])

    return array, pom, position_of_keyword

In [59]:
array, pom, position_of_keyword = get_dates_below(matched_words)

In [60]:
array

[]

In [61]:
def get_period(array):
    r"""
    Reduce every date string to its bracketed part. Example:

    Matched word:
    '\n\nAugust 2013\xa0-\xa0April 2014\xa0(9 months)\n'
    Output should be:
    (9 months)

    The nested lists are modified in place. A string without brackets is
    left as it is; when the pattern matches more than once, the last match
    is kept.

    Parameters:
    array (list): list of lists of dates below keyword

    Returns:
    array (list): the same list object, now holding elapsed times between dates
    """
    bracket_pattern = re.compile(r'\(.+\)')

    for dates in array:
        for position, entry in enumerate(dates):
            hits = bracket_pattern.findall(entry)
            if hits:
                dates[position] = hits[-1]

    return array

In [62]:
array = get_period(array)

In [63]:
array

[]

In [64]:
def get_year_and_month(word):
    """
    Get year and month from preprocessed data.

    The text is tokenised with spaCy; the token right before a token whose
    lemma is "year" (or "month") is taken as the number of years (or
    months). When a unit occurs more than once, the last occurrence wins.
    A unit that is the very first token picks up the last token instead,
    because index -1 is used.

    Parameters:
    word (string): elapsed times between dates

    Returns:
    (list): Number of years, Number of months (0 for a missing unit)

    Raises:
    ValueError: when the token before a unit is not an integer literal
    """
    tokens = nlp(word)
    years, months = 0, 0

    for position, token in enumerate(tokens):
        lemma = token.lemma_
        if lemma == "year":
            years = tokens[position - 1]
        if lemma == "month":
            months = tokens[position - 1]

    return [int(str(years)), int(str(months))]

In [65]:
# For every keyword (a total-duration line), find the company name written
# above it and assign that name to the leading positions whose durations add
# up to the keyword.
for p in range(len(matched_words)):
    
    # Total duration stated by the keyword itself.
    target_year = get_year_and_month(matched_words[p])[0]
    target_month = get_year_and_month(matched_words[p])[1]
    
    # Running sum of the durations of the positions below the keyword.
    month = 0
    year = 0
    
    string = ""
    
    #Get name of a company. Name of a company is placed above keyword.
    # Walk backwards from the keyword until a blank line ("\n\n") is seen.
    # NOTE(review): if no blank line precedes the keyword, `cmp` is not
    # assigned in this iteration (unbound, or left over from the previous one).
    for i in reversed(range(position_of_keyword[p][0])):
        string+=text[i]
        if("\n\n" in string[::-1]):
            cmp = string[::-1].strip()
            break
 
    for j in range(len(array[p])):
        
        #Add months and years
        month += get_year_and_month(array[p][j])[1]
        year += get_year_and_month(array[p][j])[0]
            
        #Check if they are equal to the target_year and target_month
        # NOTE(review): the `- 1` presumably compensates for a month that is
        # counted twice where two consecutive positions meet -- confirm for
        # companies with more than two positions.
        if(year + (month-1)//12 == target_year and (month-1)%12== target_month):
            
            #Change name of a company for all wrong answers
            # The wrong entries presumably start at the one whose "company"
            # is the keyword text itself; list.index raises ValueError if
            # there is no such entry.
            start = companies.index(matched_words[p].strip())
            companies[start:start+j+1] = [cmp for i in range(j+1)]
            break

In [66]:
# Final table: corrected company names with their titles and dates.
df = pd.DataFrame(dict(Company=companies, Title=titles, Date=present_dates))

df

Unnamed: 0,Company,Title,Date
