This notebook contains a small analysis of the dataset we are working with.

In [1]:
import pandas as pd

#### Reading the dataset

In [2]:
# Load the tab-separated dataset into the working DataFrame.
df = pd.read_csv("data.tsv", delimiter="\t")

#### Details about the Dataset

In [3]:
# Load the schema file, indexed by its "column name" column.
schema_df = pd.read_csv("schema.csv", index_col="column name")
# The trailing semicolon keeps the notebook from rendering the frame.
schema_df;

In [4]:
# Tally how often each value occurs in the `Selected` column (display suppressed).
df["Selected"].value_counts();

#### Cleaning the Dataset for preprocessing

- The column named `Unnamed: 22` is removed
- Long column names are replaced with short ones
- Values in the `Selected` column are converted from `Yes`/`No` to `True`/`False`
- The `Age` is rounded to the nearest whole number

In [5]:
# Remove the stray 'Unnamed: 22' column. errors='ignore' keeps this cell
# re-runnable: a bare `del` raises KeyError once the column is already gone.
df = df.drop(columns=['Unnamed: 22'], errors='ignore')
# Replace the two long question headers with short column names.
df = df.rename(columns={
    'What do you think is your life’s purpose and why do you think having a purpose is important? (100 words or less)': 'Purpose',
    'Describe 5 things that attracted you to Unilever.': 'About company',
})
# Convert Yes/No to booleans. True/False map to themselves so that re-running
# the cell does not turn already-converted values into NaN; any other value
# still becomes NaN, exactly as before.
df['Selected'] = df['Selected'].map({'Yes': True, 'No': False, True: True, False: False})
# Round the age to the nearest integer (built-in round: ties go to the even integer).
df['Age'] = df['Age'].apply(round)

#### Importing modules for NLP

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk import ne_chunk
from nltk import pos_tag
from re import sub

# English stop words from NLTK, held as a set.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be installed - confirm.
STOPWORDS = {*stopwords.words("english")}

#### Algorithms for cleaning up sentences

In [7]:
from src.utils.clean_sentences import cleanup_sentence, cleanup_bulletpoints, cleanup_brackets

#### Scoring Algorithm for Clubs

In [10]:
from src.utils.extract_keypoints_clubs import clubs_main

In [11]:
# Replace missing entries with the literal string "None" before scoring.
df["Clubs & Associations"] = df["Clubs & Associations"].fillna(value="None")

In [12]:
# Apply clubs_main to every entry and store the result back in the same column.
df["Clubs & Associations"] = df["Clubs & Associations"].apply(func=clubs_main)

#### Scoring algorithm for professional qualifications

In [13]:
from src.utils.extract_keypoints_qualification import prof_qualification_main

In [14]:
# Replace missing entries with the literal string "None" before scoring.
df["Professional Qualification"] = df["Professional Qualification"].fillna(value="None")

In [15]:
# Apply prof_qualification_main to every entry and store the result back in the same column.
df["Professional Qualification"] = df["Professional Qualification"].apply(func=prof_qualification_main)

#### Calculating the score for sports

In [16]:
from src.utils.extract_keypoints_sport import sports_main

In [17]:
# Replace missing entries with the literal string "None" before scoring.
df["Sports"] = df["Sports"].fillna(value="None")

In [18]:
# Apply sports_main to every entry and store the result back in the same column.
df["Sports"] = df["Sports"].apply(func=sports_main)

In [19]:
# Boolean mask of the rows whose `Selected` value equals True.
filt = df["Selected"] == True;

In [20]:
# Name and Sports columns for the rows picked by `filt` (display suppressed).
df.loc[filt, ["Name", "Sports"]];

#### Calculating the score for leadership

In [42]:
# Leadership role keywords, followed by the same keywords split into three groups.
keyword = [
    "President",
    "Leader",
    "Team Leader",
    "Treasurer",
    "Secretory",
    "Captain",
    "prefect",
    "active member",
]
group_1 = {"Captain", "Leader", "President"}
group_2 = {"Treasurer", "Team Leader"}
group_3 = {"active member", "prefect", "Secretory"}

In [24]:
def advanced_search_qualification(sentence):
    """Extract the named entities NLTK recognises in a sentence.

    The sentence is tokenised, part-of-speech tagged and passed through
    ``ne_chunk``; every labelled chunk is reported once.

    No score is calculated here: the function only returns the entities.

    Parameters
    ----------
    sentence : str
        Text to analyse.

    Returns
    -------
    list of (str, str)
        Unique ``(entity text, entity label)`` pairs, sorted so that the
        result is the same on every run.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    entities = {
        (" ".join(leaf[0] for leaf in chunk), chunk.label())
        for chunk in ne_chunk(tagged_tokens)
        # plain (word, tag) leaves have no label; only entity subtrees do
        if hasattr(chunk, "label")
    }
    # Set iteration order for strings is not stable between interpreter runs,
    # so sort to give a reproducible result.
    return sorted(entities)

In [55]:
def calculate_score_leadership(sentence):
    """Score a leadership description by the role keywords it contains.

    Each distinct keyword counts once, however often it appears:

    - 1.5 for "President", "Captain", "Leader"
    - 1.0 for "Team Leader", "Treasurer"
    - 0.5 for "Secretory", "prefect", "active member"

    Matching is case-sensitive and works on ``word_tokenize`` tokens.
    Multi-word keywords are matched as consecutive tokens, and the longest
    keyword wins at each position, so "Team Leader" scores 1.0 instead of
    being counted as a plain "Leader".

    Parameters
    ----------
    sentence : str
        Text to score. Anything that is not a string (for example NaN for
        a missing answer) scores 0.0.

    Returns
    -------
    float
        Sum of the weights of the distinct keywords found.
    """
    # keyword (as a tuple of tokens) -> weight
    # NOTE(review): "Secretory" looks like a misspelling of "Secretary"; it is
    # kept as-is because the spelling used in the data is not visible here.
    weights = {
        ("President",): 1.5,
        ("Captain",): 1.5,
        ("Leader",): 1.5,
        ("Team", "Leader"): 1.0,
        ("Treasurer",): 1.0,
        ("Secretory",): 0.5,
        ("prefect",): 0.5,
        ("active", "member"): 0.5,
    }
    if not isinstance(sentence, str):
        return 0.0

    tokens = word_tokenize(sentence)
    longest = max(len(phrase) for phrase in weights)
    found = set()
    position = 0
    while position < len(tokens):
        # Try the longest keyword first so "Team Leader" is not read as "Leader".
        for size in range(longest, 0, -1):
            candidate = tuple(tokens[position:position + size])
            if candidate in weights:
                found.add(candidate)
                position += len(candidate)
                break
        else:
            position += 1
    return float(sum(weights[phrase] for phrase in found))

In [56]:
# Worked example: score the Leadership entry stored under index label 4.
sentence = df.loc[4, "Leadership"]
print(sentence)
# Clean-up order: cleanup_brackets, then cleanup_bulletpoints, then cleanup_sentence.
sentence = cleanup_brackets(sentence)
sentence = cleanup_bulletpoints(sentence)
filter_sentence = cleanup_sentence(sentence)
calculate_score_leadership(filter_sentence)

Vice President (External Relations) Blue Marble project - AIESEC  Vice President, Committee member - Human Resource Guild Junior prefect Vice President - English Literary Association


2.0