    This notebook contains a small analysis of the dataset we are working with.

In [1]:
import pandas as pd

#### Reading the dataset

In [2]:
df = pd.read_csv("data.tsv",sep="\t")

#### Details about the Dataset

In [3]:
schema_df = pd.read_csv('schema.csv', index_col='column name')
schema_df

Unnamed: 0_level_0,description
column name,Unnamed: 1_level_1
Name,Name of the person
Selected,Whether the person is selected to work in the ...
Date of Birth,Date of birth
Age,How old the person is
Gender,Male or Female
LinkedIn Profile,link to a linkedin profile
University,Name of the university
Other,Name of School (if included)
Degree Course,Name of the Course
Area Of Study,Specilization of the degree


In [4]:
df['Selected'].value_counts()

No     860
Yes     34
Name: Selected, dtype: int64

#### Cleaning the Dataset for preprocessing

- The column named `Unnamed: 22` is removed
- Longer Columns Names are reduced to short names
- Changing inputs from the `Selected` column to `True` or `False` 
- Rounding up the Age

In [5]:
# removing the a column for the dataframe
del df['Unnamed: 22']
# changing the names of the columns
df.rename(columns={'What do you think is your life’s purpose and why do you think having a purpose is important? (100 words or less)':'Purpose','Describe 5 things that attracted you to Unilever.':'About company'}, inplace=True)
# changing all the Yes and No to True or False
df['Selected'] = df['Selected'].map({'Yes':True,'No':False})
# rounding off the age
df['Age'] = df['Age'].apply(round)

In [16]:
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer
from nltk.corpus import stopwords
from nltk import ne_chunk
from nltk import pos_tag
from re import sub


STOPWORDS = set(stopwords.words("english"))

#### Algorithms for cleaning up sentences

In [17]:
def cleanup_bulletpoints(sentence):
    """Removes non alphanumeric characters
    from the sentence
    """
    return sub('[^A-Za-z0-9]+', ' ', sentence)

In [18]:
def cleanup_sentence(sentence):
    """Removes stopwords and unnecessary tags
    from the sentence
    """
    global STOPWORDS
    words = word_tokenize(sentence)
    filter_sentence = [w for w in words if w not in STOPWORDS]
    return " ".join(w for w in filter_sentence)

#### Scoring Algorithm

In [19]:
def advanced_search(sentence):
    """search if any keyword is mentioned in the 
    sentence (advanced)
    TODO: edit this part
    """
    words = word_tokenize(sentence)
    tagged = pos_tag(words)
    namedEnt = ne_chunk(tagged)
    return {(" ".join(c[0] for c in chunk), chunk.label())for chunk in namedEnt if hasattr(chunk,'label')}

In [20]:
sentence = df.loc[1,'Clubs & Associations']
filter_sentence = cleanup_sentence(cleanup_bulletpoints(sentence))
print(advanced_search(filter_sentence))

{('Team Captain School Drama Circle', 'PERSON'), ('PERADENIYA', 'ORGANIZATION'), ('Scrabble Club', 'PERSON'), ('Director Finance English Debating Club', 'PERSON'), ('Psychology Student Union Student Co', 'PERSON'), ('SCHOOL Mooting Club', 'ORGANIZATION'), ('UNIVERSITY OF', 'ORGANIZATION'), ('Senior Player Senior School Choir', 'PERSON'), ('DRAMSOC University Peradeniya Ceylon Drama Circle Faculty', 'ORGANIZATION'), ('SCHOOL', 'ORGANIZATION'), ('Interact Club', 'ORGANIZATION')}
