<a href="https://colab.research.google.com/github/Echo9k/3-potential_talents/blob/dev/Potential%20Talent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Standard libraries
import os
import sys
# Data manipulation
import pandas as pd

# Machine Learning and vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Geolocation
from geopy.geocoders import Nominatim

# Visualization
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

# Utilities
from IPython.display import Markdown, display

ModuleNotFoundError: No module named 'geopy'

#### This project is about predicting how well a candidate fits, based on their available information.

### Importing Libraries

In [None]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    root_dir = "/content/drive/MyDrive/wdir/repos/Apziva/3-potential_talents/"
    # Later cells read relative paths such as "data/raw/data.csv", so the
    # working directory has to be moved to the project root; calling
    # os.getcwd() alone would leave the working directory unchanged.
    os.chdir(root_dir)

except ImportError:
    # Not on Colab: walk upwards until the current directory is the one that
    # contains 'potential_talents'.
    while 'potential_talents' not in os.listdir('.'):
        here = os.getcwd()
        if os.path.dirname(here) == here:
            # Filesystem root reached: chdir('..') would no longer move and
            # the loop would never terminate.
            raise FileNotFoundError(
                "Could not locate a parent directory containing 'potential_talents'"
            )
        os.chdir('..')
    # Assigned after the loop so root_dir exists even when no chdir was needed.
    root_dir = os.getcwd()

    # Make the project root importable so custom helper modules can be loaded
    sys.path.append('.')
    
%pwd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/wdir/repos/Apziva/3-potential_talents


'/content/drive/MyDrive/wdir/repos/Apziva/3-potential_talents'

In [5]:
SEED = 42

# Read only the three modelling columns. "connection" is parsed to an integer,
# with the literal '500+' encoded as 501.
df = pd.read_csv(
    "data/raw/data.csv",
    usecols=["job_title", "location", "connection"],
    converters={
        "connection": lambda raw: (
            501 if raw.strip() == '500+'
            else int(raw.strip() if raw.strip().isdigit() else raw)
        )
    },
)

# Shrink the integer column to the smallest integer dtype that fits its values
df["connection"] = pd.to_numeric(df["connection"], downcast="integer")
df.sample(5, random_state=SEED)

Unnamed: 0,job_title,location,connection
30,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
65,Experienced Retail Manager and aspiring Human ...,"Austin, Texas Area",57
64,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",501
53,Student at Chapman University,"Lake Forest, California",2
45,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44


In [6]:
# Column dtypes, non-null counts and memory usage of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   job_title   104 non-null    object
 1   location    104 non-null    object
 2   connection  104 non-null    int16 
dtypes: int16(1), object(2)
memory usage: 2.0+ KB


In [7]:
print(f"The data shape is {df.shape}")

# Summary statistics, split by column kind
numeric_summary = df.describe(include="number").round(2)
display(Markdown("#### Numeric"), numeric_summary)

object_summary = df.describe(include="object")
display(Markdown("#### Object"), object_summary)

The data shape is (104, 3)


#### Numeric

Unnamed: 0,connection
count,104.0
mean,263.09
std,223.06
min,1.0
25%,47.0
50%,193.0
75%,501.0
max,501.0


#### Object

Unnamed: 0,job_title,location
count,104,104
unique,52,41
top,2019 C.T. Bauer College of Business Graduate (...,Kanada
freq,7,12


Potentially we have skewness in the number of connections: the 75th percentile already equals the maximum (501).

In [None]:
# Number of rows currently in the frame
len(df)

In [8]:
# plot the connection distributions
fig = px.histogram(df, x="connection", nbins=20,
                   title="Distribution of Connections")
fig.show()

Some applicants might have applied twice. Let's see how many and drop them.

In [None]:
# Row count before duplicates are dropped
len(df)

In [9]:
# Preview a few of the rows that occur more than once (all copies kept).
# Seeded so the preview is reproducible, and capped so it never asks for more
# rows than there are duplicates.
duplicated_rows = df[df.duplicated(keep=False)]
display(duplicated_rows.sample(min(3, len(duplicated_rows)), random_state=SEED))

## Dropping the duplicated rows
# The index is reset once here; resetting it a second time would be a no-op.
df = df.drop_duplicates().reset_index(drop=True)
print("Shape fo non-duplicated dataframe:", df.shape)
print("New shape: ",df.shape)

Unnamed: 0,job_title,location,connection
50,HR Senior Specialist,San Francisco Bay Area,501
8,Student at Humber College and Aspiring Human R...,Kanada,61
63,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",501


Shape fo non-duplicated dataframe: (53, 3)
New shape:  (53, 3)


In [10]:
# Most frequent job titles after de-duplication
df["job_title"].value_counts().head()

Unnamed: 0_level_0,count
job_title,Unnamed: 1_level_1
Aspiring Human Resources Professional,2
2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional,1
Lead Official at Western Illinois University,1
Senior Human Resources Business Partner at Heil Environmental,1
Aspiring Human Resources Professional | An energetic and Team-Focused Leader,1


In [11]:
# The import needs an explicit target name; without one the cell is a SyntaxError.
# NOTE(review): the helper is assumed to live in term_deposit.data — confirm
# the module path for this project.
from term_deposit.data import calculate_word_frequencies

# Apply the function
word_counts = calculate_word_frequencies(df["job_title"])

# Print results: size of the counter and its ten most frequent entries
print("Number of total words:", len(word_counts))
print(word_counts.most_common()[:10])

Number of total words: 221
[('Human', 34), ('Resources', 29), ('at', 22), ('and', 13), ('Aspiring', 11), ('|', 10), ('Professional', 7), ('in', 6), ('University', 6), ('Seeking', 6)]


In [12]:
# Most frequent locations after de-duplication
df["location"].value_counts().head()

Unnamed: 0_level_0,count
location,Unnamed: 1_level_1
"Houston, Texas Area",4
"Raleigh-Durham, North Carolina Area",3
Greater New York City Area,3
"Austin, Texas Area",2
Amerika Birleşik Devletleri,2


## 1. Data Preprocessing & Feature Engineering

### Cleaning the data

    * Removing special characters
    * Standardizing text
    * Removing stopwords and applying lemmatization

Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

In [None]:
# Row count going into text cleaning
len(df)

In [None]:
import string

# Translation table that maps every ASCII punctuation character to None,
# i.e. deletes it
translator = str.maketrans(dict.fromkeys(string.punctuation))

# Strip punctuation from the job titles
df["job_title"] = df["job_title"].str.translate(translator)

In [None]:
### 1. Preprocess the Location Field to Standardize Formats

# Initialize the geolocator
# NOTE(review): "geoapiExercises" looks like a tutorial placeholder; Nominatim
# expects an application-specific user agent — confirm and replace.
geolocator = Nominatim(user_agent="geoapiExercises")

# Apply the standardization function to every location value
# NOTE(review): standardize_location is not defined in the cells visible here —
# confirm it is defined or imported before this cell runs.
df['standardized_location'] = df['location'].apply(standardize_location)

In [15]:
# Lower-case both free-text columns
for text_col in ("job_title", "location"):
    df[text_col] = df[text_col].str.lower()

In [None]:
# Row count after lower-casing
len(df)

In [17]:
text_columns = ["location", "job_title"]
divider = "-" * 90

# First title before the cleaning step
print("Job title before removing stopwords:\n", df["job_title"].head(1))
print(divider)

# Replace both text columns with their cleaned versions
df[text_columns] = cleaning(df, list(text_columns))

# Same title after the cleaning step
print("Job title after removing stopwords:\n", df["job_title"].head(1))
print(divider)
df.sample(3, random_state=SEED)

Job title before removing stopwords:
 0    2019 ct bauer college of business graduate mag...
Name: job_title, dtype: object
------------------------------------------------------------------------------------------
Job title after removing stopwords:
 0    2019 ct bauer college business graduate magna ...
Name: job_title, dtype: object
------------------------------------------------------------------------------------------


Unnamed: 0,job_title,location,connection
19,human resource generalist scottmadden inc,"raleigh-durham, north carolina area",501
41,admission representative community medical cen...,"long beach, california",9
47,seeking human resource position,"la vega, nevada area",48


Cleaning and standardizing the job_title column by replacing abbreviations and specific terms with their full forms. This makes the data more consistent and easier to analyze.

In [18]:
# lookup table
# NOTE(review): dir_replacements_job_title is not defined in the cells visible
# here — presumably a mapping of pattern -> replacement; confirm it is created
# before this cell runs.

## Replace the abbreviations in job_title with their descriptions.
## regex=True makes the lookup keys be treated as regular expressions.
df = df.replace({'job_title' : dir_replacements_job_title}, regex=True)

In [19]:
# Row count after abbreviation replacement
len(df)

# Step 1: Define the Modeling Framework

## 1.1 Data Preprocessing and Feature Generation:

#### Word Embedding Techniques



I will be using 4 different strategies to find the similarities between the target phrases and each job title, as follows:

 1. **TF-IDF** (term frequency-inverse document frequency) is a statistical measure that evaluates how relevant a word is to a document in a collection of documents.

    
  2. **GloVe** stands for global vectors for word representation. It is used for generating word embeddings by aggregating global word-word co-occurrence matrix from a corpus. The resulting embeddings show interesting linear substructures of the word in vector space.
    
   
   3. **Word2Vec** is a technique which produces word embeddings for better word representation. We can also say it consists of models for generating word embedding which are shallow two layer neural networks having one input layer, one hidden layer and one output layer.
    
   
   4. **BERT** is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context.

##### 1. Fuzzy word match

In [21]:


# Score every row against a single search phrase.
# NOTE(review): calculate_keyword_match is not defined in the cells visible
# here — confirm it is defined or imported before this cell runs.
df['partial_match'] = calculate_keyword_match(df, "aspiring human resources")

In [22]:
# Overwrite 'partial_match' so that it is computed from the whole list of
# search phrases rather than a single phrase.
# NOTE(review): search_phrases and calculate_keyword_match are not defined in
# the cells visible here — confirm both exist before this cell runs.
df['partial_match'] = calculate_keyword_match(df, search_phrases)

##### 2. TF-IDF

In [23]:
## Job titles as a plain Python list
job_title_list = df["job_title"].tolist()

# Learn the vocabulary and IDF weights, then vectorize the titles
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(job_title_list)

# Vocabulary terms, one per TF-IDF column
# (get_feature_names_out() replaces the older get_feature_names())
feature_names = vectorizer.get_feature_names_out()
print("Number of unique features: ", len(feature_names))
print("First 5 features: ", feature_names[:5])

# Dense array version of the sparse TF-IDF matrix
tfidf_vector = X.toarray()
print("Shape of Tfidf Vector: ", tfidf_vector.shape)

Number of unique features:  167
First 5 features:  ['administration' 'administrative' 'admission' 'advisory' 'always']
Shape of Tfidf Vector:  (53, 167)


In [24]:
def similarity(phrase, vectorizer, tfidf_matrix, df):
    """
    Score each job title by its best cosine similarity to the search phrase(s).

    Parameters:
        phrase (list[str] | str): Search phrase(s). A bare string is treated
            as a single phrase.
        vectorizer (TfidfVectorizer): Fitted TF-IDF vectorizer used to embed
            both the phrases and the job titles.
        tfidf_matrix (csr_matrix): Not used by the computation; kept in the
            signature so existing calls keep working. The job titles are
            vectorized from ``df`` instead.
        df (DataFrame): Frame with a "job_title" column. It is not modified.

    Returns:
        list: One score per row of ``df`` (in row order): the maximum cosine
        similarity between that row's job title and any of the phrases.
    """
    # Nothing to score
    if len(df) == 0:
        return []

    # Accept a single phrase passed as a plain string
    if isinstance(phrase, str):
        phrase = [phrase]

    # The phrase vectors do not depend on the row, so build them once;
    # the job titles are vectorized in a single batch as well.
    phrase_vectors = vectorizer.transform(phrase)
    job_title_vectors = vectorizer.transform(df["job_title"])

    # Rows are job titles, columns are phrases; keep the best phrase per title
    pairwise_scores = cosine_similarity(job_title_vectors, phrase_vectors)
    return list(pairwise_scores.max(axis=1))

In [25]:
# Fit a fresh TF-IDF vectorizer on the job titles and vectorize them
vectorizer = TfidfVectorizer().fit(df["job_title"])
tfidf_matrix = vectorizer.transform(df["job_title"])

# Best match against any of the search phrases, one score per row
similarity_scores = similarity(search_phrases, vectorizer, tfidf_matrix, df)
df["similarity_score"] = similarity_scores

# Inspect a reproducible sample of the scored frame
df.sample(3, random_state=SEED)

Unnamed: 0,job_title,location,connection,partial_match,similarity_score
19,human resource generalist scottmadden inc,"raleigh-durham, north carolina area",501,76,0.446244
41,admission representative community medical cen...,"long beach, california",9,56,0.0
47,seeking human resource position,"la vega, nevada area",48,79,0.096089


In [26]:
# Searched phrases, one per row
phrases = pd.DataFrame({"phrase":search_phrases})
# Displays the result of cleaning the phrases; the return value is not assigned.
# NOTE(review): whether cleaning() also modifies `phrases` in place cannot be
# seen from this cell — confirm before relying on `phrases` being cleaned.
cleaning(phrases, "phrase")

Unnamed: 0,phrase
0,aspiring human resource
1,human resource assistant
2,hr coordinator
3,hr generalist (entry-level)
4,talent acquisition assistant
5,recruitment coordinator
6,hr intern
7,hr trainee
8,junior hr specialist
9,hr associate


In [27]:
# Clean the search phrases and store the result, so that the phrase scored
# below is the cleaned text rather than the raw one. Calling cleaning() without
# assigning its return value leaves that to chance.
# NOTE(review): this mirrors the list-of-columns call used for df earlier in
# the notebook; confirm cleaning() returns the cleaned columns for this input.
phrases[["phrase"]] = cleaning(phrases, ["phrase"])

# Score every job title against the first search phrase only
df["TF-IDF_fit_score"] = similarity([phrases["phrase"].iloc[0]],
                                    vectorizer, tfidf_matrix, df)
df.sort_values(by="TF-IDF_fit_score", ascending=False).head(3)

Unnamed: 0,job_title,location,connection,partial_match,similarity_score,TF-IDF_fit_score
7,human resources senior specialist,san francisco bay area,501,81,0.500978,0.500978
2,aspiring human resource professional,"raleigh-durham, north carolina area",44,96,0.430994,0.430994
45,aspiring human resource professional,"kokomo, indiana area",71,96,0.430994,0.430994


##### 3. GloVe

In [None]:
# Rank once, then show both ends of the ranking.
# NOTE(review): "BERT_model_fit_score" is not created in the cells visible
# here — confirm it is computed before this cell runs.
ranked_by_bert = df.sort_values(by="BERT_model_fit_score", ascending=False)

display(Markdown("### Head"), Markdown("---"), ranked_by_bert.head(3))
display(Markdown("### Tail"), Markdown("---"), ranked_by_bert.tail(3))

In [82]:
from pathlib import Path

# Output folder for the scored data; created (with parents) if missing.
# NOTE(review): "internim" may be a typo for "interim" — confirm before
# renaming, since other code may read from this path.
data_path = Path("data/internim/")
data_path.mkdir(parents=True, exist_ok=True)

# Persist the frame, keeping its index
encoded_file = data_path / "encoded.parquet"
df.to_parquet(encoded_file, index=True)