In [55]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import re


# Opening the dataset

In [56]:

# Replace 'your_file_path' with the actual file paths where the Excel files are located
file_paths = ['AI Research Scientist.xlsx','Data Scientist.xlsx',
              'Computer Vision Engineer.xlsx', 'Natural Language Processing Engineer.xlsx',
              'Software Developer.xlsx']

# Load the data from each Excel file and assign labels
profiles_list = []
for path in file_paths:
    df = pd.read_excel(path)
    df['Profile'] = path.split('/')[-1].split('.')[0]  # Extract the profile name from the file name
    profiles_list.append(df)

# Combine all profiles into a single DataFrame
all_profiles_df = pd.concat(profiles_list, ignore_index=True)


columns_to_remove = ['name', 'linkedin_link', 'currentJobTitle', 'education_from_date', 'education_to_date','education_description','education_institution_name','experience_section_from_date', 'experience_section_to_date','experience_section_duration', 'experience_section_company','experience_section_location','certificate_issue_authority','certificate_issue_date', 'certificate_skill']  # Replace with your actual column names
all_profiles_df = all_profiles_df.drop(columns_to_remove, axis=1)


all_profiles_df

Unnamed: 0,about,education_degree,experience_section_position_title,experience_section_description,certficate_name,Profile
0,"Hi guys, \nHere is an AI Research Scientist, w...","-:-M.Tech, Instrumentation and signal processi...",,,-:-Data Analytics with Python-:-Natural Langua...,AI Research Scientist
1,"Hi guys, \nHere is an AI Research Scientist, w...","-:-M.Tech, Instrumentation and signal processi...",,,-:-Data Analytics with Python-:-Natural Langua...,AI Research Scientist
2,AI Applied Research Scientist and AI Product M...,,,,,AI Research Scientist
3,"As a research scientist, I specialize in the f...",-:-Anant has verified their government ID.,,,,AI Research Scientist
4,Machine Learning Engineer with a demonstrated ...,"-:-Master's degree, Mathematics-:-Bachelor of ...",-:-AI Research Scientist-:-Machine Learning En...,-:-Skills: Data Analysis\nData Analysis-:-1) W...,-:-Building Web Applications in Django-:-Djang...,AI Research Scientist
...,...,...,...,...,...,...
500,A career with a progressive organization that ...,"-:-Bachelor of Business Administration - BBA, ...",-:-Associate cybersecurity analyst-:-Business ...,-:-Skills: SIEM · Soc engineer · Mc afee siem ...,,Software Developer
501,"Highly accountable, dedicated and collaborativ...","-:-MBA, Operations Research, Operations Research",,,,Software Developer
502,• Conducting Vulnerability Assessments Using Q...,,-:-Cyber Security Analyst-:-Cyber Security Ana...,-:-nan-:-• Conducting Vulnerability Assessment...,-:-AWS Security Fundamentals-:-Microsoft 365 F...,Software Developer
503,,,,,,Software Developer


# Text Cleaning

In [57]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string


# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

import re
import string

def clean_text(text):
    if pd.isnull(text):
        return ""
    # Convert text to lowercase
    text = str(text).lower()
    # Replace punctuation with a single space
    text = re.sub('['+re.escape(string.punctuation)+']', ' ', text)
    # Replace multiple spaces with a single space
    text = re.sub('\s+', ' ', text).strip()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stopwords and lemmatize the words
    words = nltk.word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Join the cleaned words in a string
    text = ' '.join(words)
    return text

# Apply the cleaning function to the 'About' section
all_profiles_df['about_cleaned'] = all_profiles_df['about'].apply(clean_text)


# Data Wrangling

In [58]:
import pandas as pd

# Function to categorize degree types
def categorize_degree(degree_str):
    if pd.isna(degree_str):
        return {
            'Has_PhD': 0,
            'Has_Masters': 0,
            'Has_Bachelors': 0,
            'Has_Associate': 0,
            'Has_Diploma': 0,
        }
    
    # Normalize the text to make keyword matching more consistent
    degree_str = degree_str.lower()
    
    return {
        'Has_PhD': int('ph.d' in degree_str or 'phd' in degree_str or 'doctor' in degree_str),
        'Has_Masters': int('master' in degree_str or 'mtech' in degree_str or 'm.sc' in degree_str or 'mba' in degree_str),
        'Has_Bachelors': int('bachelor' in degree_str or 'btech' in degree_str or 'be' in degree_str or 'b.sc' in degree_str),
        'Has_Associate': int('associate' in degree_str),
        'Has_Diploma': int('diploma' in degree_str or 'post graduate diploma' in degree_str),
        
    }

# Apply the categorization function to each row in the 'education_degree' column
degree_categories = all_profiles_df['education_degree'].apply(categorize_degree)

# Convert the resulting series of dictionaries to a dataframe
degree_categories_df = pd.DataFrame(degree_categories.tolist())

# Concatenate the new degree categories dataframe with the original data
all_profiles_df = pd.concat([all_profiles_df, degree_categories_df], axis=1)


all_profiles_df = all_profiles_df.drop(['education_degree'],axis = 1)



## Formating the dataset

In [59]:
import pandas as pd
import re
import string

# Define a function to split the position titles and clean them
def extract_positions(positions_str):
    if pd.isna(positions_str):
        return []
    
    # Split the string by '-:-', trim whitespace, and filter out any empty strings
    positions = [position.strip() for position in positions_str.split('-:-') if position.strip()]
    return positions

# Apply the function to the 'experience_section_position_title' column
all_profiles_df['experience_positions'] = all_profiles_df['experience_section_position_title'].apply(extract_positions)

# Now, 'all_profiles_df' contains a new column 'experience_positions' with the list of positions


# Function to concatenate positions into a single standardized string
def standardize_positions(positions_list):
    # Join the list into a single string with spaces
    positions_str = ' '.join(positions_list)
    
    # Replace newline characters with spaces
    positions_str = positions_str.replace('\n', ' ')
    
    # Remove punctuation using a regular expression
    positions_str = re.sub(f'[{re.escape(string.punctuation)}]', '', positions_str)
    
    # Convert to lowercase
    positions_str = positions_str.lower()
    
    return positions_str

# Apply the function to concatenate and standardize the position titles
all_profiles_df['standardized_positions'] = all_profiles_df['experience_positions'].apply(standardize_positions)

columns_to_remove = ['experience_positions', 'experience_section_position_title']  # Replace with your actual column names
all_profiles_df = all_profiles_df.drop(columns_to_remove, axis=1)

## cleaning experience section description


In [60]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Initializing the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function to clean the text: remove punctuation, stop words, and perform lemmatization
def clean_description(description):
    if pd.isna(description):
        return ""
    
    # Tokenize the description
    words = word_tokenize(description)
    
    # Remove punctuation and stop words, and then lemmatize the remaining words
    cleaned_description = [
        lemmatizer.lemmatize(word.lower())
        for word in words
        if word.isalpha() and word.lower() not in stop_words
    ]
    
    # Join the cleaned words back into a single string
    return ' '.join(cleaned_description)

# Apply the cleaning function to the 'experience_section_description' column
all_profiles_df['cleaned_experience_description'] = all_profiles_df['experience_section_description'].apply(clean_description)

# Show the results for the new column
all_profiles_df[['experience_section_description', 'cleaned_experience_description']].head()

#droping extra columns
columns_to_remove = ['experience_section_description'] 
all_profiles_df = all_profiles_df.drop(columns_to_remove, axis=1)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amolharsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amolharsh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/amolharsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## removing stop words and punctuations

In [61]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Initializing the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function to clean the text: remove punctuation, stop words, and perform lemmatization
def clean_certificate_name(name):
    if pd.isna(name):
        return ""
    
    # Tokenize the name
    words = word_tokenize(name)
    
    # Remove punctuation and stop words, and then lemmatize the remaining words
    cleaned_name = [
        lemmatizer.lemmatize(word.lower())
        for word in words
        if word.isalpha() and word.lower() not in stop_words
    ]
    
    # Join the cleaned words back into a single string
    return ' '.join(cleaned_name)

# Apply the cleaning function to the 'certficate_name' column
all_profiles_df['cleaned_certificate_name'] = all_profiles_df['certficate_name'].apply(clean_certificate_name)

# Show the results for the new column
all_profiles_df[['certficate_name', 'cleaned_certificate_name']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amolharsh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/amolharsh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/amolharsh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,certficate_name,cleaned_certificate_name
0,-:-Data Analytics with Python-:-Natural Langua...,analytics language complete developer guide le...
1,-:-Data Analytics with Python-:-Natural Langua...,analytics language complete developer guide le...
2,,
3,,
4,-:-Building Web Applications in Django-:-Djang...,web application feature application technology...


## certificate name extracted

In [62]:
columns_to_remove = ['certficate_name']  # Replace with your actual column names
all_profiles_df = all_profiles_df.drop(columns_to_remove, axis=1)
all_profiles_df.to_excel('final_combined_profil.xlsx', index = False)


# performing tf-idf (text to numerical conversion)

In [63]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming all_profiles_df is your dataframe
# Fill missing text data with placeholder
all_profiles_df['about_cleaned'].fillna('N/A', inplace=True)
all_profiles_df['standardized_positions'].fillna('N/A', inplace=True)
all_profiles_df['cleaned_experience_description'].fillna('N/A', inplace=True)
all_profiles_df['cleaned_certificate_name'].fillna('N/A', inplace=True)

# Function to create a string of qualifications based on binary columns
def create_qualification_string(row):
    qualifications = []
    if row['Has_PhD'] == 1:
        qualifications.append('phd')
    if row['Has_Masters'] == 1:
        qualifications.append('masters')
    if row['Has_Bachelors'] == 1:
        qualifications.append('bachelors')
    if row['Has_Associate'] == 1:
        qualifications.append('associate')
    if row['Has_Diploma'] == 1:
        qualifications.append('diploma')
    return ' '.join(qualifications)

# Apply the function to each row
all_profiles_df['qualification_keywords'] = all_profiles_df.apply(create_qualification_string, axis=1)

# Combine all text columns into a single text column for TF-IDF
all_profiles_df['combined_text'] = all_profiles_df['about_cleaned'] + " " + \
                                   all_profiles_df['standardized_positions'] + " " + \
                                   all_profiles_df['cleaned_experience_description'] + " " + \
                                   all_profiles_df['cleaned_certificate_name'] + " " + \
                                   all_profiles_df['qualification_keywords']

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the combined text column with TF-IDF
tfidf_matrix = tfidf_vectorizer.fit_transform(all_profiles_df['combined_text'])
tfidf_matrix

<505x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 39411 stored elements in Compressed Sparse Row format>

# Dimension reduction
Tried PCA, LDA NMF maximum accuracy with TruncatedSVD algo for dimension reduction¶

In [64]:

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())


from sklearn.decomposition import TruncatedSVD

# Choose the number of components, for example, 100
# dimensionality reduction
n_components = 100
svd = TruncatedSVD(n_components=n_components)

# Fit and transform the TF-IDF matrix
reduced_tfidf_matrix = svd.fit_transform(tfidf_matrix)

# Convert the reduced matrix back to a DataFrame (optional)
reduced_tfidf_df = pd.DataFrame(reduced_tfidf_matrix)

final_features = reduced_tfidf_df


In [65]:
# importing extra data stored locally

In [107]:
import pandas as pd

# Specify the path to your Excel file
file_path = 'new_profiles_tf_id_truncated_data.xlsx'

# Read the Excel file into a DataFrame
new_reinforcement_truncated_tfid_data = pd.read_excel(file_path)
new_final_text_data = pd.read_excel('profile_names_text_data.xlsx')
final_features  = new_reinforcement_truncated_tfid_data
all_profiles_df = new_final_text_data
# Now the Excel file data is loaded into the DataFrame 'df'


In [108]:
new_final_text_data.iloc[-1]

Unnamed: 0                                                     820
combined_text    samyucktha g • samyucktha ganesapandian plaksh...
Profile                                                       Test
Name: 820, dtype: object

## giving numeric value/labels to the target variable

In [86]:

# Encode the target variable 'Profile'
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_profiles_df['Profile'])


# Clustering the dataset for insight generation

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances




X_train, X_test, y_train, y_test = train_test_split(final_features, encoded_labels, test_size=0.1, random_state=42)


X_train = final_features
y_train = encoded_labels
# Step 1 & 2: Compute the centroid of each class in the training data
unique_labels = np.unique(y_train)
centroids = {}
unique_labels





array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17])

## finding the centroid values for each class in the training dataset

In [73]:
for label in unique_labels:

    class_features = X_train[y_train == label]
    centroids[label] = np.mean(class_features, axis=0)
 

## finding cosine similarity for each test data in the dataset

In [109]:
from sklearn.metrics.pairwise import cosine_distances
import numpy as np
import pandas as pd

def rank_classes_for_sample(sample, centroids, label_encoder):
    distances = {}

    # Convert sample to numpy array if it's a Series
    if isinstance(sample, pd.Series):
        sample = sample.to_numpy()

    # Calculate Cosine distance for each centroid
    for label, centroid in centroids.items():
        # Convert centroid to numpy array if it's a Series
        if isinstance(centroid, pd.Series):
            centroid = centroid.to_numpy()

        # Reshape centroid and sample for compatibility
        centroid_reshaped = centroid.reshape(1, -1)
        sample_reshaped = sample.reshape(1, -1)

        # Calculate Cosine distance and store in dictionary
        distance = cosine_distances(sample_reshaped, centroid_reshaped)[0][0]
        
        distances[label] = distance

    # Normalize the distances to convert them into similarity scores
    
    max_distance = max(distances.values()) 
    min_distance = min(distances.values())
#     print("distance: ", distances)
    
    # Normalizing distances to a range of [0, 1], where 1 is most similar
    normalized_similarities = {label: (max_distance  - distance) / (max_distance - min_distance) if max_distance != min_distance else 1.0 for label, distance in distances.items()}

    # Sort the similarities in descending order (higher is more similar)
    sorted_similarities = sorted(normalized_similarities.items(), key=lambda x: x[1], reverse=True)
    
    # Convert numerical labels back to original labels
    labels = [label_encoder.inverse_transform([label])[0] for label, _ in sorted_similarities]

    # Probabilities-like scores (higher is better)
    probabilities = [similarity * 100 for _, similarity in sorted_similarities]

    ranked_classes = [(labels[i], probabilities[i]) for i in range(len(labels))]
    return ranked_classes

# Example usage of the function
# Assuming X_test, centroids, label_encoder, and y_test are already defined
#~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~
# Predict for a single test sample
i = 6 # Index of the test sample
#~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~

#sample = X_test.iloc[i] if isinstance(X_test, pd.DataFrame) else X_test[i]
sample = new_reinforcement_truncated_tfid_data.iloc[-1]


flag = False
for x in sample:
    if x != 0:
        flag = True
    
if (flag):
    ranked_classes = rank_classes_for_sample(sample, centroids, label_encoder)
else:
    print("\nFailed")
    print("\n!!!!!All the values in the Series are 0!!!\n")

# Accessing the true label index from y_test
true_label_index = y_test[i] if isinstance(y_test, np.ndarray) else y_test.iloc[i]

true_label = label_encoder.inverse_transform([true_label_index])[0]

# Output
print(f"True Label: {true_label}")
for label, probability in ranked_classes:
    print(f"Predicted Label: {label}, Probability: {probability:.2f}%")


True Label: Computer Vision Engineer
Predicted Label: Computational Biologist, Probability: 100.00%
Predicted Label: Evolutionary Biologist, Probability: 77.01%
Predicted Label: Genetic Engineer, Probability: 76.82%
Predicted Label: Synthetic Biologist, Probability: 76.77%
Predicted Label: Robotics Machine Learning Engineer, Probability: 51.11%
Predicted Label: AI Research Scientist, Probability: 42.48%
Predicted Label: Data Scientist, Probability: 42.30%
Predicted Label: Data Analyst, Probability: 38.32%
Predicted Label: Product Manager, Probability: 30.59%
Predicted Label: Natural Language Processing Engineer, Probability: 28.13%
Predicted Label: Computer Vision Engineer, Probability: 25.78%
Predicted Label: Quant, Probability: 25.42%
Predicted Label: Electromechanical engineer, Probability: 21.04%
Predicted Label: RoboticsEngineer, Probability: 18.84%
Predicted Label: Software Developer, Probability: 8.51%
Predicted Label: Economist, Probability: 7.51%
Predicted Label: Financial Ana

# New Evaluation metric: testing whether the true label lies in the top three ranking recommendation or not

In [71]:


def predict_top_k_classes(sample, centroids, label_encoder, k):
    ranked_classes = rank_classes_for_sample(sample, centroids, label_encoder)
    # Return the top k classes
    return label_encoder.transform([label[0] for label in ranked_classes[:k]])

def top_k_accuracy_score(y_true, X_test, k, centroids, label_encoder):
    correct = 0
    for i in range(len(X_test)):
        top_k_preds = predict_top_k_classes(X_test.iloc[i] if isinstance(X_test, pd.DataFrame) else X_test[i], centroids, label_encoder, k)
        if y_true[i] in top_k_preds:
            correct += 1
    return correct / len(y_true)

# Calculate Top-k Accuracy, for example k=3
top_k_accuracy = top_k_accuracy_score(y_test, X_test, 3, centroids, label_encoder)
print(f"Top-3 Accuracy: {top_k_accuracy *100} %")


Top-3 Accuracy: 93.90243902439023 %
