In [19]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import textacy
from textacy import extract

In [20]:
import getpass
import os

import cohere

# Do not hard-code credentials in the notebook: take the key from the
# COHERE_API_KEY environment variable and, when that is unset or empty,
# prompt for it (getpass does not echo the input or store it in the notebook).
co = cohere.Client(
    os.environ.get("COHERE_API_KEY") or getpass.getpass("Cohere API key: ")
)

In [41]:
#!pip install textacy spacy
#!python -m spacy download en_core_web_md



In [22]:
def read_json(filename):
    """
    Load and parse a JSON file.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON content (for example a dict or a list).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # text encoding, which is not UTF-8 everywhere.
    with open(filename, encoding="utf-8") as f:
        return json.load(f)

In [23]:


class KeytermExtractor:
    """
    Extract keyterms and n-grams from a text with textacy.

    The raw text is parsed into a spaCy document once, in the constructor;
    every extraction method then works on that parsed document.
    """

    def __init__(self, raw_text: str, top_n_values: int = 20,
                 lang: str = "en_core_web_md"):
        """
        Initialize the KeytermExtractor object.

        Args:
            raw_text (str): The raw input text.
            top_n_values (int): The number of top keyterms to extract.
            lang (str): Name of the spaCy language pipeline used to parse
                the text. Defaults to "en_core_web_md".
        """
        self.raw_text = raw_text
        self.lang = lang
        self.text_doc = textacy.make_spacy_doc(self.raw_text, lang=self.lang)
        self.top_n_values = top_n_values

    def _keyterms(self, algorithm):
        """Run one textacy keyterm ranker with the settings shared by all of them."""
        return list(algorithm(self.text_doc, normalize="lemma",
                              topn=self.top_n_values))

    def _ngrams(self, n):
        """Collect the n-grams of size `n`, skipping stop words, numbers and punctuation."""
        return list(extract.basics.ngrams(self.text_doc, n=n, filter_stops=True,
                                          filter_nums=True, filter_punct=True))

    def get_keyterms_based_on_textrank(self):
        """
        Extract keyterms using the TextRank algorithm.

        Returns:
            list: Up to `top_n_values` (term, score) tuples, with terms
            normalized to their lemma.
        """
        return self._keyterms(extract.keyterms.textrank)

    def get_keyterms_based_on_sgrank(self):
        """
        Extract keyterms using the SGRank algorithm.

        Returns:
            list: Up to `top_n_values` (term, score) tuples, with terms
            normalized to their lemma.
        """
        return self._keyterms(extract.keyterms.sgrank)

    def get_keyterms_based_on_scake(self):
        """
        Extract keyterms using the sCAKE algorithm.

        Returns:
            list: Up to `top_n_values` (term, score) tuples, with terms
            normalized to their lemma.
        """
        return self._keyterms(extract.keyterms.scake)

    def get_keyterms_based_on_yake(self):
        """
        Extract keyterms using the YAKE algorithm.

        Returns:
            list: Up to `top_n_values` (term, score) tuples, with terms
            normalized to their lemma.
        """
        return self._keyterms(extract.keyterms.yake)

    def bi_gramchunker(self):
        """
        Chunk the text into bigrams.

        Returns:
            list: The bigrams as yielded by textacy (spaCy spans, not
            (term, score) tuples); use `.text` on an item to get its string.
        """
        return self._ngrams(2)

    def tri_gramchunker(self):
        """
        Chunk the text into trigrams.

        Returns:
            list: The trigrams as yielded by textacy (spaCy spans, not
            (term, score) tuples); use `.text` on an item to get its string.
        """
        return self._ngrams(3)

In [24]:
# Location of the job/talent application pairs JSON (the path looks like a
# Colab Google Drive mount -- adjust when running elsewhere).
DATA_PATH = "/content/drive/MyDrive/GPt_Project/job_talents_application_pairs.json"
selected_file = read_json(DATA_PATH)

In [25]:
# Tabulate the parsed JSON; later cells read its 'profile' and 'job_info' columns.
df = pd.DataFrame(selected_file)

In [26]:
# Peek at the 'profile' column: each row holds one dict with the candidate's fields.
(df['profile'])

0      {'firstName': 'Rumit Singh Saluja', 'lastName'...
1      {'firstName': 'Kashish Dawar', 'lastName': 'Ka...
2      {'firstName': 'Nikhil Saxena', 'lastName': 'Ni...
3      {'firstName': 'Tejas Kapadia', 'lastName': 'Te...
4      {'firstName': 'Cyrus Belgamvala', 'lastName': ...
                             ...                        
495    {'firstName': 'Manoj Mahajan', 'lastName': 'Ma...
496    {'firstName': 'Rahul Sadana', 'lastName': 'Rah...
497    {'firstName': 'Mayank Mathur', 'lastName': 'Ma...
498    {'firstName': 'Umang Chaddha', 'lastName': 'Um...
499    {'firstName': 'Sweta Keshav Dhupkar', 'lastNam...
Name: profile, Length: 500, dtype: object

In [27]:
# Data cleaning: build a single display name from each profile's name fields.
def handle_name_duplication(profile):
    """
    Build a full name from a profile without repeating a duplicated name.

    Some profiles carry the same string in both 'firstName' and 'lastName';
    in that case the name is returned once instead of twice.

    Args:
        profile (dict): Profile mapping with optional 'firstName' and
            'lastName' entries.

    Returns:
        str: "<first> <last>" when both parts are present and different,
        the single available part when the other is missing, None or empty
        (or when both are equal), and '' when neither is present.
    """
    # Treat missing keys and None values like empty strings so they neither
    # raise nor leave a stray leading/trailing space in the result.
    first = profile.get('firstName') or ''
    last = profile.get('lastName') or ''
    if not last or first == last:
        return first
    if not first:
        return last
    return first + ' ' + last

In [28]:
# Add a 'Full_name' column by running handle_name_duplication on every profile dict.
# Note: this modifies df in place.
df['Full_name'] = df['profile'].apply(handle_name_duplication)

In [29]:
# def dict_to_string(d):
#     return ' '.join(d.values())
def dict_to_string(d):
    """
    Flatten the values of a dict into one space-separated string.

    List values contribute their items (each converted with str) joined by
    spaces; every other value contributes str(value). Keys are ignored.

    Args:
        d (dict): The mapping whose values are flattened.

    Returns:
        str: The flattened values, in the dict's iteration order.
    """
    def _as_text(value):
        # Lists are expanded one level; anything else is stringified whole.
        if isinstance(value, list):
            return ' '.join(str(item) for item in value)
        return str(value)

    return ' '.join(_as_text(value) for value in d.values())


In [30]:
# Job keywords: TextRank keyterms for the job_info of rows 0-9.
# NOTE(review): the count 10 is hard-coded -- confirm these rows cover the intended jobs.
job_keyword_list = []
for i in range(10):
    # Flatten the row's job_info dict into plain text for the extractor.
    job_info_str = dict_to_string(df['job_info'][i])
    # Extract key terms using the KeytermExtractor (default: top 20 terms).
    extractor = KeytermExtractor(job_info_str)
    # This name stays bound after the loop; a later cell prints it.
    keyterms_textrank = extractor.get_keyterms_based_on_textrank()
    job_keyword_list.append(keyterms_textrank)


In [46]:
# Candidate keywords: bigrams for every candidate profile in df.
# Each entry of candidate_keyword_list is the bigram list of one profile; the
# bigrams come from textacy's n-gram extractor (spaCy spans), not (term, score)
# tuples like the keyterm rankers return.
candidate_keyword_list = []
# Iterate over however many rows df has instead of a hard-coded 500, so the
# cell neither fails nor silently skips rows when the dataset size changes.
for i in range(len(df)):
    profile_str = dict_to_string(df['profile'][i])
    # Extract key terms using the KeytermExtractor
    extractor = KeytermExtractor(profile_str)
    #keyterms_textrank = extractor.get_keyterms_based_on_textrank()
    #keyterms_scake =  extractor.get_keyterms_based_on_scake()
    keyterms_bi = extractor.bi_gramchunker()
    candidate_keyword_list.append(keyterms_bi)

In [49]:
# Inspect the (term, score) keyterms extracted for the first job.
job_keyword_list[0]



[('long term relationship', 0.03997928208098411),
 ('improved client stickiness', 0.03665533675068877),
 ('HNI client', 0.03614943977340221),
 ('align client', 0.035658054818726775),
 ('long term investment goal', 0.031952312029448454),
 ('relationship value', 0.029911348441322618),
 ('new HNI customer', 0.024808952433353237),
 ('superior service delivery', 0.01987256211654376),
 ('customer base', 0.0194130092467798),
 ('bank level requirement', 0.018101368827193806),
 ('yrs Job responsibility', 0.01645310841667541),
 ('investment expertise', 0.015926229959077585),
 ('conduct risk profiling', 0.015612237634633439),
 ('product approach', 0.014863479051451063),
 ('short term', 0.014738951019315036),
 ('product penetration', 0.01453442485386526),
 ('sale orientation', 0.013411392295409719),
 ('inbound sale', 0.013275110515530129),
 ('Investment requirement', 0.012938650469923197),
 ('regular reporting', 0.012851813680037334)]

In [None]:
# Relies on `keyterms_textrank` still being bound from the job-keyword loop,
# so this prints the keyterms of the last job processed there.
print(keyterms_textrank )

In [32]:
# job_keyword_df = pd.DataFrame(job_keyword_list)

In [33]:
# job_keywords_text_only = [item[0] for item in job_keyword_list[0]]

In [None]:
# Drop the scores: keep only the term text of each (term, score) pair,
# producing one list of strings per job.
job_keyword_list_text = [
    [term_and_score[0] for term_and_score in job_keyterms]
    for job_keyterms in job_keyword_list
]
job_keyword_list_text

In [None]:
# Reduce every candidate keyword to plain text.
# candidate_keyword_list may hold (term, score) tuples (keyterm rankers) or
# spaCy spans (n-gram chunkers). Indexing a span with [0] returns its first
# token rather than the phrase, so spans are converted through `.text`.
candidate_keyword_list_text = [
    [item[0] if isinstance(item, tuple) else item.text for item in sublist]
    for sublist in candidate_keyword_list
]
candidate_keyword_list_text[1]

In [36]:
# One entry per job: a synthetic label (job_1, job_2, ...) paired with that
# job's list of keyword strings.
job_info_data = {
    'job_info': [f'job_{i+1}' for i, _ in enumerate(job_keyword_list_text)],
    'keywords': job_keyword_list_text,
}

# Tabulate the label/keyword pairs.
job_info_df = pd.DataFrame(job_info_data)

In [50]:
# Display the job label / keyword table.
job_info_df

Unnamed: 0,job_info,keywords
0,job_1,"[long term relationship, improved client stick..."
1,job_2,"[strong presentation skill, yrs Business Devel..."
2,job_3,"[key digital marketing tool, performance digit..."
3,job_4,"[alternate consumer promotion concept plan, br..."
4,job_5,"[tax free annual remuneration package, large s..."
5,job_6,"[Manage critical employee relation issue, empl..."
6,job_7,"[annual hotel budget, regional expense budget,..."
7,job_8,"[company financial, financial statement analys..."
8,job_9,"[complex high end analytical project, strong i..."
9,job_10,"[talent management strategy, deliver robust ta..."


In [37]:
def _append_first_candidate(job_terms):
    """Return the job's keyword list followed by the first candidate's keywords."""
    return job_terms + candidate_keyword_list_text[0]


# Despite the name, this covers every job: each row is that job's keywords
# with the keywords of candidate 0 appended.
job1_with_candidate = job_info_df['keywords'].apply(_append_first_candidate)

In [51]:
# Display each job's keyword list with candidate_keyword_list_text[0] appended.
job1_with_candidate

0    [long term relationship, improved client stick...
1    [strong presentation skill, yrs Business Devel...
2    [key digital marketing tool, performance digit...
3    [alternate consumer promotion concept plan, br...
4    [tax free annual remuneration package, large s...
5    [Manage critical employee relation issue, empl...
6    [annual hotel budget, regional expense budget,...
7    [company financial, financial statement analys...
8    [complex high end analytical project, strong i...
9    [talent management strategy, deliver robust ta...
Name: keywords, dtype: object

In [52]:
# Texts to embed: the combined job + candidate keyword list stored under 0.
job1_texts = list(job1_with_candidate[0])
# Request one embedding per text from Cohere and keep only the vectors.
embed_response = co.embed(texts=job1_texts, model='embed-english-v2.0')
job1_embedding = embed_response.embeddings

[[-0.020751953,
  -1.2792969,
  2.6660156,
  -2.6777344,
  1.0419922,
  -0.5961914,
  -1.078125,
  0.21813965,
  0.5410156,
  2.7636719,
  -0.77197266,
  -0.29614258,
  1.0126953,
  0.83203125,
  1.1962891,
  1.4648438,
  -0.8979492,
  0.75927734,
  1.8984375,
  -0.03213501,
  -0.81396484,
  2.3535156,
  0.5253906,
  3.5996094,
  1.1259766,
  3.15625,
  0.46875,
  -0.8100586,
  -0.22912598,
  0.21838379,
  -0.6298828,
  -3.9746094,
  0.63671875,
  0.9711914,
  0.47729492,
  -2.3828125,
  2.4492188,
  -0.81396484,
  1.2666016,
  -0.40722656,
  -2.4707031,
  -0.14099121,
  -1.1025391,
  -2.3925781,
  -0.23046875,
  1.6748047,
  0.47851562,
  -0.671875,
  0.62939453,
  -0.18151855,
  2.1777344,
  0.49536133,
  -1.6826172,
  -0.6816406,
  -0.2536621,
  1.390625,
  -0.70947266,
  1.4453125,
  -1.7763672,
  0.29125977,
  0.016403198,
  2.1855469,
  0.94384766,
  -3.2050781,
  -1.0517578,
  1.8623047,
  3.6308594,
  0.9975586,
  1.2626953,
  -1.5546875,
  0.38891602,
  0.4230957,
  0.79345703

In [39]:
def dense_retrieval(query,
                    results_lang='en',
                    properties=None,
                    num_results=5):
    """
    Run a near-text query against the "Articles" class, filtered by language.

    NOTE(review): this reads a module-level `client` that is not created in
    the visible notebook cells (presumably a Weaviate client) -- it must be
    defined before the function is called.

    Args:
        query (str): Text used as the single near-text concept.
        results_lang (str): Only objects whose 'lang' equals this value are
            returned. Defaults to 'en'.
        properties (list, optional): Properties to fetch for each result.
            When None, defaults to
            ["text", "title", "url", "views", "lang", "_additional {distance}"].
        num_results (int): Maximum number of results. Defaults to 5.

    Returns:
        The value stored under response['data']['Get']['Articles'].
    """
    if properties is None:
        # Build a fresh list per call: a list literal used as the default
        # argument would be one shared object, so a mutation by the client
        # library or a caller would leak into every later call.
        properties = ["text", "title", "url", "views", "lang", "_additional {distance}"]

    nearText = {"concepts": [query]}

    # To filter by language
    where_filter = {
        "path": ["lang"],
        "operator": "Equal",
        "valueString": results_lang,
    }
    response = (
        client.query
        .get("Articles", properties)
        .with_near_text(nearText)
        .with_where(where_filter)
        .with_limit(num_results)
        .do()
    )

    return response['data']['Get']['Articles']

In [None]:
# for e in job1_embedding:
#     print(e[:3])

In [None]:
# type(df['job_info'])
# type(df['job_info'][1])


In [None]:
# extractor = KeytermExtractor(job_info1)
# keyterms_textrank = extractor.get_keyterms_based_on_textrank()
# keyterms_sgrank =  extractor.get_keyterms_based_on_sgrank()
# keyterms_scake = extractor.get_keyterms_based_on_scake()
# keyterms_yake = extractor.get_keyterms_based_on_yake()
# keyterms_bi = extractor.bi_gramchunker()
# keyterms_tri = extractor.tri_gramchunker()

In [None]:
# keyterms_textrank

In [None]:
#keyterms_sgrank

In [None]:
#keyterms_scake

In [None]:
#keyterms_yake

In [None]:
#keyterms_bi


In [None]:
#keyterms_tri