In this code, we extract the relevant keywords from all the sections that are present in a LinkedIn profile.
This code was instrumental in understanding the data and in implementing the primary exploratory data analysis for the mid-semester presentation.

In [None]:
# Importing the necessary libraries

from langchain.llms import OpenAIChat
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import ast
import pandas as pd

In [None]:
class TaskExecution():
    """Bundle the prompt templates and the chat model used for extraction.

    Three prompts are defined: role-specific keyword extraction, company-name
    extraction, and bachelors/masters detection. Every public method sends
    one prompt to the model and returns the model's text output unparsed;
    callers are expected to parse it (e.g. with ``ast.literal_eval``).
    """

    def __init__(self):
        # Keyword extraction prompt; expects the variables {role} and {input}.
        self.prompt = """
        You are an expert in summarising and extracting keywords. You will be given a paragraph as input. Your task is to extract all the necessary and relevant keywords from the inputted 
        paragraph, which are similar or match the job role: {role}.

        Return all the extracted keywords in the form of python list.
        The inputted paragraph is:
        {input}"""

        # Company extraction prompt; the model is told to answer ['nan'] when
        # no company is mentioned, which person_company() relies on.
        self.company_prompt = """ 
        
        You will be given a paragraph as an input, which is basically an experience section. Your task is to extract the names of the companies (if present), the person has explicitly stated that he has worked in.
        Return the output in python list format.

        If there is no company specified in the paragraph, then return
        ['nan']

        The input paragraph is:
        {input}
        """

        # Degree detection prompt; the requested answer is [bachelors,masters]
        # with each entry being 0 or 1.
        self.degree_prompt= """  
        You will be given a paragraph as input, which will be the education description of a person. Your task is to identify that whether the person has done bachelors and masters.
        The bachelors and masters will be binary variables. If you are unable to identify bachelors or masters for a person, assume the corresponding variable to be 0.
        If the input is "nan", then both the variables should be 0.
        Return only the python list in the format: [bachelors,masters]
        The inputted paragraph is:
        {input}
        """

        # The API key is a placeholder and must be replaced before running.
        self.llm = OpenAIChat(model_name='gpt-3.5-turbo',openai_api_key="ENTER YOUR OWN API KEY",temperature=0)

    def _run_prompt(self, template, input_variables, values):
        """Fill ``template`` with ``values``, run it on the model, return the output."""
        prompt = PromptTemplate(input_variables=input_variables, template=template)
        chain = LLMChain(llm=self.llm, prompt=prompt)
        return chain.run(values)

    def extract_keywords(self,about,role):
        """Extract keywords from ``about`` that are relevant to ``role``.

        The output is still printed, as before, and is now also returned so
        that callers can use it (previously the method returned ``None``).
        """
        output = self._run_prompt(self.prompt, ['role','input'], {'input':about,'role':role})
        print(output)
        return output

    def extract_company(self,input):
        """Return the model output listing the companies named in ``input``."""
        return self._run_prompt(self.company_prompt, ['input'], {"input":input})

    def extract_degree(self,input):
        """Return the model output for the [bachelors,masters] flags of ``input``."""
        return self._run_prompt(self.degree_prompt, ['input'], {"input":input})


def person_company(experience_company_set, current_job_title_set):
    """Merge the companies found in the experience section and the job title.

    The string ``'nan'`` inside a list marks "no company found" for that
    source.

    Args:
        experience_company_set: list of companies from the experience section.
        current_job_title_set: list of companies from the current job title.

    Returns:
        ``['nan']`` when both sources are missing, the other list (the same
        object, not a copy) when exactly one is missing, otherwise a new list
        holding the union of both.
    """
    experience_missing = 'nan' in experience_company_set
    title_missing = 'nan' in current_job_title_set

    if experience_missing and title_missing:
        return ['nan']
    if experience_missing:
        return current_job_title_set
    if title_missing:
        return experience_company_set

    # Both sources have companies: de-duplicate across them.
    # The unused ``output_string`` joins were removed; they did nothing and
    # raised TypeError whenever an entry was not a string.
    return list(set(current_job_title_set) | set(experience_company_set))
        
def unique(lst):
    """Return the distinct elements of ``lst`` in first-seen order.

    ``dict.fromkeys`` de-duplicates while keeping insertion order, so the
    result is the same on every run. Copying a ``set`` gives an order that
    can change between interpreter runs for strings.

    Args:
        lst: iterable of hashable items.

    Returns:
        A new list with duplicates removed.
    """
    return list(dict.fromkeys(lst))

# Module-level accumulators filled by main(), one entry per spreadsheet row.
# company_lst receives the merged company list for each row.
company_lst = []   
# Not used by main(); reassigned in a later cell before the DataFrame is built.
company_dict = {}
# degree_lst receives the unparsed degree answer for each row.
degree_lst = []
def main():
    """Run company and degree extraction for every row of the spreadsheet.

    Reads 'Data Scientist.xlsx' and, for each row, asks the model for the
    companies mentioned in the experience text and in the current job title,
    merges the two answers with person_company() and appends the result to
    the module-level ``company_lst``. The unparsed degree answer for the row
    is appended to the module-level ``degree_lst``.
    """
    # Only needed by extract_keywords, which is not called in this run.
    job_role = "Data Scientist"
    extractor = TaskExecution()

    frame = pd.read_excel('Data Scientist.xlsx')

    for _, row in frame.iterrows():
        experience_text = row['experience_section_company']
        title_text = row['currentJobTitle']

        # The model answers with a list literal; parse it, then de-duplicate.
        from_experience = unique(ast.literal_eval(extractor.extract_company(experience_text)))
        from_title = unique(ast.literal_eval(extractor.extract_company(title_text)))
        company_lst.append(person_company(from_experience, from_title))

        # Stored unparsed; it is converted with ast.literal_eval later on.
        degree_lst.append(extractor.extract_degree(row['education_degree']))


# Run the extraction only when this file is executed directly (in a notebook
# cell __name__ is "__main__" as well, so the cell triggers main()).
if __name__ == "__main__":
    main()
     

In [None]:
# Re-formatting the company list and fixing some of the entries

# One comma-separated string of company names per row.
final_list = [",".join(companies) for companies in company_lst]

# Manual override of the entry at index 2.
final_list[2] = "Government of India"


# Re-formatting the degree list -- each entry is a list literal stored as a string
degree_lst[2] = '[1,1]'  # manual override of the answer at index 2
final_degree_list = [ast.literal_eval(raw_answer) for raw_answer in degree_lst]

# NOTE: this binds a second name to the SAME list object, it is not a copy.
# The in-place rewrite below is therefore visible through count_degree_list too.
count_degree_list = final_degree_list

# Replace every parsed entry, in place, with its comma-joined string form.
for position, flags in enumerate(final_degree_list):
    final_degree_list[position] = ",".join(map(str, flags))



In [None]:
# Column-oriented data for the summary table: one row per profile.
company_dict = {
    'company_names': final_list,
    'degree': final_degree_list,
}
company_df = pd.DataFrame(company_dict)

In [None]:
# Bare expression: in a notebook this renders the DataFrame as the cell output.
company_df

In [None]:
import re

# All per-row company strings merged into one comma-separated string
input_str = ",".join(final_list) 


# Split the input string by lines and then by commas
lines = input_str.strip().split("\n")
all_companies = [line.split(",") for line in lines]

# Flatten the list of lists
all_companies = [item for sublist in all_companies for item in sublist]

# Remove unwanted words and convert to lowercase
# NOTE(review): "institude" looks like a typo for "institute" -- confirm before changing.
unwanted_words = ["Inc", "Pvt", "Corp", "Ltd", "nan","Limited",".", "pes", "ab", "bombay", "ltd", "ls", "new", "fis", "iit", "bit", "PVT", "fis", "indian", "international", "institude", "govt"]

# One pre-compiled pattern per unwanted entry, applied in list order.
# Alphanumeric entries only match as whole words, so "ab" no longer eats the
# middle of "Labs" and "nan" no longer eats part of "Finance". Other entries
# (the ".") are still removed wherever they occur. Matching stays
# case-sensitive, as before.
unwanted_patterns = [
    re.compile(r"\b" + re.escape(word) + r"\b") if word.isalnum() else re.compile(re.escape(word))
    for word in unwanted_words
]

cleaned_companies = []

for company in all_companies:
    # Remove unwanted words
    for pattern in unwanted_patterns:
        company = pattern.sub("", company)
    # Remove extra spaces and convert to lowercase
    company = re.sub(' +', ' ', company).strip().lower()
    if company:  # Skip names that became empty after cleaning
        cleaned_companies.append(company)

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# The word cloud is built from a single string: every cleaned company name,
# separated by commas.
cloud_text = ",".join(cleaned_companies)

# Generate the word cloud
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(cloud_text)

# Display the word cloud using matplotlib, without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()



In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# How often each cleaned company name occurs
word_counter = Counter(cleaned_companies)

# Keep only the names that occur at least `min_count` times
min_count = 2
filtered_word_count = {
    name: occurrences
    for name, occurrences in word_counter.items()
    if occurrences >= min_count
}

# The same names as a plain list, in the same order as the dict above
filtered_words = list(filtered_word_count)

print("Filtered word count:", filtered_word_count)
print("Filtered words:", filtered_words)
print(word_counter)

companies = list(filtered_words)
values = list(filtered_word_count.values())

# Horizontal bar plot: one bar per retained company name
plt.figure(figsize=(12, 6))
plt.barh(companies, values, color='skyblue')
plt.xlabel('Count')
plt.ylabel('Company')
plt.title('Company vs Count')
plt.show()

In [None]:
# DEGREE PLOTS

bachelor_count = 0
master_count = 0

for elem in count_degree_list:
    # Each entry is either a "bachelors,masters" string or a
    # [bachelors, masters] sequence. Splitting the string on the comma
    # replaces the old character indexing (elem[0] / elem[2]), which only
    # worked for single-character flags and raised IndexError on lists.
    flags = elem.split(",") if isinstance(elem, str) else list(elem)
    bachelor_count += int(flags[0])
    master_count += int(flags[1])

degrees = ['Bachelor', 'Master']
counts = [bachelor_count, master_count]
colors = ['blue', 'green']

# Create the bar plot
bars = plt.bar(degrees, counts, color=colors)
# Label each bar individually so that the legend lists both degrees
bars[0].set_label('Bachelor')
bars[1].set_label('Master')

plt.xlabel('Degree')
plt.ylabel('Count')
plt.title('Count of Degrees')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# A notebook only renders the value of the last expression in a cell, so
# both counts are shown together as one tuple: (bachelors, masters).
bachelor_count, master_count

In [None]:

# TESTING THIS

def person_company(experience_company_set, current_job_title_set):
    """Combine the company lists from the experience section and the job title.

    A list containing the string ``'nan'`` is treated as "no company found".
    Returns ``['nan']`` when both lists are missing, the other list unchanged
    when exactly one is missing, and otherwise a new list with the union of
    both.
    """
    no_experience = 'nan' in experience_company_set
    no_title = 'nan' in current_job_title_set

    if no_experience:
        # Fall back to the job title unless that is missing as well.
        return ['nan'] if no_title else current_job_title_set

    if no_title:
        return experience_company_set

    merged = set(current_job_title_set) | set(experience_company_set)
    return list(merged)

def unique(lst):
    """Return the distinct elements of ``lst`` in first-seen order.

    ``dict.fromkeys`` de-duplicates while keeping insertion order, so the
    result is the same on every run. Copying a ``set`` gives an order that
    can change between interpreter runs for strings.

    Args:
        lst: iterable of hashable items.

    Returns:
        A new list with duplicates removed.
    """
    return list(dict.fromkeys(lst))


def extract_degree(input):
    """Ask the chat model whether an education description shows a bachelors and a masters.

    Standalone variant that builds its own model client on every call. The
    prompt asks for a list literal in the form [bachelors,masters]; the
    model output is returned unparsed.
    """
    template_text = """ 
        You will be given a paragraph as input, which will be the education description of a person. Your task is to identify that whether the person has done bachelors and masters.
       The bachelors and masters will be binary variables (0,1). For any missing degree, assume the corresponding variable to be zero.

        Return only the python list in the format: [bachelors,masters]

        The inputted paragraph is:
        {input}

        """
    # The API key is a placeholder and must be replaced before running.
    model = OpenAIChat(model_name='gpt-3.5-turbo',openai_api_key="ENTER YOUR OWN API KEY",temperature=0)
    prompt = PromptTemplate(input_variables=['input'], template=template_text)
    return LLMChain(llm=model, prompt=prompt).run({"input": input})


# Ad-hoc check of extract_degree on one hand-written education string that
# mentions both a B.tech and an M.tech.
education = "-:-Post Graduate Certification, Artificial Intelligence and Machine Learning-:-B.tech(ECE) + M.tech(AI), electronics and communication, Artificial Intelligence"
# The model output is parsed as a Python literal; literal_eval raises if the
# answer is not a valid literal.
output = ast.literal_eval(extract_degree(education))

