In [1]:
import pandas as pd

In [2]:
df=pd.read_csv('job_descriptions.csv')

In [3]:
columns_to_drop = ['Job Id', 'latitude','longitude','Job Posting Date','Preference','Job Portal']
df = df.drop(columns=columns_to_drop)





In [4]:
df['skills'][3]

'Wireless network design and architecture Wi-Fi standards and protocols RF (Radio Frequency) planning and optimization Wireless security protocols Troubleshooting wireless network issues'

In [5]:
df.columns

Index(['Experience', 'Qualifications', 'Salary Range', 'location', 'Country',
       'Work Type', 'Company Size', 'Contact Person', 'Contact', 'Job Title',
       'Role', 'Job Description', 'Benefits', 'skills', 'Responsibilities',
       'Company', 'Company Profile'],
      dtype='object')

In [6]:
role_counts=df['Role'].value_counts()

In [7]:
for role, count in role_counts.items():
    print(f"{role}: {count}")

Interaction Designer: 20580
Network Administrator: 17470
User Interface Designer: 14036
Social Media Manager: 13945
User Experience Designer: 13935
Procurement Analyst: 13757
Social Media Analyst: 10659
Quality Assurance Analyst: 10541
SEO Specialist: 10512
Executive Assistant: 10496
Database Administrator: 10482
Procurement Manager: 10407
Data Analyst: 10406
Backend Developer: 10404
Demand Planner: 10362
Office Manager: 10361
Frontend Developer: 10308
Customer Success Manager: 10308
Retirement Planner: 10305
Account Executive: 7063
Inside Sales Representative: 7052
UX/UI Designer: 7028
Network Security Analyst: 7027
Paralegal: 7021
Training Coordinator: 7020
Event Coordinator: 7016
Personal Assistant: 7014
Sustainable Design Specialist: 7004
Data Scientist: 7003
Customer Support Specialist: 7002
Systems Administrator: 7000
Data Entry Specialist: 6988
Manufacturing Engineer: 6983
Residential Landscape Designer: 6979
Sales Account Manager: 6975
IT Project Manager: 6974
Portfolio Manager

# Slicing the df by Roles

In [8]:
#Filtering based on selective tech roles.

roles_of_interest = [
    'Data Analyst',
    'Data Scientist',
    'Data Architect',
    'Machine Learning Engineer',
    'Big Data Engineer',
    'Frontend Developer',
    'Backend Developer',
    'Systems Engineer',
    'Network Administrator',
    'Quality Assurance Analyst',
    'User Interface Designer',
    'User Experience Designer',
    'Social Media Analyst',
    'Database Administrator',
    'Systems Administrator',
    'IT Project Manager',
    'Business Intelligence Analyst',
    'DevOps Engineer',
    'IT Support Specialist',
    'Java Web Application Developer',
    'Enterprise Architect',
    'Data Analyst Researcher',
    'Software QA Tester',
    'SQL Database Developer',
    'Network Security Specialist',
    'QA Tester',
    'Full-Stack Developer'
    
]

# Filter the DataFrame for the roles of interest
df = df[df['Role'].isin(roles_of_interest)]

# Get the value counts for each role
df['Role'].value_counts()




Network Administrator             17470
User Interface Designer           14036
User Experience Designer          13935
Social Media Analyst              10659
Quality Assurance Analyst         10541
Database Administrator            10482
Data Analyst                      10406
Backend Developer                 10404
Frontend Developer                10308
Data Scientist                     7003
Systems Administrator              7000
IT Project Manager                 6974
Business Intelligence Analyst      6939
DevOps Engineer                    6918
IT Support Specialist              6799
Java Web Application Developer     3562
Enterprise Architect               3556
Data Architect                     3536
Systems Engineer                   3533
Data Analyst Researcher            3531
Software QA Tester                 3527
Big Data Engineer                  3521
SQL Database Developer             3488
Network Security Specialist        3487
QA Tester                          3451


In [9]:
# new_df=df.copy()

In [10]:
# df=new_df.copy()

In [11]:
# df.shape

In [12]:
# Filter DataFrame to include only records with roles of interest
subset_df = df[df['Role'].isin(roles_of_interest)]

# Calculate the number of records needed for each role
records_per_role = 120000 // len(roles_of_interest)

# Create the final subset DataFrame
df = pd.concat([subset_df[subset_df['Role'] == role].sample(n=records_per_role, replace=True) for role in roles_of_interest])


# Feature Engineering

In [13]:
#Transformation on Experience col

In [14]:
pip install inflect

Note: you may need to restart the kernel to use updated packages.


In [15]:

import inflect

def convert_numbers_to_words(text):
    text_lower = text.lower()
    p = inflect.engine()
    
    words = []
    for word in text_lower.split():
        if word.isdigit():
            # Convert number to word
            word = p.number_to_words(word)
        words.append(word)
    return ' '.join(words)

df['Experience'] = df['Experience'].apply(convert_numbers_to_words)
df['Experience'] = df['Experience'].str.lower()



In [16]:
df=df.reset_index()

In [17]:
df['Experience'][2]

'one to nine years'

In [18]:
#Transformation on Qualification col

In [19]:
df['Qualifications'].unique()

array(['BBA', 'MBA', 'M.Com', 'BA', 'PhD', 'M.Tech', 'MCA', 'B.Com',
       'BCA', 'B.Tech'], dtype=object)

In [20]:

qualification_map = {
    'M.Tech': 'Master of Technology',
    'BCA': 'Bachelor of Computer Applications',
    'PhD': 'Doctor of Philosophy',
    'MBA': 'Master of Business Administration',
    'MCA': 'Master of Computer Applications',
    'M.Com': 'Master of Commerce',
    'BBA': 'Bachelor of Business Administration',
    'B.Tech': 'Bachelor of Technology',
    'B.Com': 'Bachelor of Commerce',
    'BA': 'Bachelor of Arts'
}

# Function to expand qualifications
def expand_qualifications(value):
    return qualification_map.get(value, value)  # If the value is not found in the map, return the original value

# Expand qualifications in the DataFrame
df['Qualifications'] = df['Qualifications'].apply(expand_qualifications)


In [21]:
#Transforming Salary Range

In [22]:
df['Salary Range'][2]

'$59K-$112K'

In [23]:
pip install num2words

Note: you may need to restart the kernel to use updated packages.


In [24]:
from num2words import num2words

# Function to convert number to words
def salary_range_to_words(salary):
    parts = salary.split('-')  # Split the salary range
    lower = int(parts[0][1:-1])  # Extract lower salary and remove '$' and 'K'
    upper = int(parts[1][1:-1])  # Extract upper salary and remove '$' and 'K'

    lower_in_words = num2words(lower * 1000).capitalize() + ' dollars'  # Convert lower salary to words
    upper_in_words = num2words(upper * 1000).capitalize() + ' dollars'  # Convert upper salary to words

    return f"{lower_in_words} to {upper_in_words}"

# Expand salary range in the DataFrame
df['Salary Range'] = df['Salary Range'].apply(salary_range_to_words)


In [25]:
#Transformation of location feature

In [26]:
#we tried to map these location properly using the geopy, but the location is not mapping properly

In [27]:
# import re

# abnormal_locations = df[df['Country'].str.contains(r'[^\w\s]', regex=True)]['Country']

# print(len(abnormal_locations))

In [28]:
df['location'][0]

'Honiara'

In [29]:
df['Country'][20]

'Timor-Leste'

In [30]:
import pandas as pd
import re
from unidecode import unidecode
import spacy


# Function to clean abnormal locations
def clean_location(location):
    # Fix encoding issues using unidecode
    cleaned_location = unidecode(location)
    
    # Remove unwanted characters like parentheses, commas, etc.
    cleaned_location = re.sub(r'[^\w\s]', '', cleaned_location)
    
    return cleaned_location.strip()  # Remove leading/trailing spaces

# Load SpaCy's English model
nlp = spacy.load("en_core_web_lg")

# Function to extract only location names using SpaCy
def extract_location_name(text):
    doc = nlp(text)
    locations = [ent.text for ent in doc.ents if ent.label_ == 'GPE']  # Extract entities labeled as GeoPolitical Entities
    return locations[0] if locations else text  # Return the first location entity or original text if no location found

# Process 'location' column
abnormal_locations_location = df[df['location'].str.contains(r'[^\w\s]', regex=True)]['location']

for index in abnormal_locations_location.index:
    extracted_location = extract_location_name(str(df.at[index, 'location']))  # Convert to string
    df.at[index, 'location'] = clean_location(extracted_location)

# Process 'country' column
abnormal_locations_country = df[df['Country'].str.contains(r'[^\w\s]', regex=True)]['Country']

for index in abnormal_locations_country.index:
    extracted_country = extract_location_name(str(df.at[index, 'Country']))  # Convert to string
    df.at[index, 'Country'] = clean_location(extracted_country)







In [31]:
df['Contact'][2]

'772.531.3087x66671'

In [32]:
#Transforming Company Size

In [33]:
# Check for non-integer values in 'Company Size' column
non_integer_values = df[~df['Company Size'].astype(str).str.isdigit()]['Company Size']

if non_integer_values.empty:
    print("There are no non-integer values in the 'Company Size' column.")
else:
    print("Non-integer values found in the 'Company Size' column:")
    print(non_integer_values)

There are no non-integer values in the 'Company Size' column.


In [34]:
# Transforming Job title col

In [35]:
def find_abnormal_characters(text):
    # Using regex to identify non-alphabetical characters, excluding spaces and numbers
    abnormal_chars = re.findall(r'[^A-Za-z\s0-9]', text)
    return abnormal_chars

# Identify and count abnormal characters in job titles
abnormal_jobs = df[df['Job Title'].apply(lambda x: bool(find_abnormal_characters(x)))]

if not abnormal_jobs.empty:
    abnormal_jobs['Abnormal Characters'] = abnormal_jobs['Job Title'].apply(find_abnormal_characters)
    abnormal_count = abnormal_jobs.shape[0]

    print(f"Total abnormal job titles found: {abnormal_count}")
    print("\nAbnormal job titles and their abnormal characters:")
    print(abnormal_jobs[['Job Title', 'Abnormal Characters']])
else:
    print("No abnormal job titles found.")

Total abnormal job titles found: 8888

Abnormal job titles and their abnormal characters:
            Job Title Abnormal Characters
44440  UX/UI Designer                 [/]
44441  UX/UI Designer                 [/]
44442  UX/UI Designer                 [/]
44443  UX/UI Designer                 [/]
44444  UX/UI Designer                 [/]
...               ...                 ...
53323  UX/UI Designer                 [/]
53324  UX/UI Designer                 [/]
53325  UX/UI Designer                 [/]
53326  UX/UI Designer                 [/]
53327  UX/UI Designer                 [/]

[8888 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abnormal_jobs['Abnormal Characters'] = abnormal_jobs['Job Title'].apply(find_abnormal_characters)


In [36]:
def replace_abnormal_with_space(text):
    # Using regex to replace non-alphabetical characters (excluding spaces and numbers) with space
    return re.sub(r'[^A-Za-z\s0-9]', ' ', text)

# Replace abnormal characters in job titles
df['Job Title'] = df['Job Title'].apply(replace_abnormal_with_space)


In [37]:
df['Job Title'][2]

'Marketing Analyst'

In [38]:
# Transforming Role col

In [39]:
def find_abnormal_characters(text):
    # Using regex to identify non-alphabetical characters, excluding spaces and numbers
    abnormal_chars = re.findall(r'[^A-Za-z\s0-9]', text)
    return abnormal_chars

# Identify and count abnormal characters in Role
abnormal_jobs = df[df['Role'].apply(lambda x: bool(find_abnormal_characters(x)))]

if not abnormal_jobs.empty:
    abnormal_jobs['Abnormal Characters'] = abnormal_jobs['Role'].apply(find_abnormal_characters)
    abnormal_count = abnormal_jobs.shape[0]

    print(f"Total abnormal job Roles found: {abnormal_count}")
    print("\nAbnormal job Roles and their abnormal characters:")
    print(abnormal_jobs[['Role', 'Abnormal Characters']])
else:
    print("No abnormal job Role found.")

Total abnormal job Roles found: 4444

Abnormal job Roles and their abnormal characters:
                        Role Abnormal Characters
115544  Full-Stack Developer                 [-]
115545  Full-Stack Developer                 [-]
115546  Full-Stack Developer                 [-]
115547  Full-Stack Developer                 [-]
115548  Full-Stack Developer                 [-]
...                      ...                 ...
119983  Full-Stack Developer                 [-]
119984  Full-Stack Developer                 [-]
119985  Full-Stack Developer                 [-]
119986  Full-Stack Developer                 [-]
119987  Full-Stack Developer                 [-]

[4444 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abnormal_jobs['Abnormal Characters'] = abnormal_jobs['Role'].apply(find_abnormal_characters)


In [40]:
def replace_abnormal_with_space(text):
    # Using regex to replace non-alphabetical characters (excluding spaces and numbers) with space
    return re.sub(r'[^A-Za-z\s0-9]', ' ', text)

# Replace abnormal characters in job titles
df['Role'] = df['Role'].apply(replace_abnormal_with_space)


In [41]:
df['Role'][2]

'Data Analyst'

In [42]:
#Transforming job description

In [43]:
def clean_description(text):
    # Define the characters to be removed
    characters_to_remove = '-?/=+_'
    
    # Use str.replace() to remove specified characters
    cleaned_text = text
    for char in characters_to_remove:
        cleaned_text = cleaned_text.replace(char, '')

    return cleaned_text

df['Job Description'] = df['Job Description'].apply(clean_description)


In [44]:
def has_abnormal_characters(text):
    # Using regex to identify abnormal characters like emojis or parentheses
    abnormal_chars = re.findall(r'[^\w\s.,!?]', text)
    return bool(abnormal_chars)

# Identify rows with abnormal characters in job descriptions
abnormal_descriptions = df[df['Job Description'].apply(has_abnormal_characters)]

if not abnormal_descriptions.empty:
    print("Rows with abnormal characters in Job Description:")
    print(abnormal_descriptions['Job Description'])
else:
    print("No abnormal characters found in Job Description.")

Rows with abnormal characters in Job Description:
75548     DevOps Engineers focus on automating and strea...
75549     DevOps Engineers focus on automating and strea...
75550     DevOps Engineers focus on automating and strea...
75551     DevOps Engineers focus on automating and strea...
75552     DevOps Engineers focus on automating and strea...
                                ...                        
106651    SQL Database Developers design, implement, and...
106652    SQL Database Developers design, implement, and...
106653    SQL Database Developers design, implement, and...
106654    SQL Database Developers design, implement, and...
106655    SQL Database Developers design, implement, and...
Name: Job Description, Length: 8888, dtype: object


In [45]:
df["Job Description"][75550]

'DevOps Engineers focus on automating and streamlining the development and deployment processes. They work to improve collaboration between development and IT operations, implementing tools and practices for continuous integration and continuous delivery (CICD).'

In [46]:
#Transforming Benefits col

In [47]:
def clean_benefits(text):
    # Remove words inside parentheses
    text = re.sub(r'\([^)]*\)', '', text)
    
    # Remove emojis and special characters
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = re.sub(r'[^\w\s,]', '', text)
    
    # Remove {, }, and '
    text = text.replace('{', '').replace('}', '').replace("'", '')

    return text.strip()

# Clean the benefits column
df['Benefits'] = df['Benefits'].apply(clean_benefits)


In [48]:
#Transforming Responsibilities col

In [49]:
def has_abnormal_characters(text):
    # Using regex to identify abnormal characters like emojis or parentheses
    abnormal_chars = re.findall(r'[^\w\s.,!?]', text)
    return bool(abnormal_chars)

# Identify rows with abnormal characters in Responsibilities
abnormal_descriptions = df[df['Responsibilities'].apply(has_abnormal_characters)]

if not abnormal_descriptions.empty:
    print("Rows with abnormal characters in Responsibilities:")
    print(abnormal_descriptions['Responsibilities'])
else:
    print("No abnormal characters found in Responsibilities")

Rows with abnormal characters in Responsibilities:
4444      Apply machine learning algorithms and statisti...
4445      Apply machine learning algorithms and statisti...
4446      Apply machine learning algorithms and statisti...
4447      Apply machine learning algorithms and statisti...
4448      Apply machine learning algorithms and statisti...
                                ...                        
119983    Work on both frontend and backend aspects, cre...
119984    Work on both frontend and backend aspects, cre...
119985    Work on both frontend and backend aspects, cre...
119986    Work on both frontend and backend aspects, cre...
119987    Work on both frontend and backend aspects, cre...
Name: Responsibilities, Length: 57772, dtype: object


In [50]:
import re

def clean_description(text):
    # Define the characters to be removed
    characters_to_remove = '-?/=+_'
    
    # Use str.replace() to remove specified characters
    cleaned_text = text
    for char in characters_to_remove:
        cleaned_text = cleaned_text.replace(char, '')

    # Remove words inside parentheses
    cleaned_text = re.sub(r'\([^)]*\)', '', cleaned_text)
    
    return cleaned_text


df['Responsibilities'] = df['Responsibilities'].apply(clean_description)


In [51]:
df['Responsibilities'][2]

'Analyze marketing data to measure campaign performance and ROI. Provide insights for marketing strategy adjustments. Create data visualizations and dashboards.'

In [52]:
#Skills transformation

In [53]:
import re
def clean_skills(skills):
    # Replace 'e.g.,' with 'example'
    skills = skills.replace('e.g.,', 'example')

    # Remove parentheses and other special characters
    skills = re.sub(r'[(){}[\]]', '', skills)

    return skills

# Apply the cleaning function to the 'Skills' column
df['skills'] = df['skills'].apply(clean_skills)


In [54]:
df['skills'][2]

'Data analysis tools example SQL, Python Data visualization tools example Tableau, Power BI Statistical analysis Data cleansing and transformation Data modeling Communication of data insights Problem-solving Attention to detail Business acumen'

In [55]:
#seperating the sector and industry,website

In [56]:
import ast

# Function to convert string representations of dictionaries to actual dictionaries
def convert_to_dict(profile_str):
    try:
        return ast.literal_eval(profile_str)
    except (SyntaxError, ValueError):
        return {}

# Apply the conversion function to the 'Company Profile' column
df['Company Profile'] = df['Company Profile'].apply(convert_to_dict)

# Function to extract values from the dictionary
def extract_value(profile, key):
    if isinstance(profile, dict):
        return profile.get(key, None)
    else:
        return None

# Keys to extract values from 'Company Profile'
keys_to_extract = ['Sector', 'Industry', 'Website']

# Apply the extraction function to create new columns with only the values
for key in keys_to_extract:
    df[key] = df['Company Profile'].apply(lambda x: extract_value(x, key))

# Drop the original 'Company Profile' column
df = df.drop('Company Profile', axis=1)



In [57]:
#transformation of Industry col

In [58]:
# Check for 'None' or empty values in 'Sector' column and replace with 'unknown'
df['Industry'].fillna('unknown', inplace=True)
df['Industry'] = df['Industry'].replace('', 'unknown')


In [59]:
# # Function to clean the 'Industry' column
def clean_industry(industry):
    # Replace ":" with "in"
    industry = re.sub(r':', ' in ', industry)
    
    # Remove text within parentheses
    industry = re.sub(r'\([^)]*\)', '', industry)
    
    # Replace "&" with "and"
    industry = re.sub(r'&', ' and ', industry)
    
    # Replace "/" with "or"
    industry = re.sub(r'/', ' or ', industry)
    
    # Replace "-" with "based on" if not surrounded by alphabets
    industry = re.sub(r'(?<![a-zA-Z])-(?![a-zA-Z])', ' based on ', industry)
    
    return industry.strip()


# Clean the 'Industry' column
df['Industry'] = df['Industry'].apply(clean_industry)


In [60]:
df.head(1)

Unnamed: 0,index,Experience,Qualifications,Salary Range,location,Country,Work Type,Company Size,Contact Person,Contact,Job Title,Role,Job Description,Benefits,skills,Responsibilities,Company,Sector,Industry,Website
0,910174,five to eleven years,Bachelor of Business Administration,Fifty-nine thousand dollars to One hundred and...,Honiara,Solomon Islands,Full-Time,25733,Maureen Walsh,460.454.1073,Marketing Analyst,Data Analyst,"Analyze data sets, generate insights, and prov...","Tuition Reimbursement, Stock Options or Equity...","Data analysis tools example SQL, Python Data v...",Analyze marketing data to measure campaign per...,"Harley-Davidson, Inc.",Automotive/Motorcycles,Automotive,https://www.harley-davidson.com/


In [61]:
#transformation of Industry col

In [62]:
# Check for 'None' or empty values in 'Sector' column and replace with 'unknown'
df['Sector'].fillna('unknown', inplace=True)
df['Sector'] = df['Sector'].replace('', 'unknown')


In [63]:
# # Function to clean the 'Industry' column
def clean_industry(industry):
    # Replace ":" with "in"
    industry = re.sub(r':', ' in ', industry)
    
    # Remove text within parentheses
    industry = re.sub(r'\([^)]*\)', '', industry)
    
    # Replace "&" with "and"
    industry = re.sub(r'&', ' and ', industry)
    
    # Replace "/" with "or"
    industry = re.sub(r'/', ' or ', industry)
    
    # Replace "-" with "based on" if not surrounded by alphabets
    industry = re.sub(r'(?<![a-zA-Z])-(?![a-zA-Z])', ' based on ', industry)
    
    return industry.strip()


# Clean the 'Industry' column
df['Sector'] = df['Sector'].apply(clean_industry)


In [64]:
df["Sector"][2]

'Communication Services'

In [65]:
#transformation of Company col

In [66]:
# Check for 'None' or empty values in 'Sector' column and replace with 'unknown'
df['Company'].fillna('unknown', inplace=True)
df['Company'] = df['Company'].replace('', 'unknown')


In [67]:
# Check for 'None' or empty values in 'Sector' column and replace with 'unknown'
df['Website'].fillna('unknown', inplace=True)
df['Website'] = df['Website'].replace('', 'unknown')


In [68]:
#Cosmetic data cleaning steps based on Tableau

In [69]:
# Check for numerical characters in 'Country' column and drop rows
numeric_country_indices = df[df['Country'].str.contains(r'\d', regex=True)].index
df.drop(numeric_country_indices, inplace=True)

# Check for numerical characters in 'Location' column and drop rows
numeric_location_indices = df[df['location'].str.contains(r'\d', regex=True)].index
df.drop(numeric_location_indices, inplace=True)

df = df[df['Sector'] != 'unknown']


In [70]:
df=df.reset_index()
df.drop(columns=['index'], inplace=True)

In [71]:
import pandas as pd

In [72]:
# df=df.to_csv("cleaned_job_descp.csv")

In [73]:
# df=pd.read_csv("cleaned_job_descp.csv")

In [74]:
df = df.sample(frac=1, random_state=100)  # Shuffle the DataFrame (frac=1 shuffles all rows)
# df = df.tail(1000)  # Select the first 1000 rows


In [75]:
df=df.drop(columns=['Contact','Contact Person'])

In [112]:
df.head(1)

Unnamed: 0,level_0,Experience,Qualifications,Salary Range,location,Country,Work Type,Company Size,Job Title,Role,Job Description,Benefits,skills,Responsibilities,Company,Sector,Industry,Website,jd_bfr_tran
47013,47262,four to eleven years,Doctor of Philosophy,Sixty-three thousand dollars to Ninety-two tho...,AsunciÃ³n,Paraguay,Contract,111636,UX UI Designer,User Interface Designer,User Interface Designers focus on the visual a...,"Tuition Reimbursement, Stock Options or Equity...",UI design principles and best practices Graphi...,Create visually appealing user interfaces tha...,Celgene Corporation,Pharmaceuticals,Pharmaceuticals,https://www.celgene.com/,Job Brief:\nWe are seeking a skilled UX UI Des...


In [77]:
df.shape

(119390, 18)

In [92]:
df['Job Description'][53532]

'Social Media Analysts analyze social media data and metrics to provide insights and recommendations for improving social media strategies. They track performance, identify trends, and support datadriven decisionmaking.'

# Creating a user input output format

In [144]:
def generate_job_description(row):
    jd = (
        f"Job Brief:\n"
        f"We are seeking a skilled {row['Job Title']} to join our team. The ideal candidate will bring {row['Experience']} of demonstrated expertise within the {row['Industry']} industry.\n\n"
        f"Key Responsibilities:\n"
        f"{row['Responsibilities']}\n"
        f"{row['Job Description']}\n\n"
        f"Requirements:\n"
        f"The preferred candidate should hold a {row['Qualifications']} degree or an equivalent educational qualification. Additionally, a robust command of the essential skills in {row['skills']} is highly valued.\n\n"
    )
    return jd

In [145]:
df['jd_prompt'] = df.apply(generate_job_description, axis=1)


In [146]:
print(df['jd_prompt'][98765])

Job Brief:
We are seeking a skilled Database Developer to join our team. The ideal candidate will bring three to thirteen years of demonstrated expertise within the Computer Software industry.

Key Responsibilities:
Design, develop, and maintain SQL databases, ensuring data integrity and performance. Write complex SQL queries and stored procedures. Troubleshoot database issues and optimize queries.
SQL Database Developers design, implement, and maintain relational databases using SQL (Structured Query Language). They write queries, optimize database performance, and ensure data integrity and security.

Requirements:
The preferred candidate should hold a Bachelor of Computer Applications degree or an equivalent educational qualification. Additionally, a robust command of the essential skills in SQL Structured Query Language Database design Query optimization Data modeling Database maintenance Problem-solving skills is highly valued.




In [385]:
# Job Title
# Experience
# Industry
# Sector
# Role
# Responsibilities
# Qualifications
# skills

In [147]:
def generate_user_prompt(row):
    user_prompt = (
        f"Generate a job description for a {row['Job Title']}."
        f"The candidate should have skills in {row['skills']}. "
        f"The job description should include the industry of the company as ({row['Industry']})."
    )
    return user_prompt


In [387]:
# Role
# skills
# Industry


In [148]:
df['user_prompt'] = df.apply(generate_user_prompt, axis=1)
 

In [149]:
df=df.reset_index()

ValueError: cannot insert level_0, already exists

In [138]:
print(df['user_prompt'][7])

Generate a job description for a UX UI Designer.The candidate should have skills in User-centered design principles UX/UI design tools example Sketch, Adobe XD Wireframing and prototyping Usability testing and user research Information architecture and user flows. The job description should include the industry of the company as (Energy  based on  Oil  and  Gas Exploration  and  Production).


In [139]:
print(df['jd_prompt'][7]) 

Job Brief:
We are seeking a skilled UX UI Designer to join our team. The ideal candidate will bring two to eight years of demonstrated expertise within the Energy  based on  Oil  and  Gas Exploration  and  Production industry.

Key Responsibilities:
Design and improve user experiences  by creating intuitive and userfriendly interfaces for websites and applications. Conduct user research and usability testing. Collaborate with stakeholders to gather and incorporate feedback.
User Experience Designers create intuitive and userfriendly digital interfaces. They conduct user research, design prototypes, and work to enhance the overall user experience of websites and applications.

Requirements:
The preferred candidate should hold a Bachelor of Computer Applications degree or an equivalent educational qualification. Additionally, a robust command of the essential skills in User-centered design principles UX/UI design tools example Sketch, Adobe XD Wireframing and prototyping Usability testin

In [394]:
df.shape

(119463, 21)

In [128]:
df_dummy

Unnamed: 0,jd_bfr_tran,user_prompt,skills
0,Job Brief:\nWe are seeking a skilled Software ...,Generate a job description for a Software Engi...,Proficiency in one or more backend programming...
1,Job Brief:\nWe are seeking a skilled Network T...,Generate a job description for a Network Techn...,Network security Cybersecurity Intrusion detec...
2,Job Brief:\nWe are seeking a skilled Social Me...,Generate a job description for a Social Media ...,Social media analytics tools example Google An...
3,Job Brief:\nWe are seeking a skilled Data Scie...,Generate a job description for a Data Scientis...,Machine learning algorithms Python programming...
4,Job Brief:\nWe are seeking a skilled Web Devel...,Generate a job description for a Web Developer...,Frontend and backend development Database inte...
...,...,...,...
119385,Job Brief:\nWe are seeking a skilled Software ...,Generate a job description for a Software Engi...,"Automation and scripting example Python, Bash ..."
119386,Job Brief:\nWe are seeking a skilled Network E...,Generate a job description for a Network Engin...,Network management Troubleshooting Network sec...
119387,Job Brief:\nWe are seeking a skilled Systems A...,Generate a job description for a Systems Admin...,Technical troubleshooting Hardware and softwar...
119388,Job Brief:\nWe are seeking a skilled Aerospace...,Generate a job description for a Aerospace Eng...,Systems engineering System architecture Integr...


# synthetic creation of dataset

In [152]:
df_dummy=df[['jd_prompt','user_prompt','skills']].copy()


In [154]:
df_dummy = df_dummy.sample(frac=1, random_state=100) 
df_dummy=df_dummy.reset_index()
df_dummy = df_dummy.drop(columns=['index'])
# df_dummy = df_dummy.drop(columns=['level_0'])

In [155]:
# Creating a new DataFrame 'df_dummy'
# Selecting only 'jd_prompt' column
instruct = "Transform the jd_prompt into a job description,the skills and responsibilities mentioned in the jd_prompt should be present in the final generated job description."
df_dummy['instruct'] = instruct
# df_dummy = df_dummy.head(5000)  # Limiting to 200 rows (adjust as needed)

# Saving the new DataFrame to a CSV file
df_dummy.to_csv("jd_template_new.csv", index=False)  # Set index=False to avoid saving row indices

In [144]:
# df_dummy['jd_prompt'][5]

In [157]:
# duplicates = combined_df[df.duplicated()]

# # Display duplicates, if any
# if not duplicates.empty:
#     print("Duplicate Rows:")
#     print(duplicates)
# else:
#     print("No duplicates found.")

In [159]:
df_1=df_dummy[:10].copy()
df_2=df_dummy[5000:10000].copy()
df_3=df_dummy[10000:15000].copy()
df_4=df_dummy[15000:20800].copy()

In [160]:
df_1.to_csv('split1.csv')
# df_2.to_csv('aneeth.csv')
# df_3.to_csv('rashmi.csv')
# df_4.to_csv('arpita.csv')

In [407]:
df1=pd.read_csv("jd_template.csv")

In [405]:
df1 = df1.head(5000)

In [406]:
df1.to_csv('aneeth_test.csv')

In [158]:
df_1

Unnamed: 0,jd_prompt,user_prompt,skills,instruct
0,Job Brief:\nWe are seeking a skilled Social Me...,Generate a job description for a Social Media ...,Social media analytics tools example Google An...,Transform the jd_prompt into a job description...
1,Job Brief:\nWe are seeking a skilled Software ...,Generate a job description for a Software Engi...,"Automation and scripting example Python, Bash ...",Transform the jd_prompt into a job description...
2,Job Brief:\nWe are seeking a skilled Systems A...,Generate a job description for a Systems Admin...,Technical troubleshooting Hardware and softwar...,Transform the jd_prompt into a job description...
3,Job Brief:\nWe are seeking a skilled Java Deve...,Generate a job description for a Java Develope...,Web application development Java web framework...,Transform the jd_prompt into a job description...
4,Job Brief:\nWe are seeking a skilled Network A...,Generate a job description for a Network Admin...,System administration Server maintenance Activ...,Transform the jd_prompt into a job description...
...,...,...,...,...
4995,Job Brief:\nWe are seeking a skilled Web Devel...,Generate a job description for a Web Developer...,Frontend and backend development Database inte...,Transform the jd_prompt into a job description...
4996,Job Brief:\nWe are seeking a skilled Systems A...,Generate a job description for a Systems Admin...,"Database management systems example MySQL, Ora...",Transform the jd_prompt into a job description...
4997,Job Brief:\nWe are seeking a skilled Systems A...,Generate a job description for a Systems Admin...,Technical troubleshooting Hardware and softwar...,Transform the jd_prompt into a job description...
4998,Job Brief:\nWe are seeking a skilled Network T...,Generate a job description for a Network Techn...,Network security Cybersecurity Intrusion detec...,Transform the jd_prompt into a job description...
