## Install Required Libraries

In [1]:
!pip install scikit-learn pandas



## Load the Career Dataset

In [2]:
import pandas as pd

# Read the job records CSV (the path is relative to the working directory).
df = pd.read_csv("real_job_data.csv")

# Preview the first rows, then list the column labels, as plain text.
print(df.head(), "\nColumns:", df.columns, sep="\n")

   coding_skills  logical_quotient  communication_skills  \
0              7                 1                     3   
1              4                 3                     1   
2              8                 3                     5   
3              5                 5                     7   
4              7                 4                     3   

  interested_career_area management_or_technical Suggested Job Role  
0              developer               technical  Software Engineer  
1                   data              management    Project Manager  
2             management               technical    Project Manager  
3             management              management    Project Manager  
4             management               technical    Project Manager  

Columns:
Index(['coding_skills', 'logical_quotient', 'communication_skills',
       'interested_career_area', 'management_or_technical',
       'Suggested Job Role'],
      dtype='object')


## Convert Data into Text Representation

In [3]:
def create_text_representation(row):
    """Describe one record as a short sequence of English sentences.

    Args:
        row: Mapping-like record (for example a DataFrame row) that provides
            the keys ``coding_skills``, ``logical_quotient``,
            ``communication_skills``, ``interested_career_area`` and
            ``management_or_technical``.

    Returns:
        A single string with one sentence per field, separated by spaces.

    Raises:
        KeyError: If any of the expected keys is missing from ``row``.
    """
    sentences = [
        f"Coding skills level {row['coding_skills']}.",
        f"Logical quotient level {row['logical_quotient']}.",
        f"Communication skills level {row['communication_skills']}.",
        f"Interested career area {row['interested_career_area']}.",
        f"Role type {row['management_or_technical']}.",
    ]
    return " ".join(sentences)

# Build the sentence description for every row (axis=1 passes each row to the
# function) and store it in a new "career_text" column on the same frame.
df["career_text"] = df.apply(create_text_representation, axis=1)

# Show the generated text next to its target label for the first few rows.
df[["career_text", "Suggested Job Role"]].head()

Unnamed: 0,career_text,Suggested Job Role
0,Coding skills level 7. Logical quotient level ...,Software Engineer
1,Coding skills level 4. Logical quotient level ...,Project Manager
2,Coding skills level 8. Logical quotient level ...,Project Manager
3,Coding skills level 5. Logical quotient level ...,Project Manager
4,Coding skills level 7. Logical quotient level ...,Project Manager


## Generate TF-IDF Vectors

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vocabulary on the generated career sentences and encode every
# row in one pass; the result is a sparse matrix with one row per record.
# NOTE(review): with default settings the tokenizer only keeps tokens of two
# or more characters, so single-digit skill levels (e.g. "7") do not appear to
# contribute any feature — confirm this is intended.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["career_text"])

print(f"TF-IDF Shape: {tfidf_matrix.shape}")

TF-IDF Shape: (20000, 17)


## Export Career Vectors to JSON

In [5]:
import json

# Convert the whole sparse TF-IDF matrix to nested Python lists in one step
# instead of densifying and indexing it row by row inside a Python loop
# (df.iloc[i] builds a fresh Series for every row, which is slow).
dense_rows = tfidf_matrix.toarray().tolist()
job_roles = df["Suggested Job Role"].tolist()

# One record per dataset row: the target role paired with its TF-IDF vector.
# NOTE(review): the fitted vocabulary / IDF weights are not exported here, so
# a consumer of this file cannot vectorize new text on its own — confirm
# whether that is needed.
career_vectors = [
    {"job_role": job_role, "vector": vector}
    for job_role, vector in zip(job_roles, dense_rows)
]

# Persist all records as a single JSON array.
with open("career_vectors.json", "w") as f:
    json.dump(career_vectors, f)

print("Vector database saved successfully!")

Vector database saved successfully!
