In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from google.colab import files
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler


# Subjects
# Every course a synthetic student receives a grade for. The order of this
# list is also the column order of the feature matrix built from
# df[subjects], so entries must not be reordered casually.
subjects = [
    "Calculus",
    "Analytic Geometry And Algebra",
    "Descriptive Statistics",
    "Introduction To Algorithms",
    "Generative Ai",
    "Probability Theory",
    "Discrete Math",
    "Python",
    "Introduction to Low Code",
    "Applied Statistics",
    "C Programming",
    "Computer Science",
    "Physics",
    "Management",
    "Numerical Methods",
    "Logic",
    "Information Theory",
    "Architecture",
    "Algorithms",
    "C Sharp",
    "Systems",
    "Math For CS",
    "Computer Networks",
    "Human Computer Interaction",
    "Machine Learning",
    "Complexity",
    "Databases",
    "OOP",
    "Data Structures",
    "Systems Programming",
    "Parallel Programming",
    "Functional Programming",
    "Probability And Statistics",
    "Graph Theory",
    "Theory Of Automata",
    "Computer Graphics",
    # NOTE(review): the "%" below looks like a typo for "&" - confirm before
    # renaming, because this string is written out as a CSV column header.
    "Advanced Functional Programming % Abstract Data Type",
    "Signal Processing",
    "Software Engineering",
    "Computer Security",
    "Databases 2",
    "Artificial Intelligence",
]

# Jobs rules (simplified for demo)
# Maps each job title to the subjects that characterise it. Every value is a
# two-element [low, high] grade range (both ends inclusive) that the data
# generator draws from for that subject; subjects not listed for a job are
# treated as unrelated and receive a low grade instead.
#
# Subject keys must match the spelling in `subjects` exactly (the lookup is
# case-sensitive), otherwise the rule is silently skipped by the generator.
#
# NOTE(review): some jobs share an identical rule set (e.g. "UI/UX Designer",
# "Product Design Intern" and "UX Research Assistant"; "Kernel / Systems
# Programmer" and "Operating Systems Developer"). Such classes cannot be told
# apart from the generated grades, which likely caps achievable accuracy.
jobs_rules = {
    # 1st Year
    "Data Analyst": {"Descriptive Statistics":[5,5],"Python":[4,5],"Probability Theory":[4,5],"Calculus":[3,4]},
    "Research Assistant": {"Calculus":[4,5],"Analytic Geometry And Algebra":[4,5],"Descriptive Statistics":[3,4]},
    "Statistical Assistant": {"Descriptive Statistics":[5,5],"Probability Theory":[4,5],"Applied Statistics":[3,4]},
    "Data Quality Analyst": {"Descriptive Statistics":[5,5],"Python":[3,4],"Introduction to Low Code":[3,4]},
    "Insurance Data Assistant": {"Probability Theory":[5,5],"Descriptive Statistics":[4,5],"Python":[3,4]},
    "AI Assistant Developer": {"Generative Ai":[5,5],"Python":[4,5],"Introduction To Algorithms":[3,4]},
    "Junior Python Developer": {"Python":[5,5],"Introduction To Algorithms":[4,5],"Discrete Math":[3,4]},
    "QA Automation Assistant": {"Python":[4,5],"Introduction To Algorithms":[3,4],"Introduction to Low Code":[3,4]},
    "Software Engineering Intern": {"Python":[4,5],"Introduction To Algorithms":[4,5],"Discrete Math":[3,4]},

    # 2nd Year
    "Applied Statistics Specialist": {"Applied Statistics":[5,5],"Probability Theory":[4,5],"Descriptive Statistics":[4,5]},
    "Junior Statistical Analyst": {"Applied Statistics":[5,5],"Descriptive Statistics":[4,5],"Numerical Methods":[3,4]},
    "Biostatistics Assistant": {"Applied Statistics":[5,5],"Probability Theory":[4,5]},
    # Job title is "C Programmer" (the role), matching the job-information
    # table; "C Programming" is the name of the subject, not of the job.
    "C Programmer": {"C Programming":[5,5],"Architecture":[3,4]},
    "Technical Product Coordinator": {"Management":[4,5],"Introduction to Low Code":[3,4],"Information Theory":[2,3]},
    "C# Developer": {"C Sharp":[5,5],"OOP":[4,5],"Algorithms":[3,4]},
    "Data Processing Specialist": {"Applied Statistics":[4,5],"Python":[4,5],"Management":[3,4]},
    "Technical Workflow Coordinator": {"Management":[4,5],"Introduction to Low Code":[4,5]},
    "Junior Technical Consultant": {"Management":[4,5],"Python":[3,4],"Information Theory":[2,3]},
    "ASIC Design Engineer": {"Architecture":[5,5],"C Programming":[4,5],"Information Theory":[3,4]},

    # 3rd Year
    "Linux Support Technician": {"Systems":[5,5],"Computer Networks":[4,5]},
    "DevOps Intern": {"Systems":[5,5],"OOP":[3,4],"Databases":[3,4]},
    "Network Technician": {"Computer Networks":[5,5],"Systems":[4,5]},
    "UI/UX Designer": {"Human Computer Interaction":[5,5]},
    "Data Management Assistant": {"Databases":[5,5],"Data Structures":[4,5]},
    "ML Model Testing Intern": {"Machine Learning":[5,5],"Python":[4,5],"Data Structures":[3,4]},
    # "OOP" (not "Oop"): must match the subject spelling to take effect.
    "QA Automation Engineer (Entry-Level)": {"Python":[4,5],"OOP":[4,5],"Data Structures":[3,4]},
    "Product Design Intern": {"Human Computer Interaction":[5,5]},
    "Junior Database Administrator": {"Databases":[5,5],"Information Theory":[3,4]},
    "Cloud Engineering Intern": {"Systems":[4,5],"Computer Networks":[4,5],"OOP":[3,4]},
    "UX Research Assistant": {"Human Computer Interaction":[5,5]},
    "Backend Developer Intern": {"OOP":[5,5],"Data Structures":[4,5],"Algorithms":[4,5]},

    # 4th Year
    "Systems Engineer": {"Systems":[5,5],"Computer Networks":[4,5],"Architecture":[4,5]},
    "Kernel / Systems Programmer": {"Systems":[5,5],"C Programming":[5,5],"Architecture":[4,5]},
    "High-Performance Computing (HPC) Engineer": {"Math For CS":[5,5],"Complexity":[4,5],"Systems":[4,5]},
    "Distributed Systems Engineer": {"Algorithms":[5,5],"Computer Networks":[4,5],"Databases":[4,5]},
    "Graph Algorithms Engineer": {"Algorithms":[5,5],"Complexity":[4,5]},
    "Verification Engineer": {"Information Theory":[4,5],"Architecture":[4,5],"Algorithms":[3,4]},
    "Graphics Programmer": {"C Sharp":[4,5],"OOP":[4,5],"Math For CS":[3,4]},
    "Rendering Engineer": {"C Sharp":[4,5],"OOP":[4,5],"Math For CS":[4,5]},
    "Computer Vision Engineer": {"Machine Learning":[5,5],"Python":[5,5],"Probability Theory":[4,5]},
    "Digital Signal Processing (DSP) Engineer": {"Information Theory":[5,5],"C Programming":[4,5],"Math For CS":[4,5]},
    "Cybersecurity Analyst / SOC Analyst": {"Computer Networks":[5,5],"Systems":[4,5],"Information Theory":[3,4]},
    "AI Engineer / NLP Engineer": {"Machine Learning":[5,5],"Generative Ai":[5,5],"Python":[4,5]},
    "Device Driver Engineer": {"Systems":[5,5],"C Programming":[5,5]},
    "Operating Systems Developer": {"Systems":[5,5],"C Programming":[5,5],"Architecture":[4,5]},
    "Big Data Engineer": {"Databases":[5,5],"Python":[4,5],"Systems":[4,5]}
}



# Number of synthetic students generated for every job title.
samples_per_job = 50

# Seed both random sources used below so that the generated grades (and hence
# the exported CSV and the reported accuracies) are reproducible from run to
# run, in line with the fixed random_state=42 used for shuffling/splitting.
random.seed(42)
np.random.seed(42)

# A rule whose subject is not spelled exactly as in `subjects` is never
# matched by the loop below and would be dropped without notice; report such
# entries instead of ignoring them silently.
known_subjects = set(subjects)
for job, rules in jobs_rules.items():
    unknown_subjects = sorted(set(rules) - known_subjects)
    if unknown_subjects:
        print(f"Warning: rules for {job!r} reference unknown subjects (ignored): {unknown_subjects}")

data = []

for job, rules in jobs_rules.items():
    for _ in range(samples_per_job):
        # One row per student: a grade for every subject plus the job label.
        student = {}
        for subj in subjects:
            if subj in rules:
                low, high = rules[subj]
                # 80% of the time the student gets the top grade of the range;
                # otherwise any grade in [low, high] (randint's upper bound is
                # exclusive, hence high + 1).
                if random.random() < 0.8:
                    student[subj] = high
                else:
                    student[subj] = np.random.randint(low, high + 1)
            else:
                # Subject unrelated to this job: low grade, i.e. 1 or 2
                # (the upper bound 3 is exclusive).
                student[subj] = np.random.randint(1, 3)
        student["Job_Title"] = job
        data.append(student)


df = pd.DataFrame(data)
# Shuffle the rows (deterministically) so jobs are not stored in contiguous
# blocks, then renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


# -------------------------
# Prepare features & labels
# Feature matrix: one column per subject, in the order given by `subjects`.
X = df[subjects].values
# Target: job-title strings, turned into integer class ids by the encoder.
y = df["Job_Title"].values
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Split
# 80/20 hold-out split, stratified so every job keeps the same share of rows
# in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

# -------------------------
# Scale for KNN
# The scaler learns its min/max from the training rows only and is then
# applied unchanged to the test rows.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -------------------------
# KNN
# Trained and evaluated on the scaled features.
knn = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(X_train_scaled, y_train)
y_pred_knn = knn.predict(X_test_scaled)
acc_knn = accuracy_score(y_test, y_pred_knn)

# -------------------------
# Naive Bayes
# Uses the raw (unscaled) grades.
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb)

# Decision Tree
# Also uses the raw grades; random_state pins the tree construction.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)

# -------------------------
# Results
# Print the hold-out accuracy of each classifier, one line per model.
for model_label, model_accuracy in (
    ("KNN", acc_knn),
    ("Naive Bayes", acc_nb),
    ("Decision Tree", acc_dt),
):
    print(f"{model_label} Accuracy:", model_accuracy)

# Number of generated rows per job title.
job_counts = df["Job_Title"].value_counts()
print(job_counts)

# Write the generated dataset to disk and hand it to the browser via the
# Colab download helper.
filename = "dataset_subjects_ratings_jobs_maybefinal.csv"
df.to_csv(filename, index=False)
files.download(filename)

KNN Accuracy: 0.5282608695652173
Naive Bayes Accuracy: 0.8760869565217392
Decision Tree Accuracy: 0.9021739130434783
Job_Title
C# Developer                                 50
Rendering Engineer                           50
Insurance Data Assistant                     50
Backend Developer Intern                     50
AI Assistant Developer                       50
Applied Statistics Specialist                50
Network Technician                           50
Graphics Programmer                          50
Statistical Assistant                        50
Cybersecurity Analyst / SOC Analyst          50
C Programming                                50
Software Engineering Intern                  50
High-Performance Computing (HPC) Engineer    50
Big Data Engineer                            50
Cloud Engineering Intern                     50
UX Research Assistant                        50
Kernel / Systems Programmer                  50
Computer Vision Engineer                     50
Technical

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from google.colab import files

# Job information: title, short description, and monthly salary
# Kept as a flat (title, description, salary) table and expanded below into
# the mapping title -> {"Description": ..., "Salary": ...}. Row order is the
# order in which jobs appear in the exported CSV.
_JOB_CATALOG = (
    ("Data Analyst", "Collects and analyzes data to help make business decisions.", 5000),
    ("Research Assistant", "Supports research projects by collecting data and performing analysis.", 4200),
    ("Statistical Assistant", "Assists statisticians in performing calculations and interpreting results.", 4500),
    ("Data Quality Analyst", "Ensures data accuracy and integrity within databases.", 4800),
    ("Insurance Data Assistant", "Supports insurance data management and reporting tasks.", 4600),
    ("AI Assistant Developer", "Helps develop AI models and assists in machine learning tasks.", 5500),
    ("Junior Python Developer", "Writes and maintains Python code under supervision.", 5200),
    ("QA Automation Assistant", "Supports automated testing to ensure software quality.", 5000),
    ("Software Engineering Intern", "Learns and assists in software development projects.", 4000),

    ("Applied Statistics Specialist", "Applies statistical methods to solve practical problems.", 5400),
    ("Junior Statistical Analyst", "Analyzes data trends and assists in reporting findings.", 4700),
    ("Biostatistics Assistant", "Supports statistical analysis in biology or healthcare projects.", 4500),
    ("C Programmer", "Develops software using the C programming language.", 5000),
    ("Technical Product Coordinator", "Coordinates technical product development and team tasks.", 4800),
    ("C# Developer", "Writes and maintains software using C# and .NET frameworks.", 5300),
    ("Data Processing Specialist", "Processes and manages data for analysis and reporting.", 5000),
    ("Technical Workflow Coordinator", "Organizes and optimizes technical workflows and processes.", 4700),
    ("Junior Technical Consultant", "Provides technical support and consulting under supervision.", 5000),
    ("ASIC Design Engineer", "Designs application-specific integrated circuits (ASICs).", 6000),

    ("Linux Support Technician", "Maintains Linux systems and resolves technical issues.", 4800),
    ("DevOps Intern", "Assists in deployment and operations of software systems.", 4200),
    ("Network Technician", "Supports and maintains network hardware and software.", 4600),
    ("UI/UX Designer", "Designs user interfaces and improves user experiences.", 5000),
    ("Data Management Assistant", "Manages databases and ensures data integrity.", 4900),
    ("ML Model Testing Intern", "Tests machine learning models and verifies performance.", 5200),
    ("QA Automation Engineer (Entry-Level)", "Creates automated tests to ensure software quality.", 5100),
    ("Product Design Intern", "Assists in designing products and creating prototypes.", 4300),
    ("Junior Database Administrator", "Supports database maintenance and optimization tasks.", 5000),
    ("Cloud Engineering Intern", "Assists in managing cloud infrastructure and services.", 5400),
    ("UX Research Assistant", "Helps conduct user research and analyze feedback.", 4500),
    ("Backend Developer Intern", "Develops server-side code and manages databases.", 5200),

    ("Systems Engineer", "Designs and maintains complex computer systems.", 6500),
    ("Kernel / Systems Programmer", "Develops low-level system software and OS kernels.", 6300),
    ("High-Performance Computing (HPC) Engineer", "Optimizes software for high-performance computing systems.", 6800),
    ("Distributed Systems Engineer", "Builds and manages distributed software systems.", 6600),
    ("Graph Algorithms Engineer", "Designs and implements graph-based algorithms.", 6400),
    ("Verification Engineer", "Verifies hardware and software functionality.", 6100),
    ("Graphics Programmer", "Develops graphics and rendering software.", 6000),
    ("Rendering Engineer", "Works on rendering pipelines for visual applications.", 6200),
    ("Computer Vision Engineer", "Develops algorithms for image and video analysis.", 7000),
    ("Digital Signal Processing (DSP) Engineer", "Works on digital signal processing systems.", 6700),
    ("Cybersecurity Analyst / SOC Analyst", "Monitors and secures computer systems against threats.", 6400),
    ("AI Engineer / NLP Engineer", "Builds AI and natural language processing systems.", 7200),
    ("Device Driver Engineer", "Develops device drivers for hardware components.", 6500),
    ("Operating Systems Developer", "Works on OS development and system software.", 6800),
    ("Big Data Engineer", "Designs systems to process and analyze large datasets.", 7100),
)

jobs_info = {
    title: {"Description": description, "Salary": salary}
    for title, description, salary in _JOB_CATALOG
}

# Convert to DataFrame
# Built column by column; each column walks jobs_info in the same insertion
# order, so rows line up. Salaries are rendered as text such as "$5,000".
job_details = list(jobs_info.values())
df = pd.DataFrame({
    "Job Title": list(jobs_info.keys()),
    "Description": [details["Description"] for details in job_details],
    "Monthly Salary (USD)": [f"${details['Salary']:,}" for details in job_details],
})

# Save CSV
csv_filename = "career_compass_full.csv"
df.to_csv(csv_filename, index=False)

# Download CSV in Colab
files.download(csv_filename)
print("CSV saved and download initiated!")