In [53]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import re
import joblib  # for saving the model

# Load your dataset
df = pd.read_csv('jobs_dataset.csv')
print(f"Dataset shape: {df.shape}")
print(df.head())

Dataset shape: (735, 16)
           company  rating             location  \
0           Google     4.3        San Bruno, CA   
1           BAXTER     3.7  Milwaukee, WI 53214   
2             Meta     4.2          Redmond, WA   
3             Meta     4.2   Bellevue, WA 98005   
4  Lockheed Martin     4.0    Shelton, CT 06484   

                                        positionName     min_salary  \
0    Senior Data Scientist, Research, YouTube Search  166000.000000   
1                Senior AI Engineer - Data Scientist  112000.000000   
2         Audio Software Engineer, Applied Scientist  146993.600000   
3                Software Engineer, Machine Learning  203350.000000   
4  AI / Machine Learning Research Engineer (early...  141767.402667   

   max_salary  average_salary  \
0   244000.00   205000.000000   
1   154000.00   133000.000000   
2   146993.60   146993.600000   
3   240240.00   221795.000000   
4   236245.72   189006.561333   

                                          

In [54]:
# More comprehensive mapping based on your dataset
job_category_mapping = {
    "Data Scientist": [
        "Data Scientist", "Senior Data Scientist", "Principal Data Scientist", 
        "Lead Data Scientist", "Staff Data Scientist", "Data Scientist III", 
        "Data Scientist II", "Data Scientist I", "Associate Data Scientist",
        "Clinical Data Scientist", "Research Data Scientist", "Language Data Scientist",
        "Natural Language Data Scientist", "Power Systems Data Scientist", 
        "Battery Data Scientist", "DNA Data Scientist", "Digital Health Data Scientist",
        "Enterprise Data Scientist", "Palantir Data Scientist", "Data Science Analyst",
        "Analytics Scientist", "Decision Scientist", "Data Science Consultant",
        "Data Science Manager", "Data Science Engineer", "Data Science Senior Specialist"
    ],
    "Machine Learning Engineer": [
        "Machine Learning Engineer", "ML Engineer", "Senior Machine Learning Engineer",
        "Principal Machine Learning Engineer", "Lead Machine Learning Engineer",
        "Staff Machine Learning Engineer", "Machine Learning Engineer III",
        "Early Career Machine Learning Engineer", "Machine Learning Software Engineering Manager",
        "ML/AI Engineer", "Machine Learning Ops Engineer", "ML Engineering"
    ],
    "AI Engineer": [
        "AI Engineer", "Senior AI Engineer", "AI/ML Engineer", "AI Developer",
        "Generative AI Software Engineer", "GenAI Engineer", "LLM Engineer",
        "Full Stack AI Engineer", "AI Software Engineer", "AI Infrastructure Engineer",
        "AI Workflow Engineer", "AI Operations Engineer", "AI Platform Engineer"
    ],
    "Research Scientist": [
        "Research Scientist", "Machine Learning Scientist", "AI Researcher",
        "Applied Scientist", "Research Engineer", "Staff AI Researcher",
        "Principal AI Scientist", "Research Statistician", "AI/ML Researcher",
        "Research Data Scientist", "AI Scientist"
    ],
    "Data Analyst": [
        "Data Analyst", "Senior Data Analyst", "Lead Data Analyst", "Business Data Analyst",
        "Clinical Data Analyst", "Commercial Data Analyst", "Fraud Data Analyst",
        "Healthcare Data Analyst", "Statistical Analyst", "Website Data Analyst",
        "Marketing Data Analyst", "Operations Data Analyst", "Audit Data Analyst",
        "Systems Data Analyst", "Data Analytics Lead", "Data Analytics Consultant"
    ],
    "AI/ML Leadership": [
        "Head of AI", "Head of Data & AI", "Director of Data Science", 
        "Vice President, Data Science", "Manager, Data Science", 
        "Manager, Machine Learning Operations", "AI Strategy Manager",
        "Lead AI/ML Engineer", "Principal Data Scientist", "Director, AI Solutions",
        "AI Manager", "Head of Robotics & Applied AI", "Executive Director, ML",
        "Senior Manager, Data Science", "AI & Data Manager"
    ],
    "AI Architect": [
        "AI Architect", "AI Solutions Architect", "Applied AI Architect",
        "GenAI Architect", "Machine Learning Architect", "Deep Learning Architect",
        "Data & AI Architect", "ML/AI Operations Architect"
    ],
    "Software Engineer - AI/ML": [
        "Software Engineer, Machine Learning", "Software Engineer, AI",
        "Senior Software Engineer - AI/ML", "AI Software Engineering Tech Lead",
        "Backend Software Engineer, AI", "Fullstack Software Engineer, AI",
        "Software Engineer, Machine Learning Infra", "Software Development Engineer, AI",
        "Audio Software Engineer, Applied Scientist", "Software Engineer, Generative AI"
    ],
    "Generative AI Specialist": [
        "Prompt Engineer", "GenAI Engineer", "LLM Engineer", 
        "Generative AI Software Engineer", "GenAI Technology Lead",
        "Lead Generative AI Engineer", "GenAI Specialist", "AI Tutor"
    ],
    "Data Engineer": [
        "Data Engineer", "Senior Data Engineer", "AI Data Engineer",
        "Data & AI Engineer", "Lead Data Engineer", "Data Processing Platform Engineer",
        "Data Engineering Manager", "Data Platform Engineer"
    ]
}

In [55]:
def create_training_data(mapping_dict):
    """Create training data from the mapping dictionary"""
    training_data = []
    
    for category, titles in mapping_dict.items():
        for title in titles:
            training_data.append({
                'positionName': title,
                'position_category': category
            })
    
    return pd.DataFrame(training_data)

# Create training dataframe
training_df = create_training_data(job_category_mapping)
print(f"Created {len(training_df)} training examples")
print(f"Categories: {training_df['position_category'].value_counts()}")

Created 127 training examples
Categories: position_category
Data Scientist               26
Data Analyst                 16
AI/ML Leadership             15
AI Engineer                  13
Machine Learning Engineer    12
Research Scientist           11
Software Engineer - AI/ML    10
AI Architect                  8
Generative AI Specialist      8
Data Engineer                 8
Name: count, dtype: int64


In [56]:
def extract_features(text):
    """Extract features from job titles"""
    text = str(text).lower()
    
    features = {
        'length': len(text),
        'word_count': len(text.split()),
        'has_senior': 1 if 'senior' in text else 0,
        'has_lead': 1 if 'lead' in text else 0,
        'has_principal': 1 if 'principal' in text else 0,
        'has_staff': 1 if 'staff' in text else 0,
        'has_manager': 1 if 'manager' in text else 0,
        'has_director': 1 if 'director' in text else 0,
        'has_head': 1 if 'head' in text else 0,
        'has_engineer': 1 if 'engineer' in text else 0,
        'has_scientist': 1 if 'scientist' in text else 0,
        'has_analyst': 1 if 'analyst' in text else 0,
        'has_research': 1 if 'research' in text else 0,
        'has_applied': 1 if 'applied' in text else 0,
        'has_ai': 1 if 'ai' in text or 'artificial intelligence' in text else 0,
        'has_ml': 1 if 'machine learning' in text or ' ml ' in text else 0,
        'has_data': 1 if 'data' in text else 0,
        'has_architect': 1 if 'architect' in text else 0,
        'has_software': 1 if 'software' in text else 0,
        'has_genai': 1 if 'genai' in text or 'generative' in text else 0,
        'has_llm': 1 if 'llm' in text or 'large language' in text else 0,
        'has_prompt': 1 if 'prompt' in text else 0
    }
    
    return features

def prepare_features(df, tfidf_vectorizer=None, fit_vectorizer=False):
    """Prepare features for the model"""
    # Extract structured features
    feature_list = []
    for title in df['positionName']:
        feature_list.append(extract_features(title))
    
    structured_features = pd.DataFrame(feature_list)
    
    # TF-IDF features from text
    if fit_vectorizer:
        tfidf = TfidfVectorizer(
            max_features=150,
            stop_words='english',
            ngram_range=(1, 3),
            min_df=1,
            max_df=0.8
        )
        tfidf_features = tfidf.fit_transform(df['positionName'])
    else:
        tfidf_features = tfidf_vectorizer.transform(df['positionName'])
        tfidf = tfidf_vectorizer
    
    tfidf_df = pd.DataFrame(
        tfidf_features.toarray(), 
        columns=[f"tfidf_{i}" for i in range(tfidf_features.shape[1])]
    )
    
    # Combine all features
    all_features = pd.concat([structured_features, tfidf_df], axis=1)
    return all_features, tfidf

In [57]:
def train_random_forest(X, y):
    """Train Random Forest classifier"""
    rf_model = RandomForestClassifier(
        n_estimators=200,
        max_depth=15,
        min_samples_split=3,
        min_samples_leaf=1,
        random_state=42,
        class_weight='balanced',
        n_jobs=-1  # Use all available cores
    )
    
    rf_model.fit(X, y)
    return rf_model

# Prepare features for training data
X_train, tfidf_vectorizer = prepare_features(training_df, fit_vectorizer=True)
y_train = training_df['position_category']

print(f"Training features shape: {X_train.shape}")

# Train the model
rf_model = train_random_forest(X_train, y_train)

# Check training accuracy
train_predictions = rf_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"Training Accuracy: {train_accuracy:.3f}")

Training features shape: (127, 172)
Training Accuracy: 0.961
Training Accuracy: 0.961


In [58]:
def predict_job_categories(df, model, tfidf_vectorizer):
    """Predict job categories for the entire dataset"""
    # Prepare features for prediction
    X_pred, _ = prepare_features(df, tfidf_vectorizer=tfidf_vectorizer, fit_vectorizer=False)
    
    # Ensure feature alignment
    X_pred = X_pred.reindex(columns=X_train.columns, fill_value=0)
    
    # Predict categories
    predictions = model.predict(X_pred)
    
    return predictions

# Apply the model to your dataset
predictions = predict_job_categories(df, rf_model, tfidf_vectorizer)

# Add predictions to your dataframe
df['position_category'] = predictions

print("\nFirst 10 predictions:")
print(df[['positionName', 'position_category']].head(10))


First 10 predictions:
                                        positionName  \
0    Senior Data Scientist, Research, YouTube Search   
1                Senior AI Engineer - Data Scientist   
2         Audio Software Engineer, Applied Scientist   
3                Software Engineer, Machine Learning   
4  AI / Machine Learning Research Engineer (early...   
5   Staff Data Scientist, Research, Search Platforms   
6     Applied AI/ML, Senior Associate - Gen AI & LLM   
7          Executive Director, ML and Ad Marketplace   
8                                     AI/ML Engineer   
9                   Senior Software Engineer - AI/ML   

           position_category  
0             Data Scientist  
1             Data Scientist  
2  Software Engineer - AI/ML  
3  Software Engineer - AI/ML  
4  Machine Learning Engineer  
5             Data Scientist  
6           AI/ML Leadership  
7           AI/ML Leadership  
8                AI Engineer  
9  Software Engineer - AI/ML  


In [59]:
# Analyze the results
print("\nCategory Distribution:")
category_counts = df['position_category'].value_counts()
print(category_counts)

print(f"\nTotal jobs categorized: {len(df)}")
print(f"Number of categories: {len(category_counts)}")


Category Distribution:
position_category
Data Scientist               234
AI/ML Leadership             146
Machine Learning Engineer     80
AI Engineer                   72
Data Analyst                  62
Research Scientist            48
Software Engineer - AI/ML     43
AI Architect                  19
Generative AI Specialist      16
Data Engineer                 15
Name: count, dtype: int64

Total jobs categorized: 735
Number of categories: 10


In [60]:
df

Unnamed: 0,company,rating,location,positionName,min_salary,max_salary,average_salary,skills,skill_categories,skills_count_all,skills_count_single,total_skills,city,company_encoded,position_encoded,city_encoded,position_category
0,Google,4.3,"San Bruno, CA","Senior Data Scientist, Research, YouTube Search",166000.000000,244000.0000,205000.000000,"['python', 'r', 'statistics', 'data science', ...","[0, 0, 1, 2, 4, 5, 10, 10, 10]",9,"[2, 1, 1, 0, 1, 1, 0, 0, 0, 0, 3]",9,San Bruno,174,427,167,Data Scientist
1,BAXTER,3.7,"Milwaukee, WI 53214",Senior AI Engineer - Data Scientist,112000.000000,154000.0000,133000.000000,"['python', 'scala', 'optimization', 'machine l...","[0, 0, 1, 2, 2, 2, 2, 4, 4, 6, 7, 7, 7, 10]",14,"[2, 1, 4, 0, 2, 0, 1, 3, 0, 0, 1]",14,Milwaukee,52,374,116,Data Scientist
2,Meta,4.2,"Redmond, WA","Audio Software Engineer, Applied Scientist",146993.600000,146993.6000,146993.600000,"['c', 'c++', 'machine learning', 'generative a...","[0, 0, 2, 2, 2, 10, 10, 10]",8,"[2, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3]",8,Redmond,255,103,151,Software Engineer - AI/ML
3,Meta,4.2,"Bellevue, WA 98005","Software Engineer, Machine Learning",203350.000000,240240.0000,221795.000000,"['python', 'java', 'c', 'c#', 'c++', 'haskell'...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...",34,"[12, 3, 6, 3, 4, 3, 1, 0, 0, 0, 2]",34,Bellevue,255,490,24,Software Engineer - AI/ML
4,Lockheed Martin,4.0,"Shelton, CT 06484",AI / Machine Learning Research Engineer (early...,141767.402667,236245.7200,189006.561333,"['python', 'c', 'c++', 'go', 'linux', 'machine...","[0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, ...",27,"[5, 0, 4, 5, 1, 0, 2, 0, 2, 6, 2]",27,Shelton,230,10,176,Machine Learning Engineer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
730,Citi,3.9,"Tampa, FL 33601",VP - Regulatory Reporting Ld Analyst / Data Sc...,103920.000000,155880.0000,129900.000000,"['python', 'optimization', 'machine learning',...","[0, 1, 2, 2, 2, 5, 7, 10, 10, 10]",10,"[1, 1, 3, 0, 0, 1, 0, 1, 0, 0, 3]",10,Tampa,93,546,184,Data Scientist
731,Vanguard,3.6,"Malvern, PA","Machine Learning Engineer, Specialist",125289.208000,197900.0160,161594.612000,"['python', 'statistics', 'machine learning', '...","[0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 5, 6, 6, ...",19,"[1, 1, 8, 0, 2, 1, 3, 0, 2, 0, 1]",19,Malvern,402,292,106,Machine Learning Engineer
732,Vanguard,3.6,"Charlotte, NC","Domain Architect- AI/ML, Senior Specialist",122380.865600,213591.4500,167986.157800,"['regression', 'machine learning', 'ai/ml', 'a...","[1, 2, 2, 2, 6, 8, 10]",7,"[0, 1, 3, 0, 0, 0, 1, 0, 1, 0, 1]",7,Charlotte,402,206,42,AI Architect
733,Guidehouse,3.3,"Huntsville, AL 35806",Data Analytics Consultant,125614.668200,171024.9208,148319.794500,"['python', 'r', 'ai/ml', 'data science', 'etl'...","[0, 0, 2, 2, 4, 4, 4, 10]",8,"[2, 0, 2, 0, 3, 0, 0, 0, 0, 0, 1]",8,Huntsville,177,122,85,Data Analyst


## ML

🔧 Creating features...
🎯 Final features: (588, 94)
🎯 Final features: (588, 94)

🚀 RANDOM FOREST PERFORMANCE
Accuracy: 0.871
Test samples: 147

📊 Classification Report:
                           precision    recall  f1-score   support

             AI Architect       1.00      1.00      1.00         4
              AI Engineer       0.83      0.71      0.77        14
         AI/ML Leadership       0.94      1.00      0.97        29
             Data Analyst       0.86      1.00      0.92        12
            Data Engineer       0.50      0.33      0.40         3
           Data Scientist       0.96      0.91      0.93        47
 Generative AI Specialist       1.00      0.67      0.80         3
Machine Learning Engineer       0.61      0.69      0.65        16
       Research Scientist       1.00      0.80      0.89        10
Software Engineer - AI/ML       0.73      0.89      0.80         9

                 accuracy                           0.87       147
                macro avg 