In [6]:
# 1. Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [7]:
# 2. Load Dataset
df = pd.read_csv("jobs_dataset.csv", low_memory=False)
print("✅ Dataset loaded successfully! Rows:", len(df))

✅ Dataset loaded successfully! Rows: 735


In [8]:
# 3. Clean Columns
df.columns = df.columns.str.strip().str.lower()
df = df.loc[:, ~df.columns.str.contains('^unnamed')]
df = df.drop_duplicates()

In [9]:
# 4. Handle Missing Values
df = df.fillna("Unknown")
print(" Missing values handled.")

 Missing values handled.


In [5]:
# 5. Combine Job Type Columns
job_cols = [c for c in df.columns if 'jobtype' in c]
if job_cols:
    df['job_type'] = df[job_cols].bfill(axis=1).iloc[:, 0]
else:
    df['job_type'] = "Unknown"

df['job_type'] = df['job_type'].str.title().replace({
    'Full Time': 'Full-Time',
    'Part Time': 'Part-Time'
})

In [6]:
# 6. Clean Salary Column
def extract_salary(s):
    if pd.isna(s):
        return np.nan
    text = str(s).lower()
    if not any(k in text for k in ['$', 'year', 'month', 'hour']):
        return np.nan
    nums = re.findall(r"\d{2,7}", text.replace(",", ""))
    if not nums:
        return np.nan
    avg = np.mean(list(map(int, nums)))
    if avg < 800 or avg > 500000:
        return np.nan
    return avg

df['salary_clean'] = df['salary'].apply(extract_salary)
df = df[df['salary_clean'].notna()]

In [7]:
# 7. Salary Category
df['salary_category'] = pd.qcut(df['salary_clean'], 3, labels=['Low', 'Medium', 'High'])

In [9]:
# 8. Encode Job Type for ML
le = LabelEncoder()
df['job_type_encoded'] = le.fit_transform(df['job_type'])

In [10]:
# 9. Text Vectorization
text_features = ['description', 'positionname']
df['combined_text'] = df[text_features].astype(str).agg(' '.join, axis=1)
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_text = vectorizer.fit_transform(df['combined_text'])

In [11]:
# 10. Numeric Features
numeric_cols = ['salary_clean']
X_num = df[numeric_cols].values

In [12]:
# 11. Combine Features
from scipy.sparse import hstack
from scipy import sparse

X = hstack((X_text, sparse.csr_matrix(X_num)))
y = df['job_type_encoded']

In [13]:
# 12. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
# 13. Random Forest Model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [15]:
# 14. Evaluation
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))
print("✅ Accuracy:", round(accuracy_score(y_test, y_pred)*100, 2), "%")



📊 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.72      0.91      0.80        56
           2       0.00      0.00      0.00         2
           5       0.75      0.47      0.58        32

    accuracy                           0.73        91
   macro avg       0.37      0.34      0.35        91
weighted avg       0.71      0.73      0.70        91

✅ Accuracy: 72.53 %


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [17]:
# 15. Save Cleaned Data for Tableau
keep_cols = [
    'company', 'location', 'description', 'positionname',
    'salary', 'salary_clean', 'salary_category', 'job_type'
]
keep_cols = [c for c in keep_cols if c in df.columns]
clean_df = df[keep_cols].drop_duplicates()

clean_df.to_csv("cleaned_jobs_tableau_FINAL_PRO.csv", index=False, encoding="utf-8-sig")
print("\n Exported cleaned Tableau dataset successfully!")
print(" File: cleaned_jobs_tableau_FINAL_PRO.csv")
print(clean_df.head(10))


 Exported cleaned Tableau dataset successfully!
 File: cleaned_jobs_tableau_FINAL_PRO.csv
                         company                 location  \
0                         Google            San Bruno, CA   
1                         BAXTER      Milwaukee, WI 53214   
3                           Meta       Bellevue, WA 98005   
5                         Google            Cambridge, MA   
6                  JPMorganChase   Jacksonville, FL 32256   
7            Comcast Corporation  San Francisco, CA 94108   
8   Riverside Research Institute    Beavercreek, OH 45431   
9                    Mayo Clinic      Rochester, MN 55901   
15                   Mayo Clinic      Rochester, MN 55901   
22      Iambic Therapeutics, Inc            San Diego, CA   

                                          description  \
0   Note: By applying to this position you will ha...   
1   This is where you save and sustain lives\n\nAt...   
3   Bellevue, WA • Full Time\nMeta\nSoftware Engin...   
5   Note: