In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [2]:
# Load the dataset
file_path = 'Job_ML_Filtered.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Display dataset structure
print(data.head())
print(data.info())


   Job Role     Experience Qualifications Salary Range    location  \
0         1  5 to 15 Years         M.Tech    $59K-$99K     Douglas   
1         2  2 to 12 Years            BCA   $56K-$116K    Ashgabat   
2         3  0 to 12 Years            PhD   $61K-$104K       Macao   
3         4  4 to 11 Years            PhD    $65K-$91K  Porto-Novo   
4         5  1 to 12 Years            MBA    $64K-$87K    Santiago   

            Country  Work Type  Company Size Job Posting Date Preference  ...  \
0       Isle of Man     Intern         26801       2022-04-24     Female  ...   
1      Turkmenistan     Intern        100340       2022-12-19     Female  ...   
2  Macao SAR, China  Temporary         84525       2022-09-14       Male  ...   
3             Benin  Full-Time        129896       2023-02-25     Female  ...   
4             Chile     Intern         53944       2022-10-11     Female  ...   

                 Contact                     Job Title  \
0   001-381-930-7517x737  Digital 

In [9]:
def extract_avg_salary(salary):
    try:
        salary = salary.replace('$', '').replace('K', '').split('-')
        # Convert to integer and calculate average
        return (int(salary[0]) + int(salary[1])) / 2
    except:
        return np.nan

# Apply the function to the dataset
data['Salary Range'] = data['Salary Range'].apply(extract_avg_salary)


In [10]:
#Encode categorical variables:
label_encoders = {}
for col in ['Qualifications', 'Work Type']:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le


In [11]:
#Vectorize skills using TF-IDF:
tfidf = TfidfVectorizer(max_features=500)
skills_tfidf = tfidf.fit_transform(data['skills'])


In [12]:
#4. Combine Features and Define the Target
# Combine numeric and text features
X = np.hstack([
    data[['Experience', 'Qualifications', 'Salary Range', 'Work Type']].fillna(0).values,
    skills_tfidf.toarray()
])

# Encode the target variable (Role)
label_encoder_role = LabelEncoder()
y = label_encoder_role.fit_transform(data['Role'])


In [13]:
#5. Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
#6. Train a Machine Learning Model
# Train Random Forest Classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


In [17]:
#7. Evaluate the Model
# Predict on the test set
# Generate classification report
labels = np.unique(y)  # All possible classes in the target variable
print(classification_report(y_test, y_pred, labels=labels))



              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      1.00      1.00         1
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       1.00      1.00      1.00         1
           9       0.00      0.00      0.00         1
          10       1.00      1.00      1.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       1.00      1.00      1.00         2
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         0
          17       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
#8. Save Preprocessing and Model (Optional)
import pickle

# Save the trained model and encoders
pickle.dump(model, open('ml_model.pkl', 'wb'))
pickle.dump(label_encoder_role, open('role_encoder.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf_vectorizer.pkl', 'wb'))
