# Installations

In [1]:
!pip install pandas scikit-learn joblib




[notice] A new release of pip is available: 25.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd

In [5]:
# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = "PathWise.csv"

# Load the raw table; every later cell reads its columns from `df`.
df = pd.read_csv(DATA_PATH)

In [6]:
# Show the first five rows as a quick sanity check of the loaded columns.
df.head(n=5)

Unnamed: 0,interests,stream,education,skills,time,role
0,Web Development,CSE,M.Tech,"MongoDB,Java",8 hrs,Backend Developer
1,Networking,EEE,Diploma,"Linux,Networking",3 hrs,Cybersecurity Analyst
2,DevOps,Mechanical,BCA,"AWS,CI/CD,Linux",4 hrs,Cloud Engineer
3,DevOps,IT,B.Tech,"Linux,AWS,CI/CD",4 hrs,Cloud Engineer
4,Data Science,CSE,BCA,"Excel,Pandas",10 hrs,Data Analyst


# Data Preprocessing

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer


def split_labels(cell):
    """Turn a comma-separated cell such as "AWS,CI/CD,Linux" into a list of labels.

    Values that are already lists are returned unchanged, so re-running this
    cell does not corrupt the columns. Missing values become an empty list.
    """
    if isinstance(cell, list):
        return cell
    if pd.isna(cell):
        return []
    return [label.strip() for label in str(cell).split(",") if label.strip()]


# NOTE(review): the cells that originally produced these objects (execution
# counts 7-10) are no longer in the notebook, so a fresh "Restart & Run All"
# failed with NameError on interests_encoded / skills_encoded / mlb_*.
# This step is reconstructed from the names used further down - confirm that
# it matches the preprocessing the saved model was trained with.
df["interests"] = df["interests"].apply(split_labels)
df["skills"] = df["skills"].apply(split_labels)

# Multi-hot encode the list-valued columns; the fitted binarizers are saved
# together with the model at the end of the notebook.
mlb_interests = MultiLabelBinarizer()
mlb_skills = MultiLabelBinarizer()
interests_encoded = mlb_interests.fit_transform(df["interests"])
skills_encoded = mlb_skills.fit_transform(df["skills"])

# Sanity check: each cell of the column should now be a Python list.
print(type(df["interests"][0]))

<class 'list'>


In [12]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each single-valued categorical column with its own encoder.
# The fitted encoders are kept because they are saved alongside the model.
le_stream = LabelEncoder()
stream_encoded = le_stream.fit_transform(df["stream"])

le_education = LabelEncoder()
education_encoded = le_education.fit_transform(df["education"])

# NOTE(review): "time" holds strings such as "8 hrs" / "10 hrs"; the encoder
# orders classes as strings, so "10 hrs" sorts before "3 hrs" - confirm that
# this ordering is acceptable for the model.
le_time = LabelEncoder()
time_encoded = le_time.fit_transform(df["time"])

In [13]:
# Fit the target encoder on the role column, then map each role to its integer id.
# `le_role.classes_` is used later to label the evaluation report.
le_role = LabelEncoder().fit(df["role"])
role_encoded = le_role.transform(df["role"])

# Combining Features

In [14]:
import numpy as np

# The label-encoded columns are 1-D; give each a trailing axis so they can be
# stacked side by side with the 2-D interests/skills blocks.
single_value_columns = [
    encoded[:, np.newaxis]
    for encoded in (stream_encoded, education_encoded, time_encoded)
]

# Feature order: interests, skills, stream, education, time.
X = np.hstack([interests_encoded, skills_encoded, *single_value_columns])

# Target: integer-encoded role.
y = role_encoded


# Training the model

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier


# Shared seed so both the split and the forest are reproducible.
SEED = 42
TEST_FRACTION = 0.2

# Hold out a fraction of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SEED
)

# Training the RandomForest model (100 trees) on the training split only.
model = RandomForestClassifier(n_estimators=100, random_state=SEED)
model.fit(X_train, y_train)


# Evaluating the model

In [16]:
from sklearn.metrics import classification_report

# Predict roles for the held-out split.
y_pred = model.predict(X_test)

# Pass every encoded label explicitly. Without `labels`, the report infers the
# classes from y_test/y_pred and raises ValueError when a role is absent from
# both, because the inferred class count no longer matches `target_names`.
role_labels = list(range(len(le_role.classes_)))
print(
    classification_report(
        y_test,
        y_pred,
        labels=role_labels,
        target_names=le_role.classes_,
    )
)

                       precision    recall  f1-score   support

    Backend Developer       1.00      1.00      1.00         7
       Cloud Engineer       1.00      1.00      1.00        11
Cybersecurity Analyst       1.00      1.00      1.00        12
         Data Analyst       1.00      1.00      1.00        16
   Frontend Developer       1.00      1.00      1.00        11
 Full Stack Developer       1.00      1.00      1.00        14
          ML Engineer       1.00      1.00      1.00        14
       UI/UX Designer       1.00      1.00      1.00        15

             accuracy                           1.00       100
            macro avg       1.00      1.00      1.00       100
         weighted avg       1.00      1.00      1.00       100



In [17]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validation over the full feature matrix, reporting the mean of
# the per-fold scores.
scores = cross_val_score(model, X, y, cv=5)
print("Cross-Validation Accuracy: ", scores.mean())


Cross-Validation Accuracy:  0.99


In [18]:
import joblib

# Everything needed to encode new inputs and decode predictions, keyed by the
# output filename. Insertion order is the order the files are written in.
artifacts = {
    "career_model.pkl": model,
    "label_encoder_role.pkl": le_role,
    "label_encoder_stream.pkl": le_stream,
    "label_encoder_education.pkl": le_education,
    "label_encoder_time.pkl": le_time,
    "mlb_interests.pkl": mlb_interests,
    "mlb_skills.pkl": mlb_skills,
}

# joblib.dump returns the list of filenames it wrote for each object.
written_files = [joblib.dump(obj, filename) for filename, obj in artifacts.items()]

# Display the result of the last dump, as the cell did originally.
written_files[-1]


['mlb_skills.pkl']