In [1]:
import pandas as pd 
import numpy as np
import matplotlib as plt 

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("career_dataset.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,...,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,Salary/work,hard/smart worker,worked in teams ever?,Suggested Job Role
0,0,69,63,78,87,94,94,87,84,61,...,no,Prayer books,salary,no,stubborn,Management,salary,hard worker,yes,Database Developer
1,1,78,62,73,60,71,70,73,84,91,...,yes,Childrens,salary,yes,gentle,Technical,salary,hard worker,no,Database Administrator
2,2,71,86,91,87,61,81,72,72,94,...,yes,Travel,Work,no,stubborn,Management,work,hard worker,no,Database Administrator
3,3,76,87,60,84,89,73,62,88,69,...,no,Romance,Work,yes,gentle,Management,work,smart worker,yes,Database Developer
4,4,92,62,90,67,71,89,73,71,73,...,yes,Cookbooks,salary,no,stubborn,Management,work,hard worker,yes,Systems Analyst


In [3]:
df.drop(columns=['Salary/work'], inplace=True)
df.drop(columns=['Unnamed: 0'], inplace=True)
df.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Taken inputs from seniors or elders,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,hard/smart worker,worked in teams ever?,Suggested Job Role
0,69,63,78,87,94,94,87,84,61,9,...,no,no,Prayer books,salary,no,stubborn,Management,hard worker,yes,Database Developer
1,78,62,73,60,71,70,73,84,91,12,...,yes,yes,Childrens,salary,yes,gentle,Technical,hard worker,no,Database Administrator
2,71,86,91,87,61,81,72,72,94,11,...,yes,yes,Travel,Work,no,stubborn,Management,hard worker,no,Database Administrator
3,76,87,60,84,89,73,62,88,69,7,...,no,no,Romance,Work,yes,gentle,Management,smart worker,yes,Database Developer
4,92,62,90,67,71,89,73,71,73,4,...,no,yes,Cookbooks,salary,no,stubborn,Management,hard worker,yes,Systems Analyst


In [4]:
# Replace 'Work' with 'work' in the 'Salary Range Expected' column
df['Salary Range Expected'] = df['Salary Range Expected'].replace('Work', 'work')
df.head()

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Taken inputs from seniors or elders,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,hard/smart worker,worked in teams ever?,Suggested Job Role
0,69,63,78,87,94,94,87,84,61,9,...,no,no,Prayer books,salary,no,stubborn,Management,hard worker,yes,Database Developer
1,78,62,73,60,71,70,73,84,91,12,...,yes,yes,Childrens,salary,yes,gentle,Technical,hard worker,no,Database Administrator
2,71,86,91,87,61,81,72,72,94,11,...,yes,yes,Travel,work,no,stubborn,Management,hard worker,no,Database Administrator
3,76,87,60,84,89,73,62,88,69,7,...,no,no,Romance,work,yes,gentle,Management,smart worker,yes,Database Developer
4,92,62,90,67,71,89,73,71,73,4,...,no,yes,Cookbooks,salary,no,stubborn,Management,hard worker,yes,Systems Analyst


In [5]:
# Extract and print unique values in the 'Suggested Job Role' column
unique_values = df['Suggested Job Role'].unique()

print("Unique values in 'Suggested Job Role':")
for value in unique_values:
    print(value)

Unique values in 'Suggested Job Role':
Database Developer
Database Administrator
Systems Analyst
Software Engineer
Solutions Architect
Design & UX
Information Security Analyst
E-Commerce Analyst
Network Security Engineer


In [6]:
df.shape

(20000, 37)

In [7]:
percentage_null_rows = (df.isnull().any(axis=1).mean()) * 100

In [8]:
print(percentage_null_rows)

0.0


In [9]:
# Identify columns with categorical values (dtype == 'object')
categorical_columns_new = df.select_dtypes(include=['object']).columns

# Print the names of categorical columns
print("Columns with categorical values:")
for col in categorical_columns_new:
    print(col)

Columns with categorical values:
can work long time before system?
self-learning capability?
Extra-courses did
certifications
workshops
talenttests taken?
olympiads
reading and writing skills
memory capability score
Interested subjects
Introvert
Job/Higher Studies?
Type of company want to settle in?
Taken inputs from seniors or elders
interested in games
Interested Type of Books
Salary Range Expected
In a Realtionship?
Gentle or Tuff behaviour?
Management or Technical
hard/smart worker
worked in teams ever?
Suggested Job Role


In [10]:
# Encode categorical columns
label_encoders = {}
categorical_columns = categorical_columns_new

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [11]:
# Encode target variable ('Suggested Job Role')
target_encoder = LabelEncoder()
df['Suggested Job Role'] = target_encoder.fit_transform(df['Suggested Job Role'])

# Separate features and target
X = df.drop(columns=['Suggested Job Role'])
y = df['Suggested Job Role']

In [12]:
X

Unnamed: 0,Acedamic percentage in Operating Systems,percentage in Algorithms,Percentage in Programming Concepts,Percentage in Software Engineering,Percentage in Computer Networks,Percentage in Electronics Subjects,Percentage in Computer Architecture,Percentage in Mathematics,Percentage in Communication skills,Hours working per day,...,Type of company want to settle in?,Taken inputs from seniors or elders,interested in games,Interested Type of Books,Salary Range Expected,In a Realtionship?,Gentle or Tuff behaviour?,Management or Technical,hard/smart worker,worked in teams ever?
0,69,63,78,87,94,94,87,84,61,9,...,8,0,0,21,0,0,1,0,0,1
1,78,62,73,60,71,70,73,84,91,12,...,4,1,1,5,0,1,0,1,0,0
2,71,86,91,87,61,81,72,72,94,11,...,5,1,1,29,1,0,1,0,0,0
3,76,87,60,84,89,73,62,88,69,7,...,7,0,0,23,1,1,0,0,1,1
4,92,62,90,67,71,89,73,71,73,4,...,9,0,1,7,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,83,67,62,63,81,74,90,84,69,4,...,7,0,1,17,1,1,1,0,1,1
19996,80,69,83,87,82,66,66,92,67,7,...,3,1,1,7,0,0,1,0,0,0
19997,83,70,80,87,64,85,69,94,88,7,...,4,1,1,10,1,1,0,1,0,0
19998,68,87,91,88,66,74,61,87,61,5,...,1,1,0,29,1,0,0,0,1,1


In [13]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Train the model (Random Forest Classifier)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [15]:
# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.17725
Classification Report:
              precision    recall  f1-score   support

           0       0.09      0.01      0.01       375
           1       0.17      0.28      0.21       706
           2       0.00      0.00      0.00       219
           3       0.00      0.00      0.00       232
           4       0.00      0.00      0.00       216
           5       0.12      0.05      0.07       459
           6       0.20      0.54      0.29       810
           7       0.12      0.02      0.04       453
           8       0.11      0.07      0.09       530

    accuracy                           0.18      4000
   macro avg       0.09      0.11      0.08      4000
weighted avg       0.12      0.18      0.12      4000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Predict a new student's career
new_student = {
    'Acedamic percentage in Operating Systems': 80,
    'percentage in Algorithms': 85,
    'Percentage in Programming Concepts': 90,
    'Percentage in Software Engineering': 88,
    'Percentage in Computer Networks': 75,
    'Percentage in Electronics Subjects': 70,
    'Percentage in Computer Architecture': 80,
    'Percentage in Mathematics': 85,
    'Percentage in Communication skills': 90,
    'Hours working per day': 8,
    'Taken inputs from seniors or elders': 1,  # Encoded value for 'yes'
    'interested in games': 0,  # Encoded value for 'no'
    'Interested Type of Books': 2,  # Encoded value for 'Travel'
    'Salary Range Expected': 1,  # Encoded value for 'salary'
    'In a Realtionship?': 0,  # Encoded value for 'no'
    'Gentle or Tuff behaviour?': 1,  # Encoded value for 'gentle'
    'Management or Technical': 1,  # Encoded value for 'Technical'
    'hard/smart worker': 0,  # Encoded value for 'hard worker'
    'worked in teams ever?': 1  # Encoded value for 'yes'
}

In [17]:
# Convert to DataFrame
new_student_df = pd.DataFrame([new_student])

# Check if the column names match
missing_columns = set(X.columns) - set(new_student_df.columns)
extra_columns = set(new_student_df.columns) - set(X.columns)

if missing_columns:
    print("Missing columns in new student data:", missing_columns)
if extra_columns:
    print("Extra columns in new student data:", extra_columns)

# If there are no differences, print a success message
if not missing_columns and not extra_columns:
    print("All columns match!")

Missing columns in new student data: {'workshops', 'olympiads', 'memory capability score', 'Logical quotient rating', 'public speaking points', 'can work long time before system?', 'reading and writing skills', 'hackathons', 'Interested subjects', 'coding skills rating', 'talenttests taken?', 'Extra-courses did', 'Job/Higher Studies?', 'certifications', 'Type of company want to settle in?', 'Introvert', 'self-learning capability?'}


In [18]:
# Columns that are missing from the new student data
missing_columns = {'Introvert', 'hackathons', 'self-learning capability?', 'coding skills rating', 
                   'Interested subjects', 'olympiads', 'Extra-courses did', 'public speaking points', 
                   'certifications', 'talenttests taken?', 'Logical quotient rating', 'Type of company want to settle in?', 
                   'workshops', 'Job/Higher Studies?', 'can work long time before system?', 'memory capability score', 
                   'reading and writing skills'}

# Add the missing columns with default values (e.g., 0 or 'Unknown')
for col in missing_columns:
    new_student[col] = 0  # Default value for numerical columns, or you could use 'Unknown' for categorical columns

# Convert the new student's input data to a DataFrame (updated)
new_student_df = pd.DataFrame([new_student])

# Recheck if all columns match after adding missing ones
missing_columns = set(X.columns) - set(new_student_df.columns)
extra_columns = set(new_student_df.columns) - set(X.columns)

if missing_columns:
    print("Missing columns in new student data:", missing_columns)
if extra_columns:
    print("Extra columns in new student data:", extra_columns)

# If there are no differences, print a success message
if not missing_columns and not extra_columns:
    print("All columns match!")

All columns match!


Label for Job Roles are:

[1] Database Developer

[2] Database Administrator

[3] Systems Analyst

[4] Software Engineer

[5] Solutions Architect

[6] Design & UX

[7] Information Security Analyst

[8] E-Commerce Analyst

[9] Network Security Engineer

In [19]:
# Reorder the new student's DataFrame columns to match the order of columns in X (training data)
new_student_df = new_student_df[X.columns]

# Predict the career role based on the reordered data
predicted_role = model.predict(new_student_df)

# Print the predicted role
print("Predicted Job Role:", predicted_role)

Predicted Job Role: [6]
