In [30]:
# STEP 1: Install required package to read Excel
!pip install openpyxl

# STEP 2: Upload the file
from google.colab import files
uploaded = files.upload()




In [32]:
import pandas as pd

# Load your CSV
df = pd.read_csv("AI-based Career Recommendation System.csv")

# Show column names clearly
print("Columns:", df.columns.tolist())


Columns: ['CandidateID', 'Name', 'Age', 'Education', 'Skills', 'Interests', 'Recommended_Career', 'Recommendation_Score']


In [33]:
# Strip spaces and convert to lowercase
df.columns = df.columns.str.strip().str.lower()

# Check again
print("Cleaned columns:", df.columns.tolist())


Cleaned columns: ['candidateid', 'name', 'age', 'education', 'skills', 'interests', 'recommended_career', 'recommendation_score']


In [24]:
# STEP 3: Load the Excel file
import pandas as pd

df = pd.read_excel("Dataset Project 404.xlsx")
print("Original columns:", df.columns.tolist())



Original columns: ['Sr.No.', 'Course', 'Job profession', 'Student', 'Linguistic', 'Musical', 'Bodily', 'Logical - Mathematical', 'Spatial-Visualization', 'Interpersonal', 'Intrapersonal', 'Naturalist', 's/p', 'P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8']


In [25]:
# STEP 4: Clean column names and rename 'interst' → 'interests'
df.columns = df.columns.str.strip().str.lower()
df.rename(columns={'interst': 'interests'}, inplace=True)
print("Cleaned columns:", df.columns.tolist())


Cleaned columns: ['sr.no.', 'course', 'job profession', 'student', 'linguistic', 'musical', 'bodily', 'logical - mathematical', 'spatial-visualization', 'interpersonal', 'intrapersonal', 'naturalist', 's/p', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8']


In [34]:
# STEP 5: Fill missing values in 'skills' and 'interests'
# Fill missing values
df['skills'] = df['skills'].fillna("")
df['interests'] = df['interests'].fillna("")
df['recommended_career'] = df['recommended_career'].fillna("")

# Create features column
df['features'] = df['skills'] + " " + df['interests']

# Drop missing careers
df.dropna(subset=['recommended_career'], inplace=True)

# Preview
df[['skills', 'interests', 'features', 'recommended_career']].head()


Unnamed: 0,skills,interests,features,recommended_career
0,Python;Data Analysis;Machine Learning,Technology;Data Science,Python;Data Analysis;Machine Learning Technolo...,Data Scientist
1,Java;System Design;Cloud Computing,Software Development;AI,Java;System Design;Cloud Computing Software De...,Software Engineer
2,Graphic Design;UI/UX;Adobe Creative Suite,Arts;Digital Media,Graphic Design;UI/UX;Adobe Creative Suite Arts...,UX Designer
3,Python;Deep Learning;Statistics,Healthcare;AI,Python;Deep Learning;Statistics Healthcare;AI,AI Researcher
4,Project Management;Communication;Agile,Business;Management,Project Management;Communication;Agile Busines...,Project Manager


In [38]:
# STEP 6: Label encode the career column
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['career_label'] = le.fit_transform(df['recommended_career'])

# Optional: see what numbers mean
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label mapping:", label_map)



Label mapping: {'AI Researcher': np.int64(0), 'AI Specialist': np.int64(1), 'Automation Engineer': np.int64(2), 'Backend Developer': np.int64(3), 'Biostatistician': np.int64(4), 'Business Analyst': np.int64(5), 'Cloud Engineer': np.int64(6), 'Content Strategist': np.int64(7), 'Cybersecurity Analyst': np.int64(8), 'Cybersecurity Specialist': np.int64(9), 'Data Analyst': np.int64(10), 'Data Engineer': np.int64(11), 'Data Scientist': np.int64(12), 'Deep Learning Engineer': np.int64(13), 'DevOps Engineer': np.int64(14), 'Digital Marketer': np.int64(15), 'Embedded Systems Engineer': np.int64(16), 'Financial Analyst': np.int64(17), 'Front-end Developer': np.int64(18), 'Full Stack Developer': np.int64(19), 'Graphic Designer': np.int64(20), 'Machine Learning Engineer': np.int64(21), 'Marketing Manager': np.int64(22), 'Mobile Developer': np.int64(23), 'NLP Engineer': np.int64(24), 'Project Manager': np.int64(25), 'Research Analyst': np.int64(26), 'Research Scientist': np.int64(27), 'Software De

In [39]:
# STEP 7: Convert text to features using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(df['features'])   # input features
y = df['career_label']                 # encoded target labels



In [40]:
# STEP 8: Train-Test Split & Model Training
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           2       0.00      0.00      0.00         1
           3       1.00      1.00      1.00         1
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.50      1.00      0.67         1
           8       0.00      0.00      0.00         3
          10       0.27      0.75      0.40         4
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         1
          15       1.00      0.60      0.75         5
          16       1.00      1.00      1.00         1
          17       0.00      0.00      0.00         2
          18       0.50      1.00      0.67         1
          19       0.00      0.00      0.00         1
          20       1.00      0.50      0.67         2
          21       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [42]:
import joblib

# Save model and vectorizer
joblib.dump(model, "career_model.pkl")
joblib.dump(cv, "cv.pkl")
joblib.dump(le, "label_encoder.pkl")  # also save label encoder


['label_encoder.pkl']