In [6]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [7]:
df = pd.read_csv(r"student-scores.csv")

df.head()

Unnamed: 0,id,first_name,last_name,email,gender,part_time_job,absence_days,extracurricular_activities,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,1,Paul,Casey,paul.casey.1@gslingacademy.com,male,False,3,False,27,Lawyer,73,81,93,97,63,80,87
1,2,Danielle,Sandoval,danielle.sandoval.2@gslingacademy.com,female,False,2,False,47,Doctor,90,86,96,100,90,88,90
2,3,Tina,Andrews,tina.andrews.3@gslingacademy.com,female,False,9,True,13,Government Officer,81,97,95,96,65,77,94
3,4,Tara,Clark,tara.clark.4@gslingacademy.com,female,False,5,False,3,Artist,71,74,88,80,89,63,86
4,5,Anthony,Campos,anthony.campos.5@gslingacademy.com,male,False,5,False,10,Unknown,84,77,65,65,80,74,76


In [128]:
df.dtypes

id                             int64
first_name                    object
last_name                     object
email                         object
gender                        object
part_time_job                   bool
absence_days                   int64
extracurricular_activities      bool
weekly_self_study_hours        int64
career_aspiration             object
math_score                     int64
history_score                  int64
physics_score                  int64
chemistry_score                int64
biology_score                  int64
english_score                  int64
geography_score                int64
dtype: object

In [129]:
df.isnull().sum()

id                            0
first_name                    0
last_name                     0
email                         0
gender                        0
part_time_job                 0
absence_days                  0
extracurricular_activities    0
weekly_self_study_hours       0
career_aspiration             0
math_score                    0
history_score                 0
physics_score                 0
chemistry_score               0
biology_score                 0
english_score                 0
geography_score               0
dtype: int64

In [9]:

df= df.drop(columns=[ "first_name", "last_name", "email",'id','gender' , 'part_time_job', 'extracurricular_activities', 'absence_days'], errors='ignore')


In [66]:
# df['total_score'] = df[['math_score', 'history_score', 'physics_score', 
#                        'chemistry_score', 'biology_score', 'english_score', 
#                        'geography_score']].sum(axis=1)

# # "Science Score" (for STEM career recommendations)
# df['science_score'] = df[['physics_score', 'chemistry_score', 'biology_score']].mean(axis=1)

# # "Arts Score" (for humanities/arts careers)
# df['arts_score'] = df[['history_score', 'english_score', 'geography_score']].mean(axis=1)

In [10]:
df.head()

Unnamed: 0,weekly_self_study_hours,career_aspiration,math_score,history_score,physics_score,chemistry_score,biology_score,english_score,geography_score
0,27,Lawyer,73,81,93,97,63,80,87
1,47,Doctor,90,86,96,100,90,88,90
2,13,Government Officer,81,97,95,96,65,77,94
3,3,Artist,71,74,88,80,89,63,86
4,10,Unknown,84,77,65,65,80,74,76


In [11]:
df['career_aspiration'].unique()

array(['Lawyer', 'Doctor', 'Government Officer', 'Artist', 'Unknown',
       'Software Engineer', 'Teacher', 'Business Owner', 'Scientist',
       'Banker', 'Writer', 'Accountant', 'Designer',
       'Construction Engineer', 'Game Developer', 'Stock Investor',
       'Real Estate Developer'], dtype=object)

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from scipy.stats import randint, uniform
import time
import joblib

# Load your dataset
# df = pd.read_csv("your_data.csv")

# ======================================================================
# STEP 1: DATA PREPROCESSING (FASTER IMPLEMENTATION)
# ======================================================================

# Clean career aspirations with vectorized operations
df["career_aspiration"] = (df["career_aspiration"]
                          .str.strip()
                          .str.title()
                          .str.replace(r'[^a-zA-Z\s]', '', regex=True))

# Enhanced career mapping with fallback
career_group_mapping = {
    # Business
    "Accountant": "Business", "Banker": "Business", "Business Owner": "Business",
    "Entrepreneur": "Business", "Stock Investor": "Business", "Finance": "Business",
    "Financial Analyst": "Business", "Investment Banker": "Business", "Wealth Manager": "Business",
    
    # Creative
    "Artist": "Creative", "Designer": "Creative", "Content Creator": "Creative",
    
    # Engineering
    "Engineer": "Engineering", "Civil Engineer": "Engineering",
    
    # Public Services
    "Lawyer": "Public Services", "Teacher": "Public Services",
    
    # Science
    "Doctor": "Science", "Scientist": "Science",
    
    # Technology
    "Software Engineer": "Technology", "Developer": "Technology", 
    "Data Scientist": "Technology","Machine Learning Engineer": "Technology",
    "DevOps Engineer": "Technology", "Frontend Developer": "Technology",
}

# Vectorized mapping with fallback
df["career_group"] = df["career_aspiration"].map(career_group_mapping).fillna("Other")

# ======================================================================
# STEP 2: OPTIMIZED FEATURE ENGINEERING
# ======================================================================


# Select features and target
X = df.drop(columns=["career_aspiration", "career_group"])
y = df["career_group"]

# Impute missing values (faster strategy)
imputer = SimpleImputer(strategy="most_frequent")
X_imputed = imputer.fit_transform(X)

# Balance classes (optimized SMOTE)
smote = SMOTE(sampling_strategy='not majority', k_neighbors=3, random_state=42)
X_res, y_res = smote.fit_resample(X_imputed, y)

# Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_res, y_res, test_size=0.2, random_state=42, stratify=y_res
)

# ======================================================================
# STEP 3: RANDOMIZED SEARCH (FASTER THAN GRIDSEARCH)
# ======================================================================

# Define parameter distributions
param_dist = {
    'n_estimators': randint(50, 300),       # Random integers between 50-300
    'max_depth': randint(5, 30),            # 5-30 levels
    'min_samples_split': randint(2, 10),    # 2-10 samples to split
    'min_samples_leaf': randint(1, 5),      # 1-5 samples per leaf
    'max_features': ['sqrt', 'log2'],       # Categorical
    'bootstrap': [True, False]              # Categorical
}

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=50,               # Number of parameter combinations to try
    cv=3,                    # Faster 3-fold CV
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,               # Use all cores
    verbose=1
)

# Run search with timing
print("Starting randomized search...")
start_time = time.time()
random_search.fit(X_train, y_train)
print(f"Optimization completed in {time.time()-start_time:.2f} seconds")

# Best model
best_rf = random_search.best_estimator_
print("\nBest Parameters:", random_search.best_params_)
print(f"Best CV Accuracy: {random_search.best_score_:.2f}")

# ======================================================================
# STEP 4: EVALUATION
# ======================================================================

# Predictions
y_pred = best_rf.predict(X_test)

# Enhanced classification report
print("\nOptimized Classification Report:")
print(classification_report(y_test, y_pred, digits=3))
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.3f}")


Starting randomized search...
Fitting 3 folds for each of 50 candidates, totalling 150 fits
Optimization completed in 65.75 seconds

Best Parameters: {'bootstrap': False, 'max_depth': 22, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 102}
Best CV Accuracy: 0.78

Optimized Classification Report:
                 precision    recall  f1-score   support

       Business      0.780     0.526     0.628       135
       Creative      0.842     0.911     0.875       135
          Other      0.724     0.654     0.687       136
Public Services      0.838     0.912     0.873       136
        Science      0.930     0.971     0.950       136
     Technology      0.742     0.896     0.812       135

       accuracy                          0.812       813
      macro avg      0.809     0.812     0.804       813
   weighted avg      0.809     0.812     0.804       813

Test Accuracy: 0.812


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE


# Step 3: Clean 'career_aspiration'
df["career_aspiration"] = df["career_aspiration"].str.strip()

# Step 4: Define mapping
career_group_mapping = {
    # Business
    "Accountant": "Business",
    "Banker": "Business",
    "Business Owner": "Business",
    "Entrepreneur": "Business",
    "Stock Investor": "Business",
    "Real Estate Developer": "Business",
    "Finance": "Business",  # Catch-all for finance roles
    "Manager": "Business",

    # Creative
    "Artist": "Creative",
    "Designer": "Creative",
    "Graphic Designer": "Creative",
    "Writer": "Creative",
    "Musician": "Creative",
    "Content Creator": "Creative",

    # Engineering
    "Construction Engineer": "Engineering",
    "Civil Engineer": "Engineering",
    "Mechanical Engineer": "Engineering",
    "Engineer": "Engineering",  # Generic fallback

    # Public Services
    "Lawyer": "Public Services",
    "Government Officer": "Public Services",
    "Teacher": "Public Services",
    "Professor": "Public Services",
    "Police Officer": "Public Services",

    # Science
    "Doctor": "Science",
    "Scientist": "Science",
    "Biologist": "Science",
    "Researcher": "Science",
    "Chemist": "Science",

    # Technology
    "Software Engineer": "Technology",
    "Software Developer": "Technology",
    "Game Developer": "Technology",
    "Data Scientist": "Technology",
    "AI Engineer": "Technology",
    "Programmer": "Technology",
    "Web Developer": "Technology",
    "IT Specialist": "Technology",

    # Other/Catch-all
    "Unknown": "Other",
    "Student": "Other",
    "Freelancer": "Other"
}
# career_group_mapping = {
#     "Accountant": "Business",
#     "Banker": "Business",
#     "Business Owner": "Business",
#     "Stock Investor": "Business",
#     "Real Estate Developer": "Business",

#     "Artist": "Creative",
#     "Designer": "Creative",
#     "Writer": "Creative",

#     "Construction Engineer": "Engineering",

#     "Lawyer": "Public Services",
#     "Government Officer": "Public Services",
#     "Teacher": "Public Services",

#     "Doctor": "Science",
#     "Scientist": "Science",

#     "Software Engineer": "Technology",
#     "Game Developer": "Technology",

#     "Unknown": "Other"
# }


# # Step 5: Map career group
# df["career_group"] = df["career_aspiration"].map(career_group_mapping)

# # Step 6: Drop rows where mapping failed
# df = df.dropna(subset=["career_group"])

# if df.empty:
#     raise ValueError("After mapping careers, no valid samples remain. Check your mapping or dataset.")

# # Step 8: Define features and target
# X = df.drop(columns=["career_aspiration", "career_group"])
# y = df["career_group"]

# # Step 9: Handle missing values
# imputer = SimpleImputer(strategy="mean")
# X_imputed = imputer.fit_transform(X)

# # Step 10: Apply SMOTE to balance classes
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_imputed, y)

# # Step 11: Train/test split
# X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# # STEP 3: MODEL TUNING WITH GRIDSEARCH
# # ======================================================================

# # Parameter grid for tuning
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20, 30, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2']
# }

# # Initialize and fit GridSearchCV
# rf = RandomForestClassifier(random_state=42)
# grid_search = GridSearchCV(
#     estimator=rf,
#     param_grid=param_grid,
#     cv=5,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=1
# )
# grid_search.fit(X_train, y_train)

# # Get best model
# best_model = grid_search.best_estimator_
# print(f"\nBest Parameters: {grid_search.best_params_}")
# print(f"Best CV Accuracy: {grid_search.best_score_:.2f}")

# # ======================================================================
# # STEP 4: EVALUATION
# # ======================================================================

# # Predictions
# y_pred = best_model.predict(X_test)

# # Classification report
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))
# print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.2f}")

# Step 12: Train model
model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
model.fit(X_train, y_train)

# Step 13: Predict and evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.80

Classification Report:

                 precision    recall  f1-score   support

       Business       0.84      0.47      0.60       135
       Creative       0.80      0.91      0.85       135
          Other       0.70      0.60      0.65       136
Public Services       0.84      0.91      0.88       136
        Science       0.90      0.97      0.93       136
     Technology       0.71      0.91      0.80       135

       accuracy                           0.80       813
      macro avg       0.80      0.80      0.78       813
   weighted avg       0.80      0.80      0.78       813



In [8]:

feat_importance = model.feature_importances_
# feat_df = pd.DataFrame({'feature': X.columns, 'importance': feat_importance})
# feat_df = feat_df.sort_values(by='importance', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x='importance', y='feature', data=feat_df)
plt.title("Feature Importance")
plt.tight_layout()
plt.show()


NameError: name 'model' is not defined

In [105]:
df['career_aspiration'].value_counts()


career_aspiration
Software Engineer        315
Business Owner           309
Unknown                  223
Banker                   169
Lawyer                   138
Accountant               126
Doctor                   119
Real Estate Developer     83
Stock Investor            73
Construction Engineer     68
Artist                    67
Game Developer            63
Government Officer        61
Teacher                   59
Designer                  56
Scientist                 39
Writer                    32
Name: count, dtype: int64

In [4]:
career_mapping = {
    'Doctor': {
        'weights': {
            'biology_score': 0.40,
            'chemistry_score': 0.35,
            'math_score': 0.15,
            'english_score': 0.10
        },
        'threshold': 78,
        'description': 'Requires strong life sciences and analytical skills'
    },

    'Software Engineer': {
        'weights': {
            'math_score': 0.45,
            'physics_score': 0.25,
            'computer_score': 0.20,
            'english_score': 0.10
        },
        'threshold': 72,
        'description': 'Focuses on programming and problem-solving abilities'
    },

    'Data Scientist': {
        'weights': {
            'math_score': 0.50,
            'computer_score': 0.30,
            'statistics_score': 0.15,
            'english_score': 0.05
        },
        'threshold': 74,
        'description': 'Combines math, programming, and analytical skills'
    },

    'Electrical Engineer': {
        'weights': {
            'math_score': 0.40,
            'physics_score': 0.35,
            'computer_score': 0.15,
            'english_score': 0.10
        },
        'threshold': 68,
        'description': 'Focuses on electronics, circuits, and physics.'
    },

    'Mechanical Engineer': {
        'weights': {
            'math_score': 0.40,
            'physics_score': 0.40,
            'english_score': 0.10,
            'history_score': 0.10
        },
        'threshold': 66,
        'description': 'Requires knowledge of mechanics, materials, and physics.'
    },

    'Pharmacist': {
        'weights': {
            'biology_score': 0.40,
            'chemistry_score': 0.40,
            'math_score': 0.10,
            'english_score': 0.10
        },
        'threshold': 65,
        'description': 'Requires strong knowledge of chemistry and biology for medicine preparation.'
    },

    'Marketing Manager': {
        'weights': {
            'economics_score': 0.40,
            'english_score': 0.30,
            'history_score': 0.20,
            'computer_score': 0.10
        },
        'threshold': 60,
        'description': 'Focuses on business strategy, analytics, and marketing campaigns.'
    },

    'Lawyer': {
        'weights': {
            'english_score': 0.40,
            'history_score': 0.30,
            'geography_score': 0.20,
            'economics_score': 0.10
        },
        'threshold': 70,
        'description': 'Requires strong legal knowledge and reasoning skills.'
    },

    'Film Director': {
        'weights': {
            'art_score': 0.40,
            'english_score': 0.30,
            'history_score': 0.20,
            'computer_score': 0.10
        },
        'threshold': 55,
        'description': 'Focuses on storytelling, creativity, and film production.'
    },

    'Investment Banker': {
        'weights': {
            'math_score': 0.40,
            'economics_score': 0.30,
            'english_score': 0.20,
            'history_score': 0.10
        },
        'threshold': 76,
        'description': 'Requires quantitative and communication skills'
    },

    'Graphic Designer': {
        'weights': {
            'art_score': 0.50,
            'computer_score': 0.30,
            'english_score': 0.15,
            'history_score': 0.05
        },
        'threshold': 58,
        'description': 'Emphasizes visual creativity and technical skills'
    },

    'University Professor (STEM)': {
        'weights': {
            'math_score': 0.30,
            'physics_score': 0.30,
            'english_score': 0.20,
            'research_score': 0.20
        },
        'threshold': 80,
        'description': 'Requires deep subject knowledge and teaching ability'
    },

    'Civil Engineer': {
        'weights': {
            'physics_score': 0.35,
            'math_score': 0.35,
            'geography_score': 0.20,
            'english_score': 0.10
        },
        'threshold': 68,
        'description': 'Combines physics, math, and spatial reasoning'
    },

    'AI/Machine Learning Specialist': {
        'weights': {
            'math_score': 0.45,
            'computer_score': 0.35,
            'statistics_score': 0.15,
            'english_score': 0.05
        },
        'threshold': 75,
        'description': 'Requires advanced math and programming skills'
    },

    'Environmental Policy Analyst': {
        'weights': {
            'biology_score': 0.30,
            'geography_score': 0.30,
            'english_score': 0.25,
            'history_score': 0.15
        },
        'threshold': 62,
        'description': 'Combines science with policy analysis'
    }
}


In [5]:
def recommend_careers_from_scores(student_scores, career_mapping, top_k=3):
    recommendations = []

    for career, details in career_mapping.items():
        weights = details['weights']
        threshold = details.get('threshold', 0)
        
        # Weighted score calculation
        weighted_score = sum(
            student_scores.get(subject, 0) * weight 
            for subject, weight in weights.items()
        )
        normalized_score = weighted_score / sum(weights.values())

        # Append all (even below threshold), then sort and filter
        recommendations.append((career, normalized_score, details['description']))

    # Sort by score descending
    recommendations.sort(key=lambda x: x[1], reverse=True)

    top_recommendations = recommendations[:top_k]
    
    return {
        "recommended_careers": [c[0] for c in top_recommendations],
        "career_scores": [round(c[1], 2) for c in top_recommendations],
        "descriptions": [c[2] for c in top_recommendations],
    }


In [16]:
# def full_recommender(student_scores, model, features, career_mapping, scaler=None):
#     import pandas as pd

#     # Prepare input
#     user_df = pd.DataFrame([student_scores], columns=features)

#     # Scale input if scaler is provided
#     if scaler:
#         user_df = pd.DataFrame(scaler.transform(user_df), columns=features)

#     # Predict career using trained classifier
#     predicted_career = model.predict(user_df)[0]

#     # Recommend learning path based on model prediction
#     learning_path = "Customized Path: Explore structured resources aligned to " + predicted_career

#     # Identify weak subjects
#     weak_subjects = [subject for subject, score in student_scores.items() if score < 60]

#     # Evaluate compatibility with other careers (for top 3 recommendations)
#     recommended_careers = []
#     for career, details in career_mapping.items():
#         weighted_score = sum(
#             student_scores.get(subject, 0) * weight
#             for subject, weight in details['weights'].items()
#         )
#         avg_score = weighted_score / sum(details['weights'].values())

#         if avg_score >= details['threshold'] - 5:
#             recommended_careers.append((career, avg_score, details['description']))

#     # Sort and pick top 3
#     recommended_careers.sort(key=lambda x: x[1], reverse=True)
#     top_careers = recommended_careers[:3] if recommended_careers else []

#     # Return result dictionary
#     return {
#         "predicted_career": predicted_career,
#         "recommended_path": learning_path,
#         "recommended_careers": [c[0] for c in top_careers] if top_careers else ["No strong match found"],
#         "career_scores": [round(c[1], 2) for c in top_careers] if top_careers else ["< 40%"],
#         "descriptions": [c[2] for c in top_careers] if top_careers else ["Consider improving subject scores."],
#         "weak_subjects": weak_subjects
#     }


In [7]:

student_scores = {
    "biology_score": 95,
    "chemistry_score": 80,
    "math_score": 95,
    "english_score": 88,
    "physics_score": 90,
    "computer_score": 92,
    "statistics_score": 84,
    "economics_score": 56,
    "history_score": 60,
    "geography_score": 65
}

# Run the recommendation
# result = full_recommender(student_scores, model, features, career_mapping)
result = recommend_careers_from_scores(student_scores, career_mapping)
print(result)


{'recommended_careers': ['Software Engineer', 'Data Scientist', 'Electrical Engineer'], 'career_scores': [92.45, 92.1, 92.1], 'descriptions': ['Focuses on programming and problem-solving abilities', 'Combines math, programming, and analytical skills', 'Focuses on electronics, circuits, and physics.']}


In [32]:
import joblib

joblib.dump(model, 'learning_path_model.pkl')
joblib.dump(career_mapping, 'career_mapping.pkl')

['career_mapping.pkl']