#Exploratory Data Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
import re
from fuzzywuzzy import process, fuzz

# Load the expanded major-recommendation dataset.
# The path is relative, so the CSV must be in the current working directory.
df = pd.read_csv('LU_Major_Recommendation_Dataset_New.csv')

# Convert string lists to actual lists
def parse_string_list(x):
    """Split a delimited string into a list of trimmed, non-empty items.

    Args:
        x: A cell value. Strings are split on commas and semicolons.

    Returns:
        A list of stripped, non-empty items for string input, an empty
        list for a missing value (``None`` or NaN), and any other value
        unchanged.
    """
    if isinstance(x, str):
        stripped = (piece.strip() for piece in re.split(r'[,;]', x))
        return [piece for piece in stripped if piece]
    # Empty CSV cells are read as NaN (a float that is not equal to
    # itself). Returning [] keeps the column uniformly list-valued, so
    # code that iterates over each cell does not fail on a float.
    if x is None or (isinstance(x, float) and x != x):
        return []
    return x

# Turn the delimited text columns into Python lists
for list_column in ('Skills', 'Courses and Certificates'):
    df[list_column] = df[list_column].apply(parse_string_list)

# Cast the six RIASEC indicator columns to integers in a single pass
riasec_columns = ['R', 'I', 'A', 'S', 'E', 'C']
df = df.astype(dict.fromkeys(riasec_columns, int))

# Separate passion and goals (and drop goals as requested)
def extract_passion(text):
    """Return the passion part of a combined "passion, goals" value.

    The passion is the text before the first comma, with surrounding
    whitespace removed. Non-string values are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    passion, _, _ = text.partition(',')
    return passion.strip()

df['Passion'] = df['Passion and Goals'].apply(extract_passion)
df = df.drop('Passion and Goals', axis=1)


def _collect_vocabulary(column):
    """Return the sorted unique items found across a column of lists."""
    vocabulary = set()
    for items in column:
        # Missing cells are not lists (e.g. NaN floats); skip them rather
        # than letting set.update() fail on a non-iterable value.
        if isinstance(items, (list, tuple, set)):
            vocabulary.update(items)
    return sorted(vocabulary)


# Create master lists for NLP matching.
# The lists are sorted so their order is the same on every run: the
# prediction code falls back to the first entry of the work-style and
# passion lists, and an unordered set would make that fallback arbitrary.
all_skills = _collect_vocabulary(df['Skills'])
all_courses = _collect_vocabulary(df['Courses and Certificates'])

# Missing values are dropped so the lists contain only real labels.
all_passions = sorted(df['Passion'].dropna().unique())
all_work_styles = sorted(df['Work Style'].dropna().unique())

# Save these master lists for later use in prediction
import joblib
joblib.dump(all_skills, 'master_skills.pkl')
joblib.dump(all_courses, 'master_courses.pkl')
joblib.dump(all_passions, 'master_passions.pkl')
joblib.dump(all_work_styles, 'master_work_styles.pkl')

# Build the feature matrix from five feature groups.

# 1. RIASEC indicators are used exactly as stored in the frame
X_riasec = df[riasec_columns]

# 2. Skills: multi-hot encoding of each row's skill list
mlb_skills = MultiLabelBinarizer()
X_skills = mlb_skills.fit_transform(df['Skills'])

# 3. Courses: multi-hot encoding of each row's course/certificate list
mlb_courses = MultiLabelBinarizer()
X_courses = mlb_courses.fit_transform(df['Courses and Certificates'])

# 4. Work style: dense one-hot encoding
ohe_work_style = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_work_style = ohe_work_style.fit_transform(df[['Work Style']])

# 5. Passion: dense one-hot encoding
ohe_passion = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_passion = ohe_passion.fit_transform(df[['Passion']])

# Persist every fitted encoder so prediction can reproduce the same columns
for fitted_encoder, encoder_path in (
    (mlb_skills, 'mlb_skills.pkl'),
    (mlb_courses, 'mlb_courses.pkl'),
    (ohe_work_style, 'ohe_work_style.pkl'),
    (ohe_passion, 'ohe_passion.pkl'),
):
    joblib.dump(fitted_encoder, encoder_path)

# Column order: RIASEC, skills, courses, work style, passion
X = np.concatenate(
    (X_riasec, X_skills, X_courses, X_work_style, X_passion),
    axis=1,
)

# Encode each target column as integer class ids, one encoder per target
le_major = LabelEncoder()
y_major = le_major.fit_transform(df['Program/Major'])

le_faculty = LabelEncoder()
y_faculty = le_faculty.fit_transform(df['Faculty'])

le_degree = LabelEncoder()
y_degree = le_degree.fit_transform(df['Typical Degree'])

le_campus = LabelEncoder()
y_campus = le_campus.fit_transform(df['Primary Campus'])

# One column per target, in the order major, faculty, degree, campus
y = np.stack([y_major, y_faculty, y_degree, y_campus], axis=1)

# Persist the encoders so predictions can be decoded back to labels
joblib.dump(le_major, 'le_major.pkl')
joblib.dump(le_faculty, 'le_faculty.pkl')
joblib.dump(le_degree, 'le_degree.pkl')
joblib.dump(le_campus, 'le_campus.pkl')

['le_campus.pkl']

In [None]:
def process_user_text_input(user_input, master_list, threshold=70):
    """Fuzzy-match free text against a master list of known items.

    The whole input is matched first; then every word longer than three
    characters (taken from the lower-cased input) is matched on its own.
    Each query contributes at most its single best candidate.

    Args:
        user_input: Free text typed by the user.
        master_list: Candidate items to match against.
        threshold: Minimum ``fuzz.partial_ratio`` score a candidate must
            reach to be kept.

    Returns:
        The matched items without duplicates, in the order they were
        first found, so a whole-input match (if any) comes first. An
        empty list is returned when ``user_input`` is empty or not a
        string, or when ``master_list`` is empty.
    """
    if not user_input or not isinstance(user_input, str):
        return []
    # With nothing to match against, extractOne has no result to unpack.
    if master_list is None or len(master_list) == 0:
        return []

    words = re.findall(r'\b\w+\b', user_input.lower())
    # dict.fromkeys drops repeated queries while keeping their order, so
    # a word that appears twice is only scored once.
    queries = list(dict.fromkeys(
        [user_input] + [word for word in words if len(word) > 3]
    ))

    # A dict is used as an ordered set: callers take the first element as
    # the preferred match, so the order must not depend on set hashing.
    matches = {}
    for query in queries:
        found = process.extractOne(query, master_list, scorer=fuzz.partial_ratio)
        if found is None:
            continue
        best_match, score = found[0], found[1]
        if score >= threshold:
            matches.setdefault(best_match, None)

    return list(matches)

def prepare_user_input(user_data):
    """Turn raw user answers into a single feature row for the model.

    ``user_data`` is a dictionary with these keys:
    - riasec: dict keyed by R, I, A, S, E, C (a missing letter counts as 0)
    - skills_text: free text describing the user's skills
    - courses_text: free text describing the user's courses
    - work_style: the selected work style
    - passion_text: free text describing the user's passion

    Returns:
        A tuple ``(X_user, info)``. ``X_user`` is the horizontally stacked
        feature row (RIASEC, skills, courses, work style, passion) and
        ``info`` holds the items detected in the free-text fields under
        ``detected_skills``, ``detected_courses`` and ``detected_passion``.
    """
    # Vocabularies saved at training time
    skill_vocab = joblib.load('master_skills.pkl')
    course_vocab = joblib.load('master_courses.pkl')
    passion_vocab = joblib.load('master_passions.pkl')
    work_style_vocab = joblib.load('master_work_styles.pkl')

    # Fitted encoders saved at training time
    skill_encoder = joblib.load('mlb_skills.pkl')
    course_encoder = joblib.load('mlb_courses.pkl')
    work_style_encoder = joblib.load('ohe_work_style.pkl')
    passion_encoder = joblib.load('ohe_passion.pkl')

    # RIASEC answers as one row, in fixed letter order
    riasec_row = [
        user_data['riasec'].get(letter, 0)
        for letter in ('R', 'I', 'A', 'S', 'E', 'C')
    ]
    riasec_block = np.array([riasec_row])

    # Skills: fuzzy-match the free text, then multi-hot encode the matches
    found_skills = process_user_text_input(user_data['skills_text'], skill_vocab)
    skill_block = skill_encoder.transform([found_skills])

    # Courses: same treatment as skills
    found_courses = process_user_text_input(user_data['courses_text'], course_vocab)
    course_block = course_encoder.transform([found_courses])

    # Work style: an unrecognised value falls back to the first saved
    # work style (which is not necessarily the most common one)
    chosen_style = user_data['work_style']
    if chosen_style not in work_style_vocab:
        chosen_style = work_style_vocab[0]
    work_style_block = work_style_encoder.transform([[chosen_style]])

    # Passion: take the first fuzzy match, else the first saved passion
    found_passions = process_user_text_input(user_data['passion_text'], passion_vocab)
    if found_passions:
        chosen_passion = found_passions[0]
    else:
        chosen_passion = passion_vocab[0]
    passion_block = passion_encoder.transform([[chosen_passion]])

    feature_row = np.hstack([
        riasec_block,
        skill_block,
        course_block,
        work_style_block,
        passion_block,
    ])
    detected = {
        'detected_skills': found_skills,
        'detected_courses': found_courses,
        'detected_passion': chosen_passion,
    }
    return feature_row, detected

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y[:, 0]  # stratify by major
)

# Train a MultiOutput Classifier: the random forest is fitted once per
# target column of y
model = MultiOutputClassifier(RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    class_weight='balanced'
))
model.fit(X_train, y_train)

# Bias vs Variance insight
print("\nIf Training Accuracy >> Test Accuracy → High Variance (overfitting)")
print("If Training Accuracy and Test Accuracy are both low → High Bias (underfitting)")
print("If both are high and close → Good Generalization")

# Predict once per split; y_pred is kept as an alias of the test
# predictions because the evaluation-report cell reads it.
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)
y_pred = Y_test_pred

# accuracy_score cannot be applied to the full multi-output y in one call
# (that attempt raised an error), so each target is scored separately.
target_names = ['Major', 'Faculty', 'Degree', 'Campus']
for i, name in enumerate(target_names):
    train_accuracy = accuracy_score(y_train[:, i], Y_train_pred[:, i])
    test_accuracy = accuracy_score(y_test[:, i], Y_test_pred[:, i])

    print(f"Training Accuracy: {train_accuracy:.2f}")
    print(f"Test Accuracy: {test_accuracy:.2f}")
    print(f"{name} Accuracy: {test_accuracy:.4f}")

# Save the model
joblib.dump(model, 'major_recommendation_model.pkl')


If Training Accuracy >> Test Accuracy → High Variance (overfitting)
If Training Accuracy and Test Accuracy are both low → High Bias (underfitting)
If both are high and close → Good Generalization
Training Accuracy: 1.00
Test Accuracy: 0.96
Major Accuracy: 0.9569
Training Accuracy: 0.99
Test Accuracy: 0.97
Faculty Accuracy: 0.9741
Training Accuracy: 0.98
Test Accuracy: 0.93
Degree Accuracy: 0.9310
Training Accuracy: 0.95
Test Accuracy: 0.92
Campus Accuracy: 0.9224


['major_recommendation_model.pkl']

#After checking the training and test accuracy, we see that both are high and close to each other for every target. This indicates low bias and low variance, so the model shows no sign of overfitting or underfitting.

In [None]:
from sklearn.metrics import classification_report, f1_score

# Per-target classification report and macro F1 on the test split
target_names = ['Major', 'Faculty', 'Degree', 'Campus']
for i, name in enumerate(target_names):
    actual, predicted = y_test[:, i], y_pred[:, i]

    print(f"\n{name} Evaluation Report:")
    print(classification_report(actual, predicted, digits=4))

    # Macro F1 averages the per-class F1 scores with equal weight, so
    # small classes count as much as large ones.
    macro_f1 = f1_score(actual, predicted, average='macro')
    print(f"{name} Macro F1 Score: {macro_f1:.4f}")



Major Evaluation Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000         3
           1     1.0000    0.7500    0.8571         4
           2     1.0000    1.0000    1.0000         4
           3     1.0000    1.0000    1.0000         4
           4     1.0000    1.0000    1.0000         3
           5     1.0000    1.0000    1.0000         4
           6     1.0000    1.0000    1.0000         4
           7     1.0000    1.0000    1.0000         4
           8     1.0000    1.0000    1.0000         3
           9     1.0000    1.0000    1.0000         3
          10     0.8000    1.0000    0.8889         4
          11     1.0000    0.7500    0.8571         4
          12     1.0000    1.0000    1.0000         3
          13     1.0000    0.7500    0.8571         4
          14     1.0000    1.0000    1.0000         4
          15     1.0000    1.0000    1.0000         3
          16     1.0000    1.0000    1.0000         3
 

In [None]:
def predict_major(user_data):
    """Predict major, faculty, degree and campus for one user.

    Args:
        user_data: Dictionary of raw answers, passed unchanged to
            ``prepare_user_input``.

    Returns:
        A dictionary with the decoded ``major``, ``faculty``, ``degree``
        and ``campus`` labels and the ``detected_info`` returned by
        ``prepare_user_input``. When the model exposes ``predict_proba``
        it also contains ``top_recommendations``: up to three
        ``(major, probability)`` pairs, highest probability first.
    """
    # Load model and encoders
    model = joblib.load('major_recommendation_model.pkl')
    le_major = joblib.load('le_major.pkl')
    le_faculty = joblib.load('le_faculty.pkl')
    le_degree = joblib.load('le_degree.pkl')
    le_campus = joblib.load('le_campus.pkl')

    # Prepare user input
    X_user, detected_info = prepare_user_input(user_data)

    # One row in, one row out: columns are major, faculty, degree, campus
    prediction = model.predict(X_user)[0]

    # Decode predictions
    result = {
        'major': le_major.inverse_transform([prediction[0]])[0],
        'faculty': le_faculty.inverse_transform([prediction[1]])[0],
        'degree': le_degree.inverse_transform([prediction[2]])[0],
        'campus': le_campus.inverse_transform([prediction[3]])[0],
        'detected_info': detected_info
    }

    # Get probabilities for top recommendations
    if hasattr(model, 'predict_proba'):
        # Only the first estimator (the major target) is needed here.
        major_estimator = model.estimators_[0]
        major_scores = major_estimator.predict_proba(X_user)[0]
        # predict_proba columns follow the estimator's own classes_, which
        # only cover labels present in its training data. Decoding those
        # ids keeps every probability paired with the right major even if
        # the encoder knows more classes than the estimator saw.
        major_names = le_major.inverse_transform(major_estimator.classes_)
        ranked = sorted(
            zip(major_names, major_scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        result['top_recommendations'] = ranked[:3]  # Top 3 majors

    return result

In [None]:
# Example user input
user_data = {
    'riasec': {'R': 1, 'I': 1, 'A': 0, 'S': 0, 'E': 0, 'C': 1},
    'skills_text': "I know Python, Java, and some database management",
    'courses_text': "AP Computer Science, web development bootcamp",
    'work_style': "Office/Data",
    'passion_text': "I love technology and building software"
}

# Get prediction
result = predict_major(user_data)
print("Predicted Major:", result['major'])
print("Faculty:", result['faculty'])
# Each label matches the value it prints: degree and campus are shown
# under their own names, and the detected items come from detected_info.
print("Degree:", result['degree'])
print("Campus:", result['campus'])
print("Detected Skills:", result['detected_info']['detected_skills'])
print("Detected Courses:", result['detected_info']['detected_courses'])


Predicted Major: Computer Science
Faculty: Faculty of Sciences
Detected Skills: Distributed
Detected Courses: BS


In [None]:
%pip install fuzzywuzzy python-Levenshtein

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Downloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)

In [None]:
from google.colab import files
import joblib
import os

# Create a models directory in Colab.
# os.makedirs replaces the `!mkdir -p` shell escape, so the cell also runs
# as plain Python; exist_ok=True keeps re-running the cell harmless.
os.makedirs('models', exist_ok=True)

# Save all your models (add this after training).
# Maps each file name inside models/ to the object stored in it.
artifacts = {
    'major_recommendation_model.pkl': model,
    'mlb_skills.pkl': mlb_skills,
    'mlb_courses.pkl': mlb_courses,
    'ohe_work_style.pkl': ohe_work_style,
    'ohe_passion.pkl': ohe_passion,
    'le_major.pkl': le_major,
    'le_faculty.pkl': le_faculty,
    'le_degree.pkl': le_degree,
    'le_campus.pkl': le_campus,
    'master_skills.pkl': list(all_skills),
    'master_courses.pkl': list(all_courses),
    'master_passions.pkl': list(all_passions),
    'master_work_styles.pkl': list(all_work_styles),
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, f'models/{filename}')

print("All models saved successfully!")

All models saved successfully!


In [None]:
# Create a zip file of all models
# (IPython shell escape: recursively archives the models/ directory into
# models.zip in the current working directory)
!zip -r models.zip models/

# Download the zip file
# (files comes from google.colab, so this step only works inside Colab)
from google.colab import files
files.download('models.zip')

  adding: models/ (stored 0%)
  adding: models/le_campus.pkl (deflated 38%)
  adding: models/ohe_passion.pkl (deflated 48%)
  adding: models/le_degree.pkl (deflated 33%)
  adding: models/le_faculty.pkl (deflated 44%)
  adding: models/mlb_skills.pkl (deflated 42%)
  adding: models/master_skills.pkl (deflated 42%)
  adding: models/master_courses.pkl (deflated 57%)
  adding: models/master_passions.pkl (deflated 48%)
  adding: models/mlb_courses.pkl (deflated 56%)
  adding: models/ohe_work_style.pkl (deflated 55%)
  adding: models/le_major.pkl (deflated 40%)
  adding: models/master_work_styles.pkl (deflated 67%)
  adding: models/major_recommendation_model.pkl (deflated 79%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>