<a href="https://colab.research.google.com/github/Aashigupta1288/Career_Recommendation_System/blob/main/model_train_video_fully_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from google.colab import files


# Open Colab's file picker; the result maps each uploaded filename to its bytes.
uploaded = files.upload()
# Show only names and sizes: echoing `uploaded` itself would dump the raw file
# contents into the cell output.
{name: len(content) for name, content in uploaded.items()}

In [None]:
! pip install DeepFace  librosa

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
import joblib

In [None]:
# For Jupyter widget upload
import ipywidgets as widgets
from IPython.display import display



In [None]:
# --- Step 1: Load and preprocess CSV ---

df = pd.read_csv('updated_data.csv')

# Keep only rows that have all five trait scores and a career label.
df = df.dropna(subset=['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism', 'Career Role'])

# Turn the comma-separated label string into a list of role names.
# Empty tokens (from a trailing comma or ",,") are skipped so they do not
# end up as a spurious '' class when the labels are binarized later.
df['Career Role'] = df['Career Role'].astype(str).apply(
    lambda x: [role.strip() for role in x.split(',') if role.strip()]
)



In [None]:
# Column names of the five trait scores used as model inputs.
features = ['Openness', 'Conscientiousness', 'Extraversion', 'Agreeableness', 'Neuroticism']
# Plain NumPy matrix with one row per sample and one column per trait.
X = df[features].to_numpy()

# Standardise each trait column; the fitted scaler is reused at prediction time.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)



In [None]:
# Multi-label binarize target: one indicator column per distinct career role.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['Career Role'])

# Split the *unscaled* feature matrix so that the scaler below never sees the
# held-out rows (fitting it on all rows leaks test statistics into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-fit the scaler on the training rows only, then apply it to both splits.
# `scaler` is rebound here so the object saved and used for prediction later
# is the leakage-free one.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)



In [None]:
# --- Step 2: Train multi-label classifier ---
from sklearn.metrics import accuracy_score

# A random forest wrapped so that each label column of y_train gets its own fit.
base_clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf = MultiOutputClassifier(base_clf).fit(X_train, y_train)

# Score the held-out split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2f}")
print("Classification report:\n")
report = classification_report(
    y_test,
    y_pred,
    target_names=mlb.classes_,
    zero_division=0,
)
print(report)



In [None]:
# Persist the fitted classifier, feature scaler and label binarizer to disk
# so they can be loaded again without retraining.
joblib.dump(value=clf, filename='career_big5_model.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')
joblib.dump(value=mlb, filename='mlb.pkl')



In [None]:
# --- Step 3: Prediction function ---

def predict_career(openness, conscientiousness, extraversion, agreeableness, neuroticism):
    """Predict career roles for a single set of five trait scores.

    The scores are arranged as one row, passed through the notebook-level
    `scaler`, classified with `clf`, and decoded back to role names with `mlb`.

    Returns:
        The decoded roles for that one row, or an empty list if decoding
        produced nothing.
    """
    trait_values = [openness, conscientiousness, extraversion, agreeableness, neuroticism]
    single_row = np.array([trait_values])

    label_indicators = clf.predict(scaler.transform(single_row))
    decoded = mlb.inverse_transform(label_indicators)

    if not decoded:
        return []
    return decoded[0]



In [None]:
import cv2
import librosa
from deepface import DeepFace
import moviepy.editor as mp
from sklearn.preprocessing import StandardScaler
import joblib


def extract_video_features(video_path, frame_step=1, fallback_dim=128):
    """Average DeepFace embeddings over the frames of a video.

    Args:
        video_path: Path of the video file to open with OpenCV.
        frame_step: Process every `frame_step`-th frame. The default of 1
            processes every frame; larger values reduce the number of
            DeepFace calls.
        fallback_dim: Length of the all-zero vector returned when no
            embedding could be computed.
            NOTE(review): 128 may not match the embedding length of the
            DeepFace model in use — confirm before combining with real
            embeddings.

    Returns:
        The element-wise mean of all collected embeddings, or
        `np.zeros(fallback_dim)` when none were collected (video could not be
        read, or every processed frame raised an error).

    Raises:
        ValueError: If `frame_step` is smaller than 1.
    """
    if frame_step < 1:
        raise ValueError("frame_step must be a positive integer")

    embeddings = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print(f"Could not open video: {video_path}")

        frame_index = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            selected = frame_index % frame_step == 0
            frame_index += 1
            if not selected:
                continue

            # Best effort per frame: a failing frame is reported and skipped.
            try:
                result = DeepFace.represent(frame, enforce_detection=False)
                embeddings.append(result[0]['embedding'])
            except Exception as e:
                print(f"Error in processing frame: {e}")
    finally:
        # Release the capture even if the loop is interrupted.
        cap.release()

    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(fallback_dim)

def extract_audio_features_from_video(video_path, n_mfcc=13):
    """Compute mean MFCCs of a video's audio track.

    The audio track is written to a temporary WAV file, loaded with librosa at
    its native sample rate, and summarised as the per-coefficient mean of the
    MFCC matrix. The temporary file and the video clip are always cleaned up.

    Args:
        video_path: Path of the video file to read with moviepy.
        n_mfcc: Number of MFCC coefficients to compute (default 13).

    Returns:
        A 1-D array of length `n_mfcc`. If the video has no audio track an
        all-zero vector of that length is returned instead of raising.
    """
    import os
    import tempfile

    video = mp.VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # No audio stream in this file: fall back to a zero vector.
            print(f"No audio track found in video: {video_path}")
            return np.zeros(n_mfcc)

        # Use a unique temp file so repeated calls neither overwrite each
        # other's audio nor leave files behind in the working directory.
        fd, audio_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
        try:
            audio.write_audiofile(audio_path)
            signal, sr = librosa.load(audio_path, sr=None)
        finally:
            if os.path.exists(audio_path):
                os.remove(audio_path)
    finally:
        # Release the reader resources held by the clip.
        video.close()

    mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    # Collapse the time axis: one mean value per MFCC coefficient.
    return np.mean(mfccs, axis=1)

In [None]:
def extract_traits_from_video(video_path):
    """Build one combined feature vector from a video's frames and audio.

    Calls `extract_video_features` and `extract_audio_features_from_video` on
    the same path, logs the shape of each result, and returns the two vectors
    joined end to end (video part first). The result is a raw feature vector,
    not a set of trait scores.
    """
    print(f"Processing video for trait extraction: {video_path}")

    face_vector = extract_video_features(video_path)
    print(f"Extracted video features (face embeddings): {face_vector.shape}")

    mfcc_vector = extract_audio_features_from_video(video_path)
    print(f"Extracted audio features (MFCCs): {mfcc_vector.shape}")

    # Default axis of np.concatenate is 0, i.e. the vectors are chained.
    return np.concatenate([face_vector, mfcc_vector])

In [None]:
# --- Step 4: Placeholder for Big Five trait extraction from video ---

def extract_big5_traits_from_video(video_path):
    """Placeholder: return five random trait scores for a video.

    The video at `video_path` is not read; the path is only logged. The
    result is five values drawn uniformly between 2 and 4 from NumPy's global
    random state, so it differs from call to call.
    """
    print(f"Processing video for trait extraction: {video_path}")
    lower_bound, upper_bound = 2, 4
    return np.random.uniform(low=lower_bound, high=upper_bound, size=5)


# Single-file upload button limited to the listed video extensions.
upload_options = dict(
    accept='.mp4, .avi, .mov',
    multiple=False,
    description='Upload Video',
)
uploader = widgets.FileUpload(**upload_options)
display(uploader)

def on_upload_change(change):
    """Save each uploaded video, derive trait scores, and print predicted roles.

    Registered as an observer on `uploader`'s `value` trait. Works with both
    layouts of `FileUpload.value`: a dict keyed by filename (ipywidgets 7.x)
    and a sequence of per-file dicts carrying a 'name' key (ipywidgets 8.x).

    Args:
        change: The trait-change notification passed by the widget; the
            current uploads are read from `uploader.value`.
    """
    import os

    uploads = uploader.value
    if not uploads:
        return

    if isinstance(uploads, dict):
        # ipywidgets 7.x: {filename: {'content': bytes, ...}}
        entries = [(name, info['content']) for name, info in uploads.items()]
    else:
        # ipywidgets 8.x: ({'name': ..., 'content': memoryview, ...}, ...)
        entries = [(info['name'], info['content']) for info in uploads]

    for uploaded_name, content in entries:
        # Keep only the final path component so a crafted name cannot write
        # outside the working directory.
        filename = os.path.basename(uploaded_name)

        # Save uploaded video locally
        with open(filename, 'wb') as f:
            f.write(content)
        print(f"Saved uploaded video as {filename}")

        # Extract Big Five traits from the video (placeholder)
        traits = extract_big5_traits_from_video(filename)
        print(f"Extracted Big Five traits: {traits}")

        # Predict career roles
        predicted_roles = predict_career(*traits)
        print(f"Predicted career roles: {predicted_roles}")

# Run the handler whenever the widget's uploaded-file value changes.
uploader.observe(on_upload_change, names='value')



In [None]:
from sklearn.metrics import hamming_loss

# Hamming loss of the test-split predictions against the true label matrix.
test_hamming_loss = hamming_loss(y_test, y_pred)
print("Hamming Loss:", test_hamming_loss)


In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score applied to the test-split label matrix and its predictions.
exact_match_accuracy = accuracy_score(y_test, y_pred)
print("Exact Match Accuracy:", exact_match_accuracy)
