
## Course Recommender System

In [None]:
import os
import json
import whisper
import PyMuPDF
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer
import torch

# Load Spacy NLP model for preprocessing
# `nlp` is the pipeline preprocess_text() uses to obtain token lemmas and
# stop-word flags.
nlp = spacy.load("en_core_web_sm")

# Load RoBERTa model and tokenizer
# Both are created once at module level and read as globals by
# roberta_semantic_similarity(); they share the "roberta-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

# Function to process text input
def preprocess_text(text):
    """Normalise free text into a lowercase string of lemmas without stop words.

    Steps: lowercase, drop every character that is not a letter, digit or
    whitespace, collapse whitespace runs to single spaces, then run the
    module-level spaCy pipeline ``nlp`` and join the lemmas of all tokens
    that are not flagged as stop words.

    Args:
        text: Raw input string (typed text, a transcript, or PDF text).

    Returns:
        A single space-separated string of lemmas; ``""`` when nothing is
        left after cleaning.
    """
    text = text.lower()
    # Keep whitespace while stripping punctuation: deleting newlines/tabs
    # outright would glue together words that sit on adjacent lines
    # (common in text extracted from PDFs).
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse the whitespace runs (including gaps left by removed
    # punctuation) so no whitespace-only tokens reach the output.
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return ""
    doc = nlp(text)
    return " ".join(token.lemma_ for token in doc if not token.is_stop)

# Whisper models already loaded in this session, keyed by model name, so
# repeated transcriptions do not reload the weights each time.
_WHISPER_MODELS = {}


def _get_whisper_model(name):
    """Return the Whisper model called *name*, loading it on first use."""
    if name not in _WHISPER_MODELS:
        _WHISPER_MODELS[name] = whisper.load_model(name)
    return _WHISPER_MODELS[name]


# Function to process voice input using Whisper
def process_voice(audio_path):
    """Transcribe an audio file with Whisper and preprocess the transcript.

    Args:
        audio_path: Path to the audio file handed to Whisper's
            ``transcribe``.

    Returns:
        The transcript text after ``preprocess_text`` has been applied.
    """
    # Named ``speech_model`` so it does not shadow the module-level RoBERTa
    # ``model`` used for semantic similarity.
    speech_model = _get_whisper_model("base")
    result = speech_model.transcribe(audio_path)
    return preprocess_text(result["text"])

# Function to process resume PDF
def process_resume(pdf_path):
    """Extract the plain text of every page of a PDF and preprocess it.

    Args:
        pdf_path: Path to the resume PDF.

    Returns:
        The concatenated page text after ``preprocess_text`` has been
        applied.
    """
    # NOTE(review): the PyMuPDF distribution is imported as ``fitz`` (or
    # ``pymupdf`` in recent releases); there is no module named ``PyMuPDF``,
    # so the top-level ``import PyMuPDF`` should be corrected as well.
    import fitz

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as pdf:
        raw_text = "".join(page.get_text("text") for page in pdf)
    return preprocess_text(raw_text)

# Function for TF-IDF keyword matching
def tfidf_keyword_matching(user_input, course_descriptions):
    """Score each course description against the user input with TF-IDF.

    The user input and all descriptions are vectorised together with a
    fresh ``TfidfVectorizer``; the cosine similarity between the user-input
    row and every description row is returned.

    Args:
        user_input: Query text.
        course_descriptions: Iterable of description strings (list, tuple,
            pandas Series, ...). A single string is treated as one
            description.

    Returns:
        1-D NumPy array with one similarity score per description, in input
        order; an empty array when there are no descriptions.
    """
    import numpy as np  # local import: numpy is not imported at file level

    if isinstance(course_descriptions, str):
        descriptions = [course_descriptions]
    else:
        # Materialise as a list so ``[user_input] + ...`` is a plain list
        # concatenation and not, e.g., element-wise Series arithmetic.
        descriptions = list(course_descriptions)

    # cosine_similarity rejects a matrix with zero rows, so short-circuit.
    if not descriptions:
        return np.array([], dtype=float)

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([user_input] + descriptions)
    # Row 0 is the user input; the remaining rows are the descriptions.
    scores = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:])
    return scores.flatten()

# Function for RoBERTa semantic similarity
def roberta_semantic_similarity(user_input, course_descriptions):
    """Score each course description against the user input with RoBERTa.

    All texts are tokenised as one padded, truncated batch with the
    module-level ``tokenizer`` and run through the module-level ``model``
    without gradient tracking. Each text is represented by the hidden state
    at sequence position 0, and the cosine similarity between the
    user-input vector and every description vector is returned.

    Args:
        user_input: Query text.
        course_descriptions: Iterable of description strings (list, tuple,
            pandas Series, ...). A single string is treated as one
            description.

    Returns:
        List of floats, one similarity score per description in input
        order; ``[]`` when there are no descriptions.
    """
    if isinstance(course_descriptions, str):
        descriptions = [course_descriptions]
    else:
        descriptions = list(course_descriptions)

    # cosine_similarity rejects a matrix with zero rows, so short-circuit.
    if not descriptions:
        return []

    inputs = tokenizer([user_input] + descriptions, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        # Take the hidden state of the first token of every sequence.
        embeddings = model(**inputs).last_hidden_state[:, 0, :]
    # Hand scikit-learn an explicit NumPy array instead of a torch tensor.
    vectors = embeddings.cpu().numpy()
    # Row 0 is the user input; the remaining rows are the descriptions.
    cosine_sim = cosine_similarity(vectors[:1], vectors[1:])
    return cosine_sim.flatten().tolist()

# Example Usage
# if __name__ == "__main__":
#     text_input = "I am looking for AI and ML courses to enhance my skills."
#     voice_input = process_voice("sample_audio.mp3")
#     resume_input = process_resume("resume.pdf")
    
#     course_descriptions = [
#         "This course covers machine learning algorithms and AI applications.",
#         "Learn web development with JavaScript and React.",
#         "An advanced deep learning course with neural networks.",
#     ]
    
#     print("Processed Text Input:", preprocess_text(text_input))
#     print("Processed Voice Input:", voice_input)
#     print("Processed Resume Input:", resume_input)
    
#     tfidf_scores = tfidf_keyword_matching(text_input, course_descriptions)
#     print("TF-IDF Similarity Scores:", tfidf_scores)
    
#     roberta_scores = roberta_semantic_similarity(text_input, course_descriptions)
#     print("RoBERTa Similarity Scores:", roberta_scores)
