In [5]:
import pandas as pd
import re
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, List, Union
from ast import literal_eval

class CourseRecommender:
    def __init__(self, data_path: str):
        """Initialize recommender with robust data handling"""
        self.skill_synonyms = self._initialize_skill_synonyms()
        self.df = self.load_and_preprocess_data(data_path)
        self.tfidf = None
        self.tfidf_matrix = None

        # Enhanced category keywords with tech/data focus
        self.category_keywords = {
            'business': ['business', 'marketing', 'finance', 'management',
                       'entrepreneurship', 'economics', 'accounting', 'investment'],
            'health': ['health', 'medical', 'nutrition', 'wellness',
                     'nursing', 'pharmacy', 'fitness', 'medicine'],
            'arts': ['art', 'history', 'philosophy', 'literature',
                   'music', 'design', 'writing', 'language'],
            'data': ['data', 'analytics', 'statistics', 'analysis',
                   'python', 'sql', 'machine learning', 'ai', 'artificial intelligence']
        }

    def _initialize_skill_synonyms(self) -> Dict[str, List[str]]:
        """Initialize comprehensive skill synonyms with focused groupings"""
        return {
            "Data & Analytics": ["Data Analysis", "Analytics", "Business Intelligence", "Data Science", "Machine Learning", "Artificial Intelligence", "Big Data", "Data Management", "Statistical Analysis", "Data Visualization"],
            "Programming & Development": ["Software Development", "Programming", "Web Development", "Cloud Computing", "Cybersecurity", "Databases", "API", "DevOps", "System Design", "Mobile Development"],
            "Marketing & Sales": ["Digital Marketing", "Marketing Strategy", "Advertising", "Branding", "Sales", "Content Marketing", "Social Media Marketing", "Market Research", "Customer Relationship Management", "E-commerce"],
            "Project & Operations Management": ["Project Management", "Operations Management", "Agile Methodology", "Supply Chain Management", "Process Improvement", "Lean Methodologies", "Risk Management", "Strategic Planning", "Resource Management", "Quality Management"],
            "Business & Finance": ["Business Strategy", "Financial Management", "Accounting", "Economics", "Entrepreneurship", "Investment", "Corporate Finance", "Risk Management", "Financial Analysis", "Business Ethics"],
            "Human Resources & Organizational Development": ["Human Resources", "Talent Management", "Recruitment", "Employee Engagement", "Organizational Development", "Leadership Development", "Performance Management", "Diversity and Inclusion", "Compensation Management", "Team Management"],
            "Healthcare & Clinical": ["Public Health", "Nursing Practices", "Patient Care", "Medical Emergency", "Mental Health", "Pharmacology", "Epidemiology", "Health Informatics", "Clinical Leadership", "Preventative Care"],
            "Design & Creative Arts": ["Graphic Design", "UI/UX Design", "User Experience Design", "Web Design", "Animation", "Photography", "Video Production", "Storytelling", "Creative Thinking", "Game Design"],
            "Communication & Soft Skills": ["Communication", "Problem Solving", "Collaboration", "Leadership", "Critical Thinking", "Decision Making", "Negotiation", "Presentation Skills", "Adaptability", "Teamwork"],
            "Research & Analysis": ["Research Methodologies", "Data Analysis", "Qualitative Research", "Quantitative Research", "Market Research", "Statistical Analysis", "Business Analysis", "System Analysis", "Competitive Analysis", "Policy Analysis"],
            "Cybersecurity & IT Operations": ["Cybersecurity", "Network Security", "Cloud Computing", "Information Security", "Threat Detection", "Access Management", "Security Controls", "Vulnerability Management", "Incident Response", "IT Operations"],
            "Product & UX/UI": ["Product Management", "Product Design", "User Experience Design", "User Interface Design", "User Research", "Prototyping", "User Story", "Product Strategy", "Usability Testing", "Service Design"],
            "Education & Training": ["Training and Development", "Education Technology", "Professional Development", "Learning and Development", "Instructional Design", "Mentorship", "Coaching", "Curriculum Development", "Learning Management Systems", "Educational Planning"],
            "Legal & Compliance": ["Regulatory Compliance", "Legal Research", "Contract Management", "Data Privacy", "Intellectual Property", "Corporate Governance", "Risk Management", "Ethical Standards", "Tax Compliance", "Labor Law"]
        }

    def safe_literal_eval(self, x: str) -> List[str]:
        """Safely evaluate stringified lists"""
        try:
            return literal_eval(x) if isinstance(x, str) else []
        except (ValueError, SyntaxError):
            return []

    def load_and_preprocess_data(self, path: str) -> pd.DataFrame:
        """Robust data loading and preprocessing"""
        try:
            df = pd.read_csv("cleaned_coursera_4300.csv")
        except Exception as e:
            raise ValueError(f"Error loading CSV file: {str(e)}")

        # Ensure required columns exist
        required_columns = ['title', 'description', 'skill_tags', 'duration_weeks',
                          'effort_hours', 'level', 'price', 'rating', 'provider', 'url']
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        # Convert all text columns to strings
        text_cols = ['title', 'description', 'skill_tags', 'level', 'provider']
        for col in text_cols:
            df[col] = df[col].astype(str)

        # Text cleaning function
        def clean_text(text: str) -> str:
            text = str(text).lower().strip()
            text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
            text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
            return text

        # Apply text cleaning
        df['cleaned_description'] = df['description'].apply(clean_text).fillna('')
        df['cleaned_title'] = df['title'].apply(clean_text).fillna('')

        # Process skill tags safely
        df['skill_tags'] = df['skill_tags'].apply(self.safe_literal_eval)

        # Expand skills with conservative approach
        df['expanded_skills'] = df['skill_tags'].apply(
            lambda tags: list(set(
                tag for tag in tags if isinstance(tag, str)  # Ensure tags are strings
            )) + [
                new_term for tag in tags
                for new_term in self.skill_synonyms.get(tag, [])[:2]  # Only expand 2 synonyms
                if isinstance(tag, str)
            ]
        )

        # Combine text features safely
        df['combined_text'] = (
            df['cleaned_title'] + " " +
            df['expanded_skills'].apply(lambda x: " ".join(x) if x else "") + " " +
            df['cleaned_description']
        ).str.strip()

        # Numeric feature processing
        df['duration_weeks'] = pd.to_numeric(df['duration_weeks'], errors='coerce')
        df['duration_weeks'] = df['duration_weeks'].clip(1, 5).fillna(2)

        df['effort_hours'] = pd.to_numeric(df['effort_hours'], errors='coerce')
        df['effort_hours'] = df['effort_hours'].clip(1, 20).fillna(5)

        # Difficulty levels
        level_map = {
            'beginner level': 1,
            'intermediate level': 2,
            'advanced level': 3
        }
        df['difficulty_score'] = (
            df['level'].str.lower().map(level_map).fillna(1)
        )

        # Price handling - modified to keep original price string
        if 'price' in df.columns:
            # Create numeric price for calculations
            df['price_numeric'] = (
                df['price'].replace('[\$,]', '', regex=True)
                .astype(float)
                .fillna(0)
            )
            df['is_free'] = df['price_numeric'].apply(lambda x: 1 if x == 0 else 0)
            df['price_normalized'] = df['price_numeric'] / df['price_numeric'].max() if df['price_numeric'].max() > 0 else 0
        else:
            df['price_numeric'] = 0
            df['is_free'] = 1
            df['price_normalized'] = 0

        # Rating handling
        df['rating'] = pd.to_numeric(df['rating'], errors='coerce').clip(1, 5).fillna(3)

        # Add category if missing
        if 'category' not in df.columns:
            df['category'] = 'unknown'
        df['category'] = df['category'].astype(str).str.lower()

        return df

    def detect_category(self, user_prompt: str) -> Union[str, None]:
        """Improved category detection with tech focus"""
        prompt = user_prompt.lower()
        category_scores = {category: 0 for category in self.category_keywords}

        # Exact category mentions get highest weight
        for category in self.category_keywords:
            if re.search(rf'\b{category}\b', prompt):
                category_scores[category] += 3

        # Keyword matches
        for category, keywords in self.category_keywords.items():
            for keyword in keywords:
                if re.search(rf'\b{keyword}\b', prompt):
                    category_scores[category] += 1

        # Data/tech terms reduce business matches
        tech_terms = ['data', 'analytics', 'python', 'sql', 'machine learning', 'ai']
        if any(re.search(rf'\b{term}\b', prompt) for term in tech_terms):
            category_scores['business'] = max(0, category_scores['business'] - 2)
            category_scores['data'] += 3

        max_score = max(category_scores.values())
        return max(category_scores, key=category_scores.get) if max_score > 0 else None

    def extract_user_requirements(self, user_prompt: str) -> Dict:
        """Robust requirements extraction"""
        prompt = user_prompt.lower()
        requirements = {
            "category": None,
            "timeframe_weeks": 4,
            "weekly_hours": 5,
            "preferred_skills": [],
            "current_level": "beginner",
            "strict_duration": True,
            "preferred_language": "english",
            "preferred_provider": None,
            "free_only": False,
            "min_rating": 3.0
        }

        # Category detection
        requirements['category'] = self.detect_category(prompt)

        # Level detection
        if re.search(r'\bbeginner\b', prompt):
            requirements['current_level'] = "beginner"
        elif re.search(r'\bintermediate\b', prompt):
            requirements['current_level'] = "intermediate"
        elif re.search(r'\badvanced\b', prompt):
            requirements['current_level'] = "advanced"

        # Duration handling
        duration_match = re.search(r'(\d+)\s*(month|week|day)s?', prompt)
        if duration_match:
            num = int(duration_match.group(1))
            unit = duration_match.group(2)
            if unit == 'month':
                requirements['timeframe_weeks'] = num * 4
                requirements['strict_duration'] = False
            elif unit == 'day':
                requirements['timeframe_weeks'] = max(1, round(num / 7))
            else:
                requirements['timeframe_weeks'] = num

        # Effort hours
        hours_match = re.search(r'(\d+)\s*hour', prompt)
        if hours_match:
            requirements['weekly_hours'] = int(hours_match.group(1))

        # Free courses
        requirements['free_only'] = bool(re.search(r'\bfree\b', prompt))

        # Rating filter
        rating_match = re.search(r'rating\s*([\d.]+)', prompt)
        if rating_match:
            requirements['min_rating'] = float(rating_match.group(1))

        # Provider preference
        providers = ['coursera', 'edx', 'udemy', 'university', 'ibm', 'google']
        for provider in providers:
            if re.search(rf'\b{provider}\b', prompt):
                requirements['preferred_provider'] = provider
                break

        # Extract skills
        for skill_category, skills in self.skill_synonyms.items():
            for skill in skills:
                skill_lower = skill.lower()
                if (re.search(rf'\b{skill_lower}\b', prompt) and
                    skill_lower not in requirements['preferred_skills']):
                    requirements['preferred_skills'].append(skill_lower)

        return requirements

    def filter_by_constraints(self, requirements: Dict) -> pd.DataFrame:
        """Apply all filters with robust handling"""
        filtered = self.df.copy()

        # 1. Category Filter
        if requirements['category']:
            if requirements['category'] == 'data':
                # Special handling for data/tech queries
                tech_keywords = self.category_keywords['data']
                pattern = r'\b(?:' + '|'.join(re.escape(kw) for kw in tech_keywords) + r')\b'
                filtered = filtered[
                    filtered['combined_text'].str.contains(pattern, case=False, na=False)
                ]
            else:
                filtered = filtered[
                    filtered['category'].str.lower() == requirements['category'].lower()
                ]

        # 2. Duration Filter
        if requirements['strict_duration']:
            max_duration = min(5, requirements['timeframe_weeks'] * 1.5)
            filtered = filtered[filtered['duration_weeks'] <= max_duration]

        # 3. Level Filter
        level_map = {'beginner': 1, 'intermediate': 2, 'advanced': 3}
        target_level = level_map.get(requirements['current_level'], 1)
        filtered = filtered[filtered['difficulty_score'] == target_level]

        # 4. Effort Filter
        max_effort = requirements['weekly_hours'] * 1.5
        filtered = filtered[filtered['effort_hours'] <= max_effort]

        # 5. Free Filter
        if requirements['free_only']:
            filtered = filtered[filtered['is_free'] == 1]

        # 6. Provider Filter
        if requirements['preferred_provider']:
            filtered = filtered[
                filtered['provider'].str.lower().str.contains(
                    requirements['preferred_provider'], na=False
                )
            ]

        # 7. Rating Filter
        filtered = filtered[filtered['rating'] >= requirements['min_rating']]

        # 8. Skill Filter
        if requirements['preferred_skills']:
            skill_pattern = r'\b(?:' + '|'.join(re.escape(skill) for skill in requirements['preferred_skills']) + r')\b'
            filtered = filtered[
                filtered['expanded_skills'].apply(
                    lambda skills: any(
                        re.search(skill_pattern, str(skill), re.I)
                        for skill in skills
                    ) if skills else False
                )
            ]

        return filtered

    def initialize_tfidf(self, filtered_df: pd.DataFrame):
        """Robust TF-IDF initialization that handles small datasets"""
        if filtered_df.empty:
            self.tfidf = None
            self.tfidf_matrix = None
            return

        # Dynamic parameter adjustment based on dataset size
        n_docs = len(filtered_df)
        min_df = max(1, min(2, n_docs//3))  # Ensure min_df <= number of docs
        max_df = 0.75 if n_docs >= 4 else 1.0  # Relax max_df for small datasets

        self.tfidf = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2),
            min_df=min_df,  # Dynamic minimum
            max_df=max_df   # Dynamic maximum
        )

        try:
            self.tfidf_matrix = self.tfidf.fit_transform(filtered_df['combined_text'])
        except ValueError as e:
            # Fallback to simple vectorizer if standard parameters fail
            self.tfidf = TfidfVectorizer(
                stop_words='english',
                max_features=5000,
                ngram_range=(1, 1),
                min_df=1,
                max_df=1.0
            )
            self.tfidf_matrix = self.tfidf.fit_transform(filtered_df['combined_text'])

    def content_based_recommendation(self,
                                   filtered_df: pd.DataFrame,
                                   requirements: Dict,
                                   top_n: int = 5) -> pd.DataFrame:
        """Enhanced recommendation with robust scoring"""
        if filtered_df.empty or self.tfidf is None:
            return pd.DataFrame()

        # Build boosted query
        boosted_terms = []
        for skill in requirements['preferred_skills']:
            boosted_terms.extend([skill] * 3)  # 3x weight for preferred skills

        if requirements['category'] == 'data':
            boosted_terms.extend(['data', 'analysis'] * 2)  # Boost data terms

        query_text = " ".join(boosted_terms)

        # Calculate similarities
        query_vector = self.tfidf.transform([query_text])

        # Main content similarity
        content_similarity = cosine_similarity(query_vector, self.tfidf_matrix)[0]
        filtered_df['content_score'] = content_similarity

        # Title/skill specific similarity
        title_skill_text = (
            filtered_df['cleaned_title'] + " " +
            filtered_df['expanded_skills'].apply(lambda x: " ".join(x) if x else "")
        )
        title_skill_matrix = self.tfidf.transform(title_skill_text)
        title_skill_similarity = cosine_similarity(query_vector, title_skill_matrix)[0]
        filtered_df['title_skill_score'] = title_skill_similarity

        # Duration scoring
        if requirements['strict_duration']:
            filtered_df['duration_score'] = filtered_df['duration_weeks'].apply(
                lambda x: max(0, 1 - abs(x - requirements['timeframe_weeks']) / 5)
            )
        else:
            # For long requests, prefer shorter courses that can be combined
            filtered_df['duration_score'] = 1 - (
                filtered_df['duration_weeks'] / 10  # Favor shorter courses
            )

        # Composite scoring
        filtered_df['composite_score'] = (
            0.40 * filtered_df['content_score'] +
            0.30 * filtered_df['title_skill_score'] +
            0.15 * filtered_df['duration_score'] +
            0.10 * (filtered_df['rating'] / 5) +  # Normalize rating to 0-1
            0.05 * (1 - filtered_df['price_normalized'])  # Lower price is better
        )

        return filtered_df.sort_values('composite_score', ascending=False).head(top_n)

    def recommend(self, user_prompt: str) -> Dict:
        """Complete recommendation pipeline with fallback logic"""
        try:
            requirements = self.extract_user_requirements(user_prompt)

            # First try with strict filters
            filtered_courses = self.filter_by_constraints(requirements)

            # If too few results, relax duration constraint
            if len(filtered_courses) < 3:
                requirements['strict_duration'] = False
                filtered_courses = self.filter_by_constraints(requirements)

            if not filtered_courses.empty:
                self.initialize_tfidf(filtered_courses)

                if self.tfidf_matrix is not None:
                    recommendations = self.content_based_recommendation(
                        filtered_courses, requirements
                    )
                else:
                    # Fallback to rating-based sorting if TF-IDF fails
                    recommendations = filtered_courses.sort_values(
                        ['rating', 'difficulty_score'],
                        ascending=[False, True]
                    ).head(5)
            else:
                recommendations = pd.DataFrame()

            output_fields = [
                'title', 'provider', 'duration_weeks', 'effort_hours',
                'level', 'url', 'rating', 'price', 'skill_tags', 'category'
            ]

            return {
                "recommendations": recommendations[
                    [col for col in output_fields if col in recommendations.columns]
                ].to_dict('records'),
                "user_requirements": requirements,
                "explanation": self._generate_explanation(user_prompt, requirements, recommendations)
            }
        except Exception as e:
            return {
                "error": str(e),
                "recommendations": [],
                "explanation": "An error occurred while processing your request."
            }

    def _generate_explanation(self,
                            prompt: str,
                            requirements: Dict,
                            recommendations: pd.DataFrame) -> str:
        """Generate detailed explanation for recommendations"""
        if recommendations.empty:
            return "No courses found matching all your requirements."

        explanation = []

        # Duration explanation
        if "month" in prompt.lower() and requirements['timeframe_weeks'] > 4:
            explanation.append(
                f"While you requested {requirements['timeframe_weeks']//4} month(s) long courses, "
                "we're recommending high-quality shorter courses that could be combined:"
            )

        # Category info
        category = requirements['category'] or "your interests"
        explanation.append(f"\nSelected {category} courses matching:")

        # Course list
        course_list = []
        for _, row in recommendations.iterrows():
            course_str = f"- {row['title']} by {row['provider']}"
            course_str += f" ({row['duration_weeks']} weeks"
            course_str += f", {row['level']}" if 'level' in row else ""
            # Add price information
            if 'price' in row:
                price = float(row['price']) if isinstance(row['price'], str) and row['price'].replace('.','',1).isdigit() else row['price']
                course_str += f", ${price:.2f}" if isinstance(price, (int, float)) and price > 0 else ", Free"
            course_str += ")"
            course_list.append(course_str)
        explanation.append("\n".join(course_list))

        # Skills info
        if requirements['preferred_skills']:
            explanation.append(
                f"\nMatching skills: {', '.join(requirements['preferred_skills'])}"
            )

        # Level info
        explanation.append(
            f"\nSelected {requirements['current_level']}-level courses."
        )

        return "\n".join(explanation)


if __name__ == "__main__":
    # Demo run: build the recommender, then print results for sample prompts.
    # Any exception, during construction or while printing, ends the run with
    # the message in the except clause below.
    try:
        # Initialize with your CSV path
        recommender = CourseRecommender("cleaned_coursera_4300.csv")
        print("Course recommender initialized successfully!\n")

        # Printed after every query, whatever its outcome
        separator = "\n" + "=" * 80 + "\n"

        # Test queries
        test_queries = [
            "I'm a business graduate with no tech background. I want to become a Data Analyst in 3 months. I can spend 5 hours per week.",
            "Intermediate web development with APIs, 6 hours weekly for 2 months",
            "Beginner data analysis and statistics courses for 8 weeks"
        ]

        for query in test_queries:
            print(f"Query: '{query}'")
            results = recommender.recommend(query)

            if results.get('error'):
                print(f"Error: {results['error']}")
            elif not results['recommendations']:
                print("No matching courses found.")
            else:
                print("\nExplanation:")
                print(results['explanation'])
                print("\nTop Recommendations:")
                for i, course in enumerate(results['recommendations'][:5], 1):
                    print(f"{i}. {course['title']} ({course['provider']})")
                    print(f"   Duration: {course['duration_weeks']} weeks")
                    print(f"   Level: {course.get('level', 'N/A')}")
                    # A price of exactly 0 is shown as "Free"
                    if float(course.get('price', 0)) == 0:
                        price_label = 'Free'
                    else:
                        price_label = '$' + str(round(float(course.get('price', 0)), 2))
                    print(f"   Price: {price_label}")
                    # Show at most the first three skill tags
                    top_skills = course.get('skill_tags', [])[:3]
                    print(f"   Skills: {', '.join(top_skills)}")
                    print(f"   URL: {course.get('url', 'N/A')}")
            print(separator)

    except Exception as e:
        print(f"Initialization error: {str(e)}")

if __name__ == "__main__":
    # Build the recommender, pickle it, then print results for sample prompts.
    # Any exception in these steps is reported by the except clause below.
    try:
        csv_path = "data/cleaned_coursera_4300.csv"  # Relative path
        recommender = CourseRecommender(csv_path)
        print("CourseRecommender initialized successfully!")

        # Save to pickle. The success message is built from the same variable
        # used for writing, so it always reports the real location (it used
        # to claim "models/recommender.pkl" while writing "recommender.pkl").
        # NOTE: pickle stores the class by reference, so loading this file
        # requires CourseRecommender to be importable under the same module
        # name; only unpickle files from a trusted source.
        model_path = "recommender.pkl"
        with open(model_path, "wb") as f:
            pickle.dump(recommender, f)
        print(f"Model saved to {model_path}")

        # Optional test run (can be removed if not needed)
        test_queries = [
            "I'm a business graduate with no tech background. I want to become a Data Analyst in 3 months. I can spend 5 hours per week.",
            "Intermediate web development with APIs, 6 hours weekly for 2 months",
            "Beginner data analysis and statistics courses for 8 weeks"
        ]

        for query in test_queries:
            print(f"\nQuery: '{query}'")
            results = recommender.recommend(query)

            if results.get('error'):
                print(f"Error: {results['error']}")
            elif results['recommendations']:
                print("\nExplanation:")
                print(results['explanation'])
                print("\nTop Recommendations:")
                for i, course in enumerate(results['recommendations'][:5], 1):
                    print(f"{i}. {course['title']} ({course['provider']})")
                    print(f"   Duration: {course['duration_weeks']} weeks")
                    print(f"   Level: {course.get('level', 'N/A')}")
                    # A price of exactly 0 is shown as "Free"
                    print(f"   Price: {'Free' if float(course.get('price', 0)) == 0 else '$' + str(round(float(course.get('price', 0)), 2))}")
                    # Show at most the first three skill tags
                    print(f"   Skills: {', '.join(course.get('skill_tags', [])[:3])}")
                    print(f"   URL: {course.get('url', 'N/A')}")
            else:
                print("No matching courses found.")

    except Exception as e:
        print(f"Initialization or save error: {str(e)}")

  df['price'].replace('[\$,]', '', regex=True)


Course recommender initialized successfully!

Query: 'I'm a business graduate with no tech background. I want to become a Data Analyst in 3 months. I can spend 5 hours per week.'

Explanation:
While you requested 3 month(s) long courses, we're recommending high-quality shorter courses that could be combined:

Selected data courses matching:
- AI-Powered Data Analysis: A Practical Introduction by University of Michigan (2.0 weeks, beginner level, Free)
- Capstone: Create Value from Open Data by ESSEC Business School (1.0 weeks, beginner level, Free)
- Chat with Your Data: Generative AI-Powered SQL Data Analysis by Vanderbilt University (2.0 weeks, beginner level, $3800.00)
- Exploratory Data Analysis with MATLAB by MathWorks (2.0 weeks, beginner level, $2600.00)
- Introduction to Data Analytics by Meta (2.0 weeks, beginner level, $2200.00)

Selected beginner-level courses.

Top Recommendations:
1. AI-Powered Data Analysis: A Practical Introduction (University of Michigan)
   Duration: 2