In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import re
from tqdm.notebook import tqdm  # JUPYTER-OPTIMIZED: For progress bars
import ipywidgets as widgets     # JUPYTER-OPTIMIZED: For interactive controls
from IPython.display import display, clear_output

# Register tqdm for pandas apply/map functions
tqdm.pandas()

class InternshipRecommender:
    """
    Recommends internships using a hybrid approach optimized for Jupyter.

    Ranking combines TF-IDF cosine similarity between the candidate's
    skills/sector text and each internship's title + company name with
    optional additive boosts for an exact location match and for stipend.
    """
    def __init__(self, dataset_path: str):
        """
        Load the dataset, normalize it, and pre-fit the TF-IDF matrix.

        Args:
            dataset_path: Path of the CSV file to load. It must contain the
                columns 'internship_title', 'company_name' and 'location';
                'stipend' and 'InternshipID' are optional.

        Raises:
            ValueError: If a required text column is missing.
        """
        print("Loading and preprocessing data...")
        # Read from the path argument (the original referenced an undefined name).
        # reset_index keeps index labels equal to row positions, which
        # recommend() relies on when slicing the TF-IDF matrix.
        self.df = pd.read_csv(dataset_path).reset_index(drop=True)
        self._preprocess()

        # Combine text fields for TF-IDF
        self.df['Combined'] = self.df['internship_title'].astype(str).str.lower() + ' ' + self.df['company_name'].astype(str).str.lower()

        # Pre-fit TF-IDF vectorizer
        print("Building TF-IDF matrix...")
        self.vectorizer = TfidfVectorizer(stop_words="english")
        self.internship_tfidf_matrix = self.vectorizer.fit_transform(self.df['Combined'])
        print("✅ Recommender is ready.")

    def _preprocess(self):
        """
        Clean and normalize the internship dataset in place.

        Text columns are lower-cased and stripped, 'InternshipID' is created
        when absent, and 'stipend' (when present) is coerced to a number.

        Raises:
            ValueError: If a required text column is missing.
        """
        text_cols = ['internship_title', 'company_name', 'location']
        for col in text_cols:
            if col not in self.df.columns:
                raise ValueError(f"Missing column: {col} in dataset")
            # Fill missing values BEFORE casting to str; casting first turns
            # them into the literal text 'nan'/'none', which fillna cannot undo.
            self.df[col] = (
                self.df[col]
                .fillna('')
                .astype(str)
                .str.lower()
                .str.strip()
            )

        if 'InternshipID' not in self.df.columns:
            self.df['InternshipID'] = range(1, len(self.df) + 1)

        if 'stipend' in self.df.columns:
            self.df['stipend'] = pd.to_numeric(self.df['stipend'], errors='coerce').fillna(0)

    def _clean_text(self, text: str) -> str:
        """Lower-case `text` and replace every non-alphanumeric, non-space character with a space."""
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
        return text.strip()

    def _get_match_reason(self, candidate_skills_str: str, internship_title: str) -> str:
        """
        Explain a recommendation by listing words shared by skills and title.

        Args:
            candidate_skills_str: Whitespace-separated, already cleaned skills.
            internship_title: Normalized internship title.

        Returns:
            A sentence naming the overlapping words, or a generic fallback.
        """
        candidate_skills = set(candidate_skills_str.split())
        internship_words = set(internship_title.split())
        matched = candidate_skills.intersection(internship_words)
        if matched:
            # Sorted so the message is stable from run to run (set order is not).
            return f"Matched skills: {', '.join(sorted(matched))}."
        return "Relevant based on title/company."

    def recommend(self, candidate: dict, top_n: int = 5, location_boost: float = 0.1, stipend_boost_weight: float = 0.0):
        """
        Rank internships for a candidate profile.

        Args:
            candidate: Dict with optional keys 'Skills', 'SectorInterest'
                and 'LocationPreference'.
            top_n: Number of rows to return.
            location_boost: Added to the score of rows whose location equals
                the preferred location exactly.
            stipend_boost_weight: Weight of the stipend (scaled by the maximum
                stipend among the candidate rows) added to the score.

        Returns:
            A DataFrame with columns Title, Company, Location, Stipend, Score
            and Reason, sorted by descending score.
        """
        # `or ''` guards against explicit None values, which str() would
        # otherwise turn into the search term 'none'.
        candidate_skills = self._clean_text(candidate.get('Skills') or '')
        candidate_sector = self._clean_text(candidate.get('SectorInterest') or '')
        candidate_loc = self._clean_text(candidate.get('LocationPreference') or '')

        # Literal substring match: the preference is user text, not a regex.
        location_mask = self.df['location'].str.contains(candidate_loc, na=False, regex=False)
        filtered_df = self.df[location_mask].copy()
        if filtered_df.empty:
            # No internship in the preferred location: fall back to all rows.
            filtered_df = self.df.copy()

        candidate_text = candidate_skills + ' ' + candidate_sector
        candidate_vec = self.vectorizer.transform([candidate_text])

        # Index labels equal row positions (see __init__), so they can be
        # used directly to pick the matching rows of the TF-IDF matrix.
        filtered_internship_vecs = self.internship_tfidf_matrix[filtered_df.index.to_numpy()]

        similarity_scores = cosine_similarity(candidate_vec, filtered_internship_vecs).flatten()
        filtered_df.loc[:, 'SimilarityScore'] = similarity_scores
        if candidate_loc:
            # Skip the boost for an empty preference so rows with a blank
            # location are not rewarded.
            filtered_df.loc[filtered_df['location'] == candidate_loc, 'SimilarityScore'] += location_boost

        has_stipend = 'stipend' in filtered_df.columns
        if stipend_boost_weight > 0 and has_stipend:
            max_stipend = filtered_df['stipend'].max()
            if max_stipend > 0:
                stipend_score = filtered_df['stipend'] / max_stipend
                filtered_df.loc[:, 'SimilarityScore'] += stipend_score * stipend_boost_weight

        top_recommendations = filtered_df.sort_values(by='SimilarityScore', ascending=False).head(top_n)

        # Instead of a list of dicts, we'll build a DataFrame for display
        result_df = pd.DataFrame()
        result_df['Title'] = top_recommendations['internship_title']
        result_df['Company'] = top_recommendations['company_name']
        result_df['Location'] = top_recommendations['location']
        # 'stipend' is optional in the dataset; report 0 instead of raising KeyError.
        result_df['Stipend'] = top_recommendations['stipend'] if has_stipend else 0.0
        result_df['Score'] = top_recommendations['SimilarityScore']
        result_df['Reason'] = top_recommendations['internship_title'].apply(lambda title: self._get_match_reason(candidate_skills, title))

        return result_df

    # JUPYTER-OPTIMIZED: An interactive method to wrap the recommender
    def recommend_interactive(self):
        """Display text/slider widgets and a button that renders styled recommendations."""
        # Define widgets
        skills_widget = widgets.Text(value='python ml data', description='Skills:', layout={'width': '400px'})
        location_widget = widgets.Text(value='delhi', description='Location:', layout={'width': '400px'})
        top_n_widget = widgets.IntSlider(value=5, min=1, max=15, step=1, description='Top N:')
        loc_boost_widget = widgets.FloatSlider(value=0.1, min=0, max=0.5, step=0.05, description='Location Boost:')
        stipend_boost_widget = widgets.FloatSlider(value=0.0, min=0, max=0.5, step=0.05, description='Stipend Boost:')
        button = widgets.Button(description="Get Recommendations", button_style='success')
        output = widgets.Output()

        def on_button_click(b):
            # Everything displayed inside this context is routed to the Output widget.
            with output:
                clear_output(wait=True)
                candidate_profile = {
                    'Skills': skills_widget.value,
                    'LocationPreference': location_widget.value,
                    'SectorInterest': '' # Can be added as another widget if needed
                }

                results_df = self.recommend(
                    candidate_profile,
                    top_n=top_n_widget.value,
                    location_boost=loc_boost_widget.value,
                    stipend_boost_weight=stipend_boost_widget.value
                )

                # Style the output DataFrame for better visualization
                styled_df = results_df.style.background_gradient(cmap='Greens', subset=['Score']) \
                                            .format({'Score': '{:.3f}', 'Stipend': '₹{:,.0f}'}) \
                                            .set_properties(**{'text-align': 'left'}) \
                                            .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
                display(styled_df)

        button.on_click(on_button_click)

        # Display all widgets
        display(skills_widget, location_widget, top_n_widget, loc_boost_widget, stipend_boost_widget, button, output)


In [9]:
# --- Data Handling & Core Logic ---
import pandas as pd
import re

# --- Machine Learning ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Jupyter Notebook Interactive Widgets & Display ---
import ipywidgets as widgets
from IPython.display import display, clear_output

# Reaching this line means every import above succeeded.
print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [10]:
class InternshipRecommender:
    """
    Recommends internships using a hybrid approach, optimized for Jupyter Notebooks.
    (Updated to handle complex stipend formats)

    Scores are the TF-IDF cosine similarity between the candidate's skills
    and each internship's title + company name, plus additive boosts for a
    matching location and for a higher stipend.
    """
    def __init__(self, dataframe: pd.DataFrame):
        """
        Normalize a copy of `dataframe` and pre-fit the TF-IDF matrix.

        Args:
            dataframe: Internship data with the columns 'internship_title',
                'company_name', 'location' and 'stipend'. The caller's frame
                is not modified.

        Raises:
            ValueError: If a required column is missing.
        """
        print("Preprocessing data...")
        self.df = dataframe.copy()
        self._preprocess()

        # Combine text fields for TF-IDF vectorization
        self.df['Combined'] = self.df['internship_title'].astype(str) + ' ' + self.df['company_name'].astype(str)

        print("Building TF-IDF matrix...")
        self.vectorizer = TfidfVectorizer(stop_words="english", min_df=2)
        self.internship_tfidf_matrix = self.vectorizer.fit_transform(self.df['Combined'])
        print("✅ Recommender is ready to use.")

    @staticmethod
    def _parse_stipend(stipend_text):
        """
        Convert a free-text stipend description into a number.

        Returns 0 for text containing 'unpaid' or no digits, the single
        number when one is found, and the mean when several are found
        (e.g. a range such as '5,000-10,000').
        """
        stipend_text = str(stipend_text).lower()
        if 'unpaid' in stipend_text:
            return 0

        # Each match must START with a digit; the earlier pattern `[\d,]+`
        # also matched a lone comma (as in "8,000 /month, plus incentives"),
        # which then crashed in int('').
        numbers = re.findall(r'\d[\d,]*', stipend_text)
        if not numbers:
            return 0

        # Drop thousands separators before converting to integers.
        cleaned_numbers = [int(n.replace(',', '')) for n in numbers]

        # If it's a range (e.g., 5,000-10,000), return the average
        if len(cleaned_numbers) > 1:
            return sum(cleaned_numbers) / len(cleaned_numbers)
        # Otherwise, return the single number found
        return cleaned_numbers[0]

    def _preprocess(self):
        """
        Cleans and normalizes the internship dataset.

        Text columns are lower-cased and stripped, 'InternshipID' is added
        when absent, and 'stipend' is parsed into a numeric value.

        Raises:
            ValueError: If a required column is missing.
        """
        required_cols = ['internship_title', 'company_name', 'location', 'stipend']
        for col in required_cols:
            if col not in self.df.columns:
                raise ValueError(f"Missing required column: '{col}' in the dataset.")

        for col in ['internship_title', 'company_name', 'location']:
            # Fill missing cells first so they become '' rather than the
            # literal text 'nan', which would pollute TF-IDF and matching.
            self.df[col] = self.df[col].fillna('').astype(str).str.lower().str.strip()

        if "InternshipID" not in self.df.columns:
            self.df["InternshipID"] = range(1, len(self.df) + 1)

        # Apply the stipend parser to the stipend column
        self.df['stipend'] = self.df['stipend'].apply(self._parse_stipend)
        print("✅ Stipend column processed successfully.")

    def _clean_text(self, text: str) -> str:
        """A simple utility function to clean input text."""
        return re.sub(r'[^a-zA-Z0-9\s]', ' ', str(text).lower()).strip()

    def _get_match_reason(self, candidate_skills_str: str, internship_title: str) -> str:
        """Generates an explanation based on overlapping words."""
        candidate_skills = set(candidate_skills_str.split())
        internship_words = set(internship_title.split())
        matched = candidate_skills.intersection(internship_words)
        if matched:
            # Sorted so the explanation is stable across runs (set order is not).
            return f"Matched on: {', '.join(sorted(matched))}"
        return "Relevant by title/company"

    def recommend(self, candidate: dict, top_n: int = 5, location_boost: float = 0.1, stipend_boost: float = 0.05):
        """
        Generates top N recommendations and returns a styled DataFrame.

        Args:
            candidate: Dict with optional keys 'Skills' and 'LocationPreference'.
            top_n: Number of recommendations to return.
            location_boost: Added to the score of rows whose location contains
                the preferred location.
            stipend_boost: Weight of the stipend (scaled by the maximum
                stipend in the dataset) added to the score.

        Returns:
            A pandas Styler over the columns Title, Company, Location,
            'Stipend (₹)', 'Relevance Score' and Reason.
        """
        # `or ''` guards against explicit None values, which str() would
        # otherwise turn into the search term 'none'.
        candidate_skills = self._clean_text(candidate.get('Skills') or '')
        candidate_loc = self._clean_text(candidate.get('LocationPreference') or '')

        candidate_vec = self.vectorizer.transform([candidate_skills])
        similarity_scores = cosine_similarity(candidate_vec, self.internship_tfidf_matrix).flatten()

        results_df = self.df.copy()
        results_df['Score'] = similarity_scores

        if candidate_loc:
            # An empty preference is a substring of every location, so it used
            # to boost all rows; skip it. regex=False treats the user's text
            # as a literal substring.
            location_mask = results_df['location'].str.contains(candidate_loc, na=False, regex=False)
            results_df.loc[location_mask, 'Score'] += location_boost

        max_stipend = results_df['stipend'].max()
        if stipend_boost > 0 and max_stipend > 0:
            results_df['Score'] += (results_df['stipend'] / max_stipend) * stipend_boost

        top_recommendations = results_df.sort_values(by='Score', ascending=False).head(top_n)

        display_df = pd.DataFrame()
        display_df['Title'] = top_recommendations['internship_title']
        display_df['Company'] = top_recommendations['company_name']
        display_df['Location'] = top_recommendations['location']
        display_df['Stipend (₹)'] = top_recommendations['stipend']
        display_df['Relevance Score'] = top_recommendations['Score']
        display_df['Reason'] = top_recommendations['internship_title'].apply(lambda title: self._get_match_reason(candidate_skills, title))

        return display_df.style.background_gradient(cmap='Greens', subset=['Relevance Score']) \
                           .format({'Relevance Score': '{:.3f}', 'Stipend (₹)': '{:,.0f}'}) \
                           .set_properties(**{'text-align': 'left'}) \
                           .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])

    def recommend_interactive(self):
        """Creates and displays an interactive widget for getting recommendations."""
        skills_widget = widgets.Text(value='python machine learning data science', description='Your Skills:', layout={'width': '500px'})
        location_widget = widgets.Text(value='delhi', description='Location:', layout={'width': '500px'})
        top_n_widget = widgets.IntSlider(value=5, min=1, max=15, description='Top N:')
        loc_boost_widget = widgets.FloatSlider(value=0.1, min=0, max=0.5, step=0.05, description='Location Boost:')
        stipend_boost_widget = widgets.FloatSlider(value=0.05, min=0, max=0.5, step=0.05, description='Stipend Boost:')
        button = widgets.Button(description="Get Recommendations", button_style='success')
        output = widgets.Output()

        def on_button_click(b):
            # Everything displayed inside this context is routed to the Output widget.
            with output:
                clear_output(wait=True)
                candidate_profile = {
                    'Skills': skills_widget.value,
                    'LocationPreference': location_widget.value
                }
                results = self.recommend(
                    candidate_profile,
                    top_n=top_n_widget.value,
                    location_boost=loc_boost_widget.value,
                    stipend_boost=stipend_boost_widget.value
                )
                display(results)

        button.on_click(on_button_click)
        display(widgets.VBox([skills_widget, location_widget, top_n_widget, loc_boost_widget, stipend_boost_widget, button, output]))

In [11]:
# The name of your dataset file
dataset_path = "internship.csv"

try:
    # Only the file read is guarded, so a FileNotFoundError raised anywhere
    # else is not misreported as a missing dataset.
    internship_df = pd.read_csv(dataset_path)
except FileNotFoundError:
    print(f"❌ Error: The file '{dataset_path}' was not found.")
    print("Please make sure your CSV file is in the same folder as this Jupyter Notebook.")
else:
    print(f"📄 Dataset '{dataset_path}' loaded successfully. Shape: {internship_df.shape}")

    # Create an instance of the recommender with your data
    recommender = InternshipRecommender(internship_df)

📄 Dataset 'internship.csv' loaded successfully. Shape: (6485, 6)
Preprocessing data...
✅ Stipend column processed successfully.
Building TF-IDF matrix...
✅ Recommender is ready to use.


In [12]:
# Build the input widgets (skills, location, sliders, button) and display them
# in the notebook; clicking the button renders the recommendations below them.
recommender.recommend_interactive()

VBox(children=(Text(value='python machine learning data science', description='Your Skills:', layout=Layout(wi…