In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pickle
from typing import List, Dict, Tuple
import re
import warnings
warnings.filterwarnings('ignore')

class ContentBasedRecommendationModel:
    """
    Content-Based Filtering for Internship Recommendations
    Uses TF-IDF and feature engineering for accurate matching
    """

    def __init__(self):
        # Text vectorizers
        self.title_vectorizer = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(1, 2))
        self.description_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english', ngram_range=(1, 2))
        self.skills_vectorizer = TfidfVectorizer(max_features=800, ngram_range=(1, 2))
        self.company_vectorizer = TfidfVectorizer(max_features=200)

        # Feature matrices
        self.internship_features = None
        self.student_profiles = {}
        self.content_similarity_matrix = None

        # Scalers
        self.scaler = StandardScaler()

        self.is_trained = False

    def preprocess_text(self, text: str) -> str:
        """
        Clean and preprocess text data
        """
        if pd.isna(text):
            return ""

        text = str(text).lower()
        # Remove special characters but keep spaces
        text = re.sub(r'[^a-zA-Z0-9\s+#]', ' ', text)
        # Remove extra whitespace
        text = ' '.join(text.split())

        return text

    def extract_internship_features(self, internships_df: pd.DataFrame) -> np.ndarray:
        """
        Extract and vectorize features from internship data
        """
        print("Extracting internship features...")

        # Preprocess text fields
        internships_df['clean_title'] = internships_df['title'].apply(self.preprocess_text)
        internships_df['clean_description'] = internships_df['description'].apply(self.preprocess_text)
        internships_df['clean_skills'] = internships_df['required_skills'].apply(self.preprocess_text)
        internships_df['clean_company'] = internships_df['company'].apply(self.preprocess_text)

        # Vectorize text features
        title_features = self.title_vectorizer.fit_transform(internships_df['clean_title'])
        description_features = self.description_vectorizer.fit_transform(internships_df['clean_description'])
        skills_features = self.skills_vectorizer.fit_transform(internships_df['clean_skills'])
        company_features = self.company_vectorizer.fit_transform(internships_df['clean_company'])

        # Numerical features
        numerical_features = []
        for _, row in internships_df.iterrows():
            features = [
                float(row.get('duration_months', 0)),
                float(row.get('stipend', 0)),
                float(row.get('company_rating', 0)),
                float(row.get('difficulty_level', 0)),
                len(str(row.get('required_skills', '')).split(',')),
                1 if str(row.get('remote_option', '')).lower() == 'yes' else 0,
                1 if str(row.get('type', '')).lower() == 'paid' else 0,
            ]
            numerical_features.append(features)

        numerical_features = np.array(numerical_features)

        # Scale numerical features
        numerical_features_scaled = self.scaler.fit_transform(numerical_features)

        # Combine all features
        combined_features = np.hstack([
            title_features.toarray(),
            description_features.toarray(),
            skills_features.toarray(),
            company_features.toarray(),
            numerical_features_scaled
        ])

        print(f"Internship features shape: {combined_features.shape}")
        return combined_features

    def create_student_profile(self, student_data: Dict) -> np.ndarray:
        """
        Create feature vector for a student based on their profile
        """
        # Combine student text data
        student_text = {
            'title': f"{student_data.get('field', '')} {student_data.get('specialization', '')}",
            'description': f"{student_data.get('interests', '')} {student_data.get('career_goals', '')}",
            'skills': student_data.get('skills', ''),
            'company': student_data.get('preferred_companies', '')
        }

        # Preprocess
        clean_student_text = {k: self.preprocess_text(v) for k, v in student_text.items()}

        # Vectorize using trained vectorizers
        title_vec = self.title_vectorizer.transform([clean_student_text['title']])
        description_vec = self.description_vectorizer.transform([clean_student_text['description']])
        skills_vec = self.skills_vectorizer.transform([clean_student_text['skills']])
        company_vec = self.company_vectorizer.transform([clean_student_text['company']])

        # Numerical features
        numerical_features = np.array([[
            float(student_data.get('preferred_duration', 0)),
            float(student_data.get('expected_stipend', 0)),
            float(student_data.get('gpa', 0)) / 10.0 * 5,  # Convert to 0-5 scale
            float(student_data.get('year', 0)),
            len(str(student_data.get('skills', '')).split(',')),
            1 if str(student_data.get('remote_preference', '')).lower() == 'yes' else 0,
            1 if str(student_data.get('paid_preference', '')).lower() == 'yes' else 0,
        ]])

        # Scale numerical features
        numerical_features_scaled = self.scaler.transform(numerical_features)

        # Combine all features
        student_vector = np.hstack([
            title_vec.toarray(),
            description_vec.toarray(),
            skills_vec.toarray(),
            company_vec.toarray(),
            numerical_features_scaled
        ])

        return student_vector.flatten()

    def calculate_skill_similarity(self, student_skills: str, required_skills: str) -> float:
        """
        Calculate detailed skill similarity with weighted matching
        """
        if pd.isna(student_skills) or pd.isna(required_skills):
            return 0.0

        student_skill_list = [skill.strip().lower() for skill in str(student_skills).split(',')]
        required_skill_list = [skill.strip().lower() for skill in str(required_skills).split(',')]

        if not required_skill_list:
            return 1.0

        # Exact matches
        exact_matches = len(set(student_skill_list) & set(required_skill_list))

        # Partial matches (substring matching)
        partial_matches = 0
        for req_skill in required_skill_list:
            for student_skill in student_skill_list:
                if req_skill in student_skill or student_skill in req_skill:
                    if req_skill not in set(required_skill_list) & set(student_skill_list):
                        partial_matches += 0.5
                        break

        total_score = exact_matches + partial_matches
        max_possible_score = len(required_skill_list)

        return min(1.0, total_score / max_possible_score)

    def calculate_location_preference_score(self, student_location: str, student_preferences: str,
                                          internship_location: str) -> float:
        """
        Calculate location preference score
        """
        if pd.isna(internship_location):
            return 0.5  # Neutral if location not specified

        student_loc = str(student_location).lower() if not pd.isna(student_location) else ""
        intern_loc = str(internship_location).lower()
        preferences = str(student_preferences).lower() if not pd.isna(student_preferences) else ""

        # Exact city match
        if student_loc and student_loc in intern_loc:
            return 1.0

        # Preference match
        if preferences and any(pref.strip() in intern_loc for pref in preferences.split(',')):
            return 0.9

        # Same state (if available)
        if student_loc and len(student_loc.split(',')) > 1 and len(intern_loc.split(',')) > 1:
            student_state = student_loc.split(',')[1].strip()
            intern_state = intern_loc.split(',')[1].strip()
            if student_state == intern_state:
                return 0.7

        return 0.3  # Different location

    def train(self, internships_df: pd.DataFrame, students_df: pd.DataFrame):
        """
        Train the content-based recommendation model
        """
        print("Training content-based recommendation model...")

        # Extract internship features
        self.internship_features = self.extract_internship_features(internships_df)
        self.internships_df = internships_df.copy()

        # Calculate content similarity matrix for internships
        self.content_similarity_matrix = cosine_similarity(self.internship_features)

        print(f"Content similarity matrix shape: {self.content_similarity_matrix.shape}")

        self.is_trained = True
        print("Content-based model training completed!")

    def get_recommendations(self, student_data: Dict, n_recommendations: int = 10,
                          filters: Dict = None) -> List[Dict]:
        """
        Get personalized recommendations for a student
        """
        if not self.is_trained:
            raise ValueError("Model must be trained before getting recommendations!")

        # Create student profile vector
        student_vector = self.create_student_profile(student_data)

        # Calculate similarity with all internships
        similarities = cosine_similarity([student_vector], self.internship_features).flatten()

        # Apply additional scoring factors
        enhanced_scores = []

        for idx, similarity in enumerate(similarities):
            internship = self.internships_df.iloc[idx]

            # Base content similarity
            base_score = similarity

            # Skill similarity boost
            skill_sim = self.calculate_skill_similarity(
                student_data.get('skills', ''),
                internship.get('required_skills', '')
            )

            # Location preference boost
            location_score = self.calculate_location_preference_score(
                student_data.get('location', ''),
                student_data.get('location_preferences', ''),
                internship.get('location', '')
            )

            # Combine scores with weights
            final_score = (
                0.5 * base_score +
                0.3 * skill_sim +
                0.2 * location_score
            )

            enhanced_scores.append((idx, final_score, skill_sim, location_score))

        # Sort by final score
        enhanced_scores.sort(key=lambda x: x[1], reverse=True)

        # Apply filters if provided
        if filters:
            enhanced_scores = self.apply_filters(enhanced_scores, filters)

        # Format recommendations
        recommendations = []
        for idx, final_score, skill_sim, location_score in enhanced_scores[:n_recommendations]:
            internship = self.internships_df.iloc[idx]

            rec = {
                'internship_id': internship['internship_id'],
                'title': internship['title'],
                'company': internship['company'],
                'location': internship.get('location', 'Not specified'),
                'content_similarity': round(final_score * 100, 1),
                'skill_match': round(skill_sim * 100, 1),
                'location_match': round(location_score * 100, 1),
                'duration': internship.get('duration_months', 'N/A'),
                'stipend': internship.get('stipend', 'N/A'),
                'required_skills': internship.get('required_skills', ''),
                'description': internship.get('description', '')[:200] + '...'
            }
            recommendations.append(rec)

        return recommendations

    def apply_filters(self, scored_internships: List[Tuple], filters: Dict) -> List[Tuple]:
        """
        Apply filters to recommendations
        """
        filtered_internships = []

        for idx, score, skill_sim, location_score in scored_internships:
            internship = self.internships_df.iloc[idx]

            # Apply filters
            if filters.get('min_stipend') and internship.get('stipend', 0) < filters['min_stipend']:
                continue

            if filters.get('max_duration') and internship.get('duration_months', 0) > filters['max_duration']:
                continue

            if filters.get('location') and filters['location'].lower() not in str(internship.get('location', '')).lower():
                continue

            if filters.get('company_type') and filters['company_type'].lower() not in str(internship.get('company', '')).lower():
                continue

            if filters.get('remote_only') and str(internship.get('remote_option', '')).lower() != 'yes':
                continue

            filtered_internships.append((idx, score, skill_sim, location_score))

        return filtered_internships

    def get_similar_internships(self, internship_id: int, n_similar: int = 5) -> List[Dict]:
        """
        Find similar internships based on content
        """
        if not self.is_trained:
            return []

        try:
            # Find internship index
            internship_idx = self.internships_df[self.internships_df['internship_id'] == internship_id].index[0]

            # Get similarities
            similarities = self.content_similarity_matrix[internship_idx]

            # Get top similar internships (excluding itself)
            similar_indices = similarities.argsort()[::-1][1:n_similar+1]

            similar_internships = []
            for idx in similar_indices:
                internship = self.internships_df.iloc[idx]
                similar_internships.append({
                    'internship_id': internship['internship_id'],
                    'title': internship['title'],
                    'company': internship['company'],
                    'similarity': round(similarities[idx] * 100, 1)
                })

            return similar_internships

        except Exception as e:
            print(f"Error finding similar internships: {e}")
            return []

    def save_model(self, filepath: str):
        """Save the trained model"""
        with open(filepath, 'wb') as f:
            pickle.dump(self, f)
        print(f"Content-based model saved to {filepath}")

    @classmethod
    def load_model(cls, filepath: str):
        """Load a trained model"""
        with open(filepath, 'rb') as f:
            model = pickle.load(f)
        return model

# Example usage
if __name__ == "__main__":
    # Initialize model
    content_model = ContentBasedRecommendationModel()

    # Example student profile
    student_profile = {
        'skills': 'Python, Machine Learning, React, SQL',
        'interests': 'Artificial Intelligence, Web Development',
        'field': 'Computer Science',
        'location': 'Bangalore, Karnataka',
        'location_preferences': 'Bangalore, Hyderabad, Chennai',
        'gpa': 8.5,
        'year': 3,
        'preferred_duration': 6,
        'expected_stipend': 20000,
        'remote_preference': 'no',
        'career_goals': 'Machine Learning Engineer'
    }

    # Example filters
    filters = {
        'min_stipend': 15000,
        'max_duration': 6,
        'location': 'Bangalore'
    }

    # After training:
    # recommendations = content_model.get_recommendations(student_profile, n_recommendations=5, filters=filters)
    # print("Content-based recommendations:", recommendations)

In [None]:
import pandas as pd


internships_df = pd.read_csv('internship_data.csv')  # make sure the filename is correct


print(internships_df.head())




   Internship Id                                        Role  \
0        2456465     Business Development (Sales) Internship   
1        2456478             Human Resources (HR) Internship   
2        2452185  Content & E-Commerce Management Internship   
3        2450936               Project Management Internship   
4        2450882                Digital Marketing Internship   

                                       Company Name  \
0                        Madbrains Technologies LLP   
1                    Jobs Flash Consulting Services   
2                                    Fall For Flora   
3  Special Situation Advisors India Private Limited   
4  Special Situation Advisors India Private Limited   

                                         Location  Duration  \
0                         ('Chandigarh, Mohali',)  4 Months   
1                                    ('Gurgaon',)  6 Months   
2           ('Faridabad, Delhi, Gurgaon, Noida',)  4 Months   
3  ('Thane, Navi Mumbai, Mumbai,

In [None]:



print(internships_df.columns.tolist())


['Internship Id', 'Role', 'Company Name', 'Location', 'Duration', 'Stipend', 'Intern Type', 'Skills', 'Perks', 'Hiring Since', 'Opportunity Date', 'Opening', 'Hired Candidate', 'Number of Applications', 'Website Link']


In [None]:
internships_df.rename(columns={
    'Internship Id': 'internship_id',
    'Role': 'title',
    'Company Name': 'company',
    'Location': 'location',
    'Duration': 'duration_months',
    'Stipend': 'stipend',
    'Intern Type': 'type',
    'Skills': 'required_skills'
}, inplace=True)


In [None]:
def train(self, internships_df: pd.DataFrame, students_df: pd.DataFrame = None):
    ...


In [None]:
# Remove non-numeric characters and convert to float
internships_df['duration_months'] = internships_df['duration_months'].astype(str).str.extract('(\d+)')
internships_df['duration_months'] = internships_df['duration_months'].astype(float)


In [None]:
internships_df['stipend'] = internships_df['stipend'].astype(str).str.replace('[^\d.]', '', regex=True)
internships_df['stipend'] = internships_df['stipend'].replace('', '0').astype(float)


In [None]:
# Minimal fix: fill missing text columns with a placeholder
internships_df['title'] = internships_df['title'].fillna('no_title')
internships_df['description'] = internships_df.get('description', pd.Series(['no_description']*len(internships_df)))
internships_df['required_skills'] = internships_df['required_skills'].fillna('no_skills')
internships_df['company'] = internships_df['company'].fillna('no_company')




In [None]:
# Remove rows that still have empty strings after fill (optional, for safety)
internships_df = internships_df[(internships_df['title'] != '') &
                                (internships_df['description'] != '') &
                                (internships_df['required_skills'] != '')]


In [None]:

content_model.train(internships_df, students_df=None)


Training content-based recommendation model...
Extracting internship features...
Internship features shape: (6642, 1508)
Content similarity matrix shape: (6642, 6642)
Content-based model training completed!


In [None]:
# 1. Define student profile
student_profile = {
    'skills': 'Python, Machine Learning, React, SQL',
    'interests': 'Artificial Intelligence, Web Development',
    'field': 'Computer Science',
    'location': 'Bangalore, Karnataka',
    'location_preferences': 'Bangalore, Hyderabad, Chennai',
    'gpa': 8.5,
    'year': 3,
    'preferred_duration': 6,
    'expected_stipend': 20000,
    'remote_preference': 'no',
    'career_goals': 'Machine Learning Engineer'
}

# 2. Get top 5 recommendations
recommendations = content_model.get_recommendations(student_profile, n_recommendations=4)

# 3. Print recommendations
for rec in recommendations:
    print(rec)


{'internship_id': np.int64(2460722), 'title': 'Agri-Tech - Yield Forecasting Using GIS Data Internship', 'company': 'One Root', 'location': "('Bangalore',)", 'content_similarity': np.float64(35.4), 'skill_match': 33.3, 'location_match': 90.0, 'duration': np.float64(6.0), 'stipend': np.float64(10000.0), 'required_skills': "['Image Processing', 'Machine Learning', 'Python']", 'description': 'no_description...'}
{'internship_id': np.int64(2456431), 'title': 'PHP Development Internship', 'company': 'Kramah Software India Private Limited', 'location': "('Bangalore', 'Bangalore', 'Bangalore', 'Bangalore')", 'content_similarity': np.float64(32.4), 'skill_match': 25.0, 'location_match': 90.0, 'duration': np.float64(6.0), 'stipend': np.float64(5000.0), 'required_skills': "['HTML', 'Machine Learning', 'MySQL', 'PHP']", 'description': 'no_description...'}
{'internship_id': np.int64(2461258), 'title': 'Product Analyst Internship', 'company': 'Grapevine', 'location': "('Bangalore',)", 'content_simi