In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Download NLTK resources (uncomment these lines first time you run the script)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Text preprocessing
def preprocess_text(text):
    """
    Clean and preprocess text data for NLP tasks.
    """
    if not isinstance(text, str) or not text:
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove special characters and replace with space
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Basic stopwords - we'll enhance these based on corpus analysis
STOP_WORDS = set(stopwords.words('english'))
CS_BASIC_STOPWORDS = {
    'job', 'jobs', 'looking', 'work', 'position', 'positions', 'company', 'companies',
    'experience', 'programming', 'code', 'coding', 'developer', 'development', 'software',
    'career', 'careers', 'role', 'roles', 'opportunity', 'opportunities', 'hiring',
    'apply', 'application', 'interview', 'requirement', 'requirements', 'skill', 'skills',
    'year', 'years', 'month', 'months', 'week', 'weeks', 'day', 'days',
    'like', 'want', 'need', 'would', 'could', 'should', 'may', 'might', 'must',
    'know', 'get', 'got', 'go', 'going', 'think', 'thought', 'see', 'look', 'help'
}
STOP_WORDS.update(CS_BASIC_STOPWORDS)

# Tokenize and lemmatize
def tokenize_and_lemmatize(text):
    """
    Tokenize and lemmatize text, removing stopwords.
    """
    lemmatizer = WordNetLemmatizer()
    
    # Tokenize
    tokens = nltk.word_tokenize(text)
    
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens 
              if token not in STOP_WORDS and len(token) > 2]
    
    return tokens

# This class holds preprocessed data and analysis results, separated from the recommender logic
class RedditJobCorpusAnalysis:
    def __init__(self, df):
        """
        Analyze a Reddit job post corpus to extract key insights.
        
        Parameters:
        -----------
        df : pandas.DataFrame
            DataFrame containing Reddit posts with columns like 'id', 'title', 'selftext', etc.
        """
        self.df = df.copy()
        self.vectorizer = None
        self.tfidf_matrix = None
        self.term_importance = {}
        self.common_bigrams = {}
        self.topics = {}
        self.additional_stopwords = set()
        
        # Run preprocessing and analysis
        self._preprocess_data()
        self._analyze_corpus()
        self._create_tfidf_matrix()
        
        # Print insights
        self._print_insights()
    
    def _preprocess_data(self):
        """
        Preprocess the data for analysis.
        """
        print("Preprocessing corpus text...")
        
        # Create combined text field (title + selftext)
        self.df['combined_text'] = self.df['title'].fillna('') + ' ' + self.df['selftext'].fillna('')
        
        # Preprocess text
        self.df['processed_text'] = self.df['combined_text'].apply(preprocess_text)
    
    def _analyze_corpus(self):
        """
        Analyze the corpus to identify important terms, common bigrams,
        and additional stopwords specific to this dataset.
        """
        print("Analyzing corpus to identify important terms...")
        
        # Extract the processed text
        corpus = self.df['processed_text'].tolist()
        
        # Find additional stopwords based on high document frequency
        count_vec = CountVectorizer(min_df=5)
        count_matrix = count_vec.fit_transform(corpus)
        count_features = count_vec.get_feature_names_out()
        
        # Calculate document frequency for each term
        doc_freq = np.array((count_matrix > 0).sum(axis=0)).flatten()
        doc_freq_percent = doc_freq / len(corpus)
        
        # Terms that appear in more than 70% of documents might be domain-specific stopwords
        potential_stopwords = {count_features[i] for i in range(len(count_features)) 
                               if doc_freq_percent[i] > 0.7}
        
        # But we should be careful not to remove important CS terms
        # So let's check their TF-IDF scores
        tfidf_vec = TfidfVectorizer(min_df=5, max_df=0.95)
        tfidf_matrix = tfidf_vec.fit_transform(corpus)
        tfidf_features = tfidf_vec.get_feature_names_out()
        
        # Calculate average TF-IDF for each term
        avg_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
        
        # Only add to stopwords if the average TF-IDF is low (not very informative)
        for term in potential_stopwords:
            if term in tfidf_features:
                idx = list(tfidf_features).index(term)
                if avg_tfidf[idx] < 0.01:  # Low information content
                    self.additional_stopwords.add(term)
        
        print(f"Identified {len(self.additional_stopwords)} additional domain-specific stopwords")
        
        # Update stopwords
        global STOP_WORDS
        STOP_WORDS.update(self.additional_stopwords)
        
        # Extract bigrams (phrases)
        print("Extracting common bigrams...")
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=5)
        bigram_matrix = bigram_vectorizer.fit_transform(corpus)
        bigram_features = bigram_vectorizer.get_feature_names_out()
        
        # Calculate bigram frequency
        bigram_freq = np.array(bigram_matrix.sum(axis=0)).flatten()
        
        # Get top bigrams
        top_bigram_indices = bigram_freq.argsort()[-300:][::-1]  # Top 300 bigrams
        self.common_bigrams = {bigram_features[i]: bigram_freq[i] for i in top_bigram_indices}
        
        print(f"Extracted {len(self.common_bigrams)} significant bigrams")
        
        # Topic modeling using NMF
        print("Performing topic modeling...")
        nmf_model = NMF(n_components=10, random_state=42)
        nmf_topics = nmf_model.fit_transform(tfidf_matrix)
        
        # Get top terms for each topic
        for topic_idx, topic in enumerate(nmf_model.components_):
            top_terms_idx = topic.argsort()[-20:][::-1]  # Top 20 terms
            top_terms = [tfidf_features[i] for i in top_terms_idx]
            self.topics[f"Topic {topic_idx+1}"] = top_terms
        
        # Calculate TF-IDF importance for each term
        self.term_importance = {}
        for i, term in enumerate(tfidf_features):
            self.term_importance[term] = avg_tfidf[i]
        
        # Sort by importance
        self.term_importance = dict(sorted(self.term_importance.items(), 
                                           key=lambda x: x[1], 
                                           reverse=True))
        
        print("Corpus analysis complete!")
    
    def _create_tfidf_matrix(self):
        """
        Create TF-IDF matrix for the entire corpus.
        """
        # Create a corpus of processed documents
        corpus = self.df['processed_text'].tolist()
        
        # Initialize and fit TF-IDF vectorizer
        print("Creating TF-IDF vectors...")
        self.vectorizer = TfidfVectorizer(
            tokenizer=tokenize_and_lemmatize,
            min_df=2,             # Ignore terms that appear in less than 2 documents
            max_df=0.90,          # Ignore terms that appear in more than 90% of documents
            max_features=5000,    # Limit vocabulary size
            ngram_range=(1, 2)    # Use both unigrams and bigrams
        )
        
        # Create TF-IDF matrix
        self.tfidf_matrix = self.vectorizer.fit_transform(corpus)
        
        # Get feature names
        feature_names = self.vectorizer.get_feature_names_out()
        
        print(f"Vocabulary size: {len(feature_names)}")
        print(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")
    
    def _print_insights(self):
        """
        Print key insights from the corpus analysis.
        """
        print("Recommendation system ready!")
        
        # Print top terms for debugging
        top_terms = list(self.term_importance.items())[:30]
        print("Top 30 terms by importance:")
        for term, score in top_terms:
            print(f"  {term}: {score:.4f}")
        
        # Print example topics
        print("\nExample topics discovered:")
        for topic_name, terms in list(self.topics.items())[:3]:
            print(f"  {topic_name}: {', '.join(terms[:10])}")
    
    def get_top_terms(self, n=50):
        """
        Return the top n most important terms from the corpus.
        """
        return list(self.term_importance.items())[:n]
    
    def get_topics(self):
        """
        Return the topics discovered in the corpus.
        """
        return self.topics


# Base recommender class that uses an existing corpus analysis
class DataDrivenRedditJobRecommender:
    def __init__(self, corpus_analysis):
        """
        Initialize the recommender with a corpus analysis object.
        
        Parameters:
        -----------
        corpus_analysis : RedditJobCorpusAnalysis
            A complete corpus analysis with preprocessed data and TF-IDF matrix
        """
        self.df = corpus_analysis.df
        self.vectorizer = corpus_analysis.vectorizer
        self.tfidf_matrix = corpus_analysis.tfidf_matrix
        self.term_importance = corpus_analysis.term_importance
        self.common_bigrams = corpus_analysis.common_bigrams
        self.topics = corpus_analysis.topics
    
    def expand_query(self, query_tokens):
        """
        Expand the query with related terms based on corpus analysis.
        """
        expanded_tokens = query_tokens.copy()
        
        # Expand with topic-related terms
        for token in query_tokens:
            # Check if token appears in any topic
            for topic, terms in self.topics.items():
                if token in terms:
                    # Add some related terms from the same topic
                    related_terms = [t for t in terms[:10] if t != token]
                    expanded_tokens.extend(related_terms[:3])  # Add up to 3 related terms
        
        # Weight tokens by importance
        weighted_tokens = []
        for token in expanded_tokens:
            # If token is in term_importance, weight it accordingly
            if token in self.term_importance:
                weight = min(5, int(self.term_importance[token] * 100) + 1)
                weighted_tokens.extend([token] * weight)
            else:
                weighted_tokens.append(token)
        
        # Add common bigrams that contain query tokens
        for bigram, freq in self.common_bigrams.items():
            token1, token2 = bigram.split()
            if token1 in query_tokens or token2 in query_tokens:
                # Add the bigram with a weight based on frequency
                weight = min(3, int(np.log10(freq + 1)) + 1)
                weighted_tokens.extend([bigram] * weight)
        
        return weighted_tokens
    
    def recommend(self, query, top_n=10):
        """
        Recommend posts based on a query string.
        
        Parameters:
        -----------
        query : str
            The query string (can be a long sentence)
        top_n : int
            Number of recommendations to return
            
        Returns:
        --------
        pandas.DataFrame
            DataFrame with top_n recommendations and relevance scores
        """
        # Preprocess the query
        processed_query = preprocess_text(query)
        
        # Tokenize and lemmatize
        query_tokens = tokenize_and_lemmatize(processed_query)
        
        if not query_tokens:
            print("Warning: Query is too short or only contains stopwords.")
            return pd.DataFrame(columns=['id', 'title', 'selftext', 'subreddit', 'score', 'url', 'relevance_score'])
        
        # Print tokens for debugging
        print(f"Query tokens: {query_tokens}")
        
        # Expand the query with related terms
        expanded_query_tokens = self.expand_query(query_tokens)
        
        # Convert the expanded query tokens back to a string
        expanded_query = ' '.join(expanded_query_tokens)
        
        # Vectorize the query
        query_vector = self.vectorizer.transform([expanded_query])
        
        # Calculate cosine similarity between query and all posts
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        
        # Add similarity scores to a temporary dataframe (to avoid modifying the original)
        temp_df = self.df.copy()
        temp_df['relevance_score'] = cosine_similarities
        
        # Get top N recommendations
        recommendations = temp_df.sort_values('relevance_score', ascending=False).head(top_n)
        
        # Select columns to return
        result = recommendations[['id', 'title', 'selftext', 'subreddit', 'score', 'url', 'relevance_score']]
        
        # Round relevance score for readability
        result['relevance_score'] = result['relevance_score'].round(3)
        
        return result
    
    def search(self, query, top_n=10):
        """
        Alias for recommend method with formatted output for display.
        """
        recommendations = self.recommend(query, top_n)
        
        print(f"\nTop {top_n} Recommendations for query: {query}")
        print("=" * 80)
        
        if recommendations.empty:
            print("No relevant results found.")
            return recommendations
        
        for i, row in recommendations.iterrows():
            # Format the selftext to show a preview
            selftext = row['selftext'] if pd.notna(row['selftext']) else ""
            selftext_preview = selftext[:150] + "..." if len(str(selftext)) > 150 else selftext
            
            print(f"{i+1}. [{row['relevance_score']:.3f}] {row['title']}")
            print(f"   {selftext_preview}")
            print(f"   subreddit: r/{row['subreddit']}, score: {row['score']}")
            print(f"   URL: {row['url']}")
            print("-" * 80)
        
        return recommendations


# Enhanced recommender that considers score and recency
class EnhancedDataDrivenRecommender(DataDrivenRedditJobRecommender):
    def recommend(self, query, top_n=10, relevance_weight=0.7, score_weight=0.15, recency_weight=0.15):
        """
        Enhanced recommendation that considers relevance, post score, and recency.
        
        Parameters:
        -----------
        query : str
            The query string
        top_n : int
            Number of recommendations to return
        relevance_weight : float
            Weight given to text relevance (0-1)
        score_weight : float
            Weight given to post score (0-1)
        recency_weight : float
            Weight given to post recency (0-1)
            
        Returns:
        --------
        pandas.DataFrame
            DataFrame with top_n recommendations and scores
        """
        # Get base recommendations with higher initial pool
        initial_pool_size = min(top_n * 3, len(self.df))
        base_recommendations = super().recommend(query, initial_pool_size)
        
        if base_recommendations.empty:
            return base_recommendations
        
        # Normalize post score
        max_score = base_recommendations['score'].max()
        if max_score > 0:
            base_recommendations['score_normalized'] = base_recommendations['score'] / max_score
        else:
            base_recommendations['score_normalized'] = 0
        
        # Check if created_utc exists before trying to use it
        has_created_utc = 'created_utc' in base_recommendations.columns
        
        if has_created_utc:
            try:
                # Convert to datetime
                base_recommendations['created_dt'] = pd.to_datetime(base_recommendations['created_utc'], unit='s')
                
                max_date = base_recommendations['created_dt'].max()
                min_date = base_recommendations['created_dt'].min()
                date_range = (max_date - min_date).total_seconds()
                
                if date_range > 0:
                    base_recommendations['recency_score'] = base_recommendations.apply(
                        lambda x: (x['created_dt'] - min_date).total_seconds() / date_range, axis=1
                    )
                else:
                    base_recommendations['recency_score'] = 1.0
            except Exception as e:
                print(f"Warning: Could not process recency from created_utc: {e}")
                has_created_utc = False
        
        if not has_created_utc:
            # If no created_utc field or error processing it, don't use recency
            base_recommendations['recency_score'] = 0
            
            # Redistribute weights
            total = relevance_weight + score_weight
            relevance_weight = relevance_weight / total
            score_weight = score_weight / total
            recency_weight = 0
        
        # Calculate combined score
        base_recommendations['combined_score'] = (
            (relevance_weight * base_recommendations['relevance_score']) +
            (score_weight * base_recommendations['score_normalized']) +
            (recency_weight * base_recommendations['recency_score'])
        )
        
        # Get top N by combined score
        final_recommendations = base_recommendations.sort_values('combined_score', ascending=False).head(top_n)
        
        # Rename and round for output
        final_recommendations = final_recommendations.rename(columns={'combined_score': 'relevance_score'})
        final_recommendations['relevance_score'] = final_recommendations['relevance_score'].round(3)
        
        # Select columns to return
        cols = ['id', 'title', 'selftext', 'subreddit', 'score', 'url', 'relevance_score']
        result = final_recommendations[cols]
        
        return result


# Factory function to create both recommender types with shared analysis
def create_recommenders(df):
    """
    Factory function that creates both basic and enhanced recommenders,
    sharing the preprocessing and analysis work.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing Reddit posts
        
    Returns:
    --------
    dict
        Dictionary containing both recommender instances
    """
    print("\nInitializing corpus analysis (this only happens once)...")
    
    # Perform corpus analysis once
    corpus_analysis = RedditJobCorpusAnalysis(df)
    
    # Create both recommender types using the same analysis
    basic = DataDrivenRedditJobRecommender(corpus_analysis)
    enhanced = EnhancedDataDrivenRecommender(corpus_analysis)
    
    return {
        'basic': basic,
        'enhanced': enhanced
    }


# Example usage
if __name__ == "__main__":
    # Load data from CSV
    df = pd.read_csv('posts_job.csv')
    
    # Create both recommender types with shared analysis
    recommenders = create_recommenders(df)
    
    # Get the basic recommender
    basic_recommender = recommenders['basic']
    
    # Example queries
    print("\n=== Testing Basic Recommender ===")
    queries = [
        "I am a JHU graduate student in data science, looking for internships in machine learning",
        "Senior software engineer with React and Node.js experience",
        "Entry level Python developer positions in Baltimore"
    ]
    
    for query in queries:
        print(f"\n\nTesting query: {query}")
        recommendations = basic_recommender.search(query, top_n=5)
    
    # Use the enhanced recommender with the same corpus analysis
    print("\n\n=== Testing Enhanced Recommender ===")
    enhanced_recommender = recommenders['enhanced']
    recommendations = enhanced_recommender.search(queries[0], top_n=5)
    
    # Show key insights from the corpus
    print("\n\n=== Top Terms in the Corpus ===")
    corpus_analysis = recommenders['basic'].term_importance
    top_terms = list(corpus_analysis.items())[:20]
    for term, score in top_terms:
        print(f"{term}: {score:.4f}")
    
    print("\n=== Key Topics Discovered ===")
    topics = recommenders['basic'].topics
    for topic_name, terms in list(topics.items())[:5]:
        print(f"{topic_name}: {', '.join(terms[:10])}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\23109\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\23109\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\23109\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Initializing corpus analysis (this only happens once)...
Preprocessing corpus text...
Analyzing corpus to identify important terms...
Identified 0 additional domain-specific stopwords
Extracting common bigrams...
Extracted 300 significant bigrams
Performing topic modeling...
Corpus analysis complete!
Creating TF-IDF vectors...
Vocabulary size: 5000
TF-IDF matrix shape: (113411, 5000)
Recommendation system ready!
Top 30 terms by importance:
  and: 0.0523
  to: 0.0501
  the: 0.0442
  in: 0.0407
  for: 0.0371
  gesucht: 0.0338
  hiring: 0.0315
  engineer: 0.0281
  of: 0.0270
  job: 0.0264
  is: 0.0253
  you: 0.0219
  remote: 0.0216
  my: 0.0209
  usd: 0.0204
  it: 0.0195
  with: 0.0194
  at: 0.0178
  senior: 0.0171
  software: 0.0169
  or: 0.0166
  on: 0.0166
  jobs: 0.0166
  apply: 0.0164
  have: 0.0163
  more: 0.0155
  that: 0.0153
  me: 0.0139
  this: 0.0137
  are: 0.0136

Example topics discovered:
  Topic 1: to, my, the, it, and, in, but, have, that, of
  Topic 2: usd, engineer, ca,

TypeError: unsupported format string passed to Series.__format__