In [24]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import warnings
import pickle
import os
from tqdm import tqdm
warnings.filterwarnings('ignore')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Text preprocessing
def preprocess_text(text):

    if not isinstance(text, str) or not text:
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove special characters and replace with space
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Basic stopwords
STOP_WORDS = set(stopwords.words('english'))
CS_BASIC_STOPWORDS = {
    'job', 'jobs', 'looking', 'work', 'position', 'positions', 'company', 'companies',
    'experience', 'programming', 'code', 'coding', 'developer', 'development', 'software',
    'career', 'careers', 'role', 'roles', 'opportunity', 'opportunities', 'hiring',
    'apply', 'application', 'interview', 'requirement', 'requirements', 'skill', 'skills',
    'year', 'years', 'month', 'months', 'week', 'weeks', 'day', 'days',
    'like', 'want', 'need', 'would', 'could', 'should', 'may', 'might', 'must',
    'know', 'get', 'got', 'go', 'going', 'think', 'thought', 'see', 'look', 'help'
}
STOP_WORDS.update(CS_BASIC_STOPWORDS)

# Tokenize and lemmatize
def tokenize_and_lemmatize(text):

    lemmatizer = WordNetLemmatizer()
    
    # Tokenize
    tokens = nltk.word_tokenize(text)
    
    # Lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(token) for token in tokens 
              if token not in STOP_WORDS and len(token) > 2]
    
    return tokens

# This class holds preprocessed data and analysis results, separated from the recommender logic
class RedditJobCorpusAnalysis:
    def __init__(self, df):

        self.df = df.copy()
        self.vectorizer = None
        self.tfidf_matrix = None
        self.term_importance = {}
        self.common_bigrams = {}
        self.topics = {}
        self.additional_stopwords = set()
        self._preprocess_data()
        self._analyze_corpus()
        self._create_tfidf_matrix()
    def _preprocess_data(self):
        
        print("Preprocessing corpus text...")
        # Create combined text field (title + selftext)
        self.df['combined_text'] = self.df['title'].fillna('') + ' ' + self.df['selftext'].fillna('')
        # Preprocess text with progress bar
        print("Processing text fields...")
        self.df['processed_text'] = [preprocess_text(text) for text in tqdm(self.df['combined_text'], desc="Preprocessing texts")]
    
    def _analyze_corpus(self):
        
        print("Analyzing corpus to identify important terms...")
        
        # Extract the processed text
        corpus = self.df['processed_text'].tolist()
        
        # Find additional stopwords based on high document frequency
        print("Vectorizing corpus for stopword analysis...")
        count_vec = CountVectorizer(min_df=5)
        count_matrix = count_vec.fit_transform(corpus)
        count_features = count_vec.get_feature_names_out()
        
        # Calculate document frequency for each term
        print("Calculating document frequencies...")
        doc_freq = np.array((count_matrix > 0).sum(axis=0)).flatten()
        doc_freq_percent = doc_freq / len(corpus)
        
        # Terms that appear in more than 70% of documents might be domain-specific stopwords
        potential_stopwords = {count_features[i] for i in range(len(count_features)) 
                               if doc_freq_percent[i] > 0.7}
        
        # TF-IDF scores
        print("Creating TF-IDF representation for stopword filtering...")
        tfidf_vec = TfidfVectorizer(min_df=5, max_df=0.95)
        tfidf_matrix = tfidf_vec.fit_transform(corpus)
        tfidf_features = tfidf_vec.get_feature_names_out()
        
        # Calculate average TF-IDF for each term
        avg_tfidf = np.array(tfidf_matrix.mean(axis=0)).flatten()
        print("Filtering potential stopwords...")
        for term in tqdm(potential_stopwords, desc="Analyzing stopwords"):
            if term in tfidf_features:
                idx = list(tfidf_features).index(term)
                if avg_tfidf[idx] < 0.01:  # Low information content
                    self.additional_stopwords.add(term)
        
        print(f"Identified {len(self.additional_stopwords)} additional domain-specific stopwords")
        
        # Update stopwords
        global STOP_WORDS
        STOP_WORDS.update(self.additional_stopwords)
        
        # Extract bigrams (phrases)
        print("Extracting common bigrams...")
        bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=5)
        bigram_matrix = bigram_vectorizer.fit_transform(corpus)
        bigram_features = bigram_vectorizer.get_feature_names_out()
        
        # Calculate bigram frequency
        bigram_freq = np.array(bigram_matrix.sum(axis=0)).flatten()
        
        # Get top bigrams
        top_bigram_indices = bigram_freq.argsort()[-300:][::-1]  # Top 300 bigrams
        self.common_bigrams = {}
        print("Organizing top bigrams...")
        for i in tqdm(top_bigram_indices, desc="Processing bigrams"):
            self.common_bigrams[bigram_features[i]] = bigram_freq[i]
        
        print(f"Extracted {len(self.common_bigrams)} significant bigrams")
        
        # Topic modeling using NMF
        print("Performing topic modeling...")
        nmf_model = NMF(n_components=10, random_state=42)
        nmf_topics = nmf_model.fit_transform(tfidf_matrix)
        
        # Get top terms for each topic
        print("Extracting topics...")
        for topic_idx in tqdm(range(len(nmf_model.components_)), desc="Processing topics"):
            topic = nmf_model.components_[topic_idx]
            top_terms_idx = topic.argsort()[-20:][::-1]  # Top 20 terms
            top_terms = [tfidf_features[i] for i in top_terms_idx]
            self.topics[f"Topic {topic_idx+1}"] = top_terms
        
        # Calculate TF-IDF importance for each term
        print("Calculating term importance...")
        self.term_importance = {}
        for i, term in enumerate(tqdm(tfidf_features, desc="Processing terms")):
            self.term_importance[term] = avg_tfidf[i]
        
        # Sort by importance
        self.term_importance = dict(sorted(self.term_importance.items(), 
                                           key=lambda x: x[1], 
                                           reverse=True))
        
        print("Corpus analysis complete!")
    
    def _create_tfidf_matrix(self):

        # Create a corpus of processed documents
        corpus = self.df['processed_text'].tolist()
        
        # Initialize and fit TF-IDF vectorizer
        print("Creating TF-IDF vectors...")
        self.vectorizer = TfidfVectorizer(
            tokenizer=tokenize_and_lemmatize,
            min_df=2,
            max_df=0.90,
            max_features=5000,
            ngram_range=(1, 2)
        )
        print("Fitting TF-IDF vectorizer (this may take a while)...")
        self.tfidf_matrix = self.vectorizer.fit_transform(tqdm(corpus, desc="Vectorizing documents"))
        feature_names = self.vectorizer.get_feature_names_out()
        
        print(f"Vocabulary size: {len(feature_names)}")
        print(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")
    
    def get_top_terms(self, n=50):

        return list(self.term_importance.items())[:n]
    
    def get_topics(self):

        return self.topics


# Base recommender class that uses an existing corpus analysis
class DataDrivenRedditJobRecommender:
    def __init__(self, corpus_analysis):

        self.df = corpus_analysis.df
        self.vectorizer = corpus_analysis.vectorizer
        self.tfidf_matrix = corpus_analysis.tfidf_matrix
        self.term_importance = corpus_analysis.term_importance
        self.common_bigrams = corpus_analysis.common_bigrams
        self.topics = corpus_analysis.topics
    
    def expand_query(self, query_tokens):

        expanded_tokens = query_tokens.copy()
        for token in query_tokens:
            # Check if token appears in any topic
            for topic, terms in self.topics.items():
                if token in terms:
                    # Add some related terms from the same topic
                    related_terms = [t for t in terms[:10] if t != token]
                    expanded_tokens.extend(related_terms[:3])  # Add up to 3 related terms
        
        # Weight tokens by importance
        weighted_tokens = []
        for token in expanded_tokens:
            # If token is in term_importance, weight it accordingly
            if token in self.term_importance:
                weight = min(5, int(self.term_importance[token] * 100) + 1)
                weighted_tokens.extend([token] * weight)
            else:
                weighted_tokens.append(token)
        
        # Add common bigrams that contain query tokens
        for bigram, freq in self.common_bigrams.items():
            token1, token2 = bigram.split()
            if token1 in query_tokens or token2 in query_tokens:
                # Add the bigram with a weight based on frequency
                weight = min(3, int(np.log10(freq + 1)) + 1)
                weighted_tokens.extend([bigram] * weight)
        
        return weighted_tokens
    
    def recommend(self, query, top_n=10):

        # Preprocess the query
        processed_query = preprocess_text(query)
        
        # Tokenize and lemmatize
        query_tokens = tokenize_and_lemmatize(processed_query)
        
        if not query_tokens:
            print("Query is too short or only contains stopwords.")
            return pd.DataFrame(columns=['id', 'title', 'selftext', 'subreddit', 'score', 'url', 'relevance_score'])
        
        # Print tokens for debugging
        print(f"Query tokens: {query_tokens}")
        
        # Expand the query with related terms
        print("Expanding query with related terms...")
        expanded_query_tokens = self.expand_query(query_tokens)
        
        # Convert the expanded query tokens back to a string
        expanded_query = ' '.join(expanded_query_tokens)
        
        # Vectorize the query
        print("Vectorizing query...")
        query_vector = self.vectorizer.transform([expanded_query])
        
        # Calculate cosine similarity between query and all posts
        print("Calculating similarities to corpus documents...")
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        
        # Add similarity scores
        temp_df = self.df.copy()
        temp_df['relevance_score'] = cosine_similarities
        
        # Get top N recommendations
        print(f"Finding top {top_n} most relevant posts...")
        recommendations = temp_df.sort_values('relevance_score', ascending=False).head(top_n)
        
        # Select columns to return
        result = recommendations[['id', 'title', 'selftext', 'subreddit', 'score', 'url', 'relevance_score']]
        
        # Round relevance score for readability
        result['relevance_score'] = result['relevance_score'].round(3)
        
        return result
    
    def search(self, query, top_n=10):

        recommendations = self.recommend(query, top_n)
        
        print(f"\nTop {top_n} Recommendations for query: {query}")
        print("=" * 80)
        
        if recommendations.empty:
            print("No relevant results found.")
            return recommendations
        
        for i, row in recommendations.iterrows():
            # Format the selftext to show a preview
            selftext = row['selftext'] if pd.notna(row['selftext']) else ""
            selftext_preview = selftext[:150] + "..." if len(str(selftext)) > 150 else selftext
            
            print(f"{i+1}. [{row['relevance_score']:.3f}] {row['title']}")
            print(f"   {selftext_preview}")
            print(f"   subreddit: r/{row['subreddit']}, score: {row['score']}")
            print(f"   URL: {row['url']}")
            print("-" * 80)
        
        return recommendations


def analyze_and_save_corpus(csv_file='merged_reddit_only.csv', output_file='corpus_analysis.pkl'):
    
    print(f"Loading data from {csv_file}...")
    df = pd.read_csv(csv_file)
    print("Initializing corpus analysis...")
    corpus_analysis = RedditJobCorpusAnalysis(df)
    
    print(f"Saving corpus analysis to {output_file}...")
    with open(output_file, 'wb') as f:
        pickle.dump(corpus_analysis, f)
    
    print("Analysis complete and saved!")
    return corpus_analysis

def load_corpus(corpus_file='corpus_analysis.pkl'):

    print(f"Loading corpus analysis from {corpus_file}...")
    with open(corpus_file, 'rb') as f:
        corpus_analysis = pickle.load(f)
    return corpus_analysis


# Example usage with corpus persistence
if __name__ == "__main__":
    corpus_file = 'corpus_analysis.pkl'
    
    # Check if saved analysis exists
    if os.path.exists(corpus_file):
        print(f"Loading existing corpus analysis from {corpus_file}...")
        with open(corpus_file, 'rb') as f:
            corpus_analysis = pickle.load(f)
        print("Corpus loaded successfully!")
    else:
        print("No saved corpus analysis found. Creating new one...")
        # Load data from CSV
        df = pd.read_csv('merged_reddit_only.csv')
        print(f"Loaded {len(df)} posts from CSV.")
        print("\nInitializing corpus analysis...")
        corpus_analysis = RedditJobCorpusAnalysis(df)
        
        # Save for future use
        print(f"Saving corpus analysis to {corpus_file}...")
        with open(corpus_file, 'wb') as f:
            pickle.dump(corpus_analysis, f)
        print("Corpus analysis saved successfully!")
    
    # Create recommender
    basic_recommender = DataDrivenRedditJobRecommender(corpus_analysis)
    
    # Example queries
    print("\n=== Testing Recommender ===")
    queries = [
        "I am a JHU graduate student in data science, looking for internships in machine learning",
        "Senior software engineer with React and Node.js experience",
        "Entry level Python developer positions in Baltimore"
    ]
    
    for i, query in enumerate(queries):
        print(f"\n\nTesting query {i+1}/{len(queries)}: {query}")
        recommendations = basic_recommender.search(query, top_n=5)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\23109\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\23109\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\23109\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading existing corpus analysis from corpus_analysis.pkl...
Corpus loaded successfully!

=== Testing Recommender ===


Testing query 1/3: I am a JHU graduate student in data science, looking for internships in machine learning
Query tokens: ['jhu', 'graduate', 'student', 'data', 'science', 'internship', 'machine', 'learning']
Expanding query with related terms...
Vectorizing query...
Calculating similarities to corpus documents...
Finding top 5 most relevant posts...

Top 5 Recommendations for query: I am a JHU graduate student in data science, looking for internships in machine learning
3596. [0.779] How to start Data Science and Machine Learning Career?
   So, what’s the buzz all about with Data Science and Machine Learning. Seems like every other developer is fascinated by these two terms. If you ask an...
   subreddit: r/ReviewNPrep, score: 4
   URL: https://www.reddit.com/r/ReviewNPrep/comments/r1lqul/how_to_start_data_science_and_machine_learning/
-------------------------------