In [3]:
from ast import literal_eval
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

In [4]:
# Load the arXiv dataset from a local CSV file and preview its first rows.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on the original author's machine unless the path is edited.
df = pd.read_csv(r"C:\Users\adity\Downloads\arxiv_data.csv")
df.head()


Unnamed: 0,titles,summaries,terms
0,Survey on Semantic Stereo Matching / Semantic ...,Stereo matching is one of the widely used tech...,"['cs.CV', 'cs.LG']"
1,FUTURE-AI: Guiding Principles and Consensus Re...,The recent advancements in artificial intellig...,"['cs.CV', 'cs.AI', 'cs.LG']"
2,Enforcing Mutual Consistency of Hard Regions f...,"In this paper, we proposed a novel mutual cons...","['cs.CV', 'cs.AI']"
3,Parameter Decoupling Strategy for Semi-supervi...,Consistency training has proven to be an advan...,['cs.CV']
4,Background-Foreground Segmentation for Interio...,"To ensure safety in automated driving, the cor...","['cs.CV', 'cs.LG']"


In [13]:
# Research Paper Recommendation System
# This system recommends relevant research papers based on user queries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# First, install and download NLTK resources
# Run these commands once to download the necessary resources
import nltk

# Download the NLTK resources explicitly.
# nltk.download() reports most failures (no network, unknown package, ...) by
# returning False instead of raising, so the return values are checked rather
# than relying on the except clause alone.
NLTK_RESOURCES = ('punkt', 'stopwords', 'wordnet')
try:
    failed_resources = [
        resource for resource in NLTK_RESOURCES
        if not nltk.download(resource, quiet=True)
    ]
except Exception as e:
    print(f"Error downloading NLTK resources: {e}")
    print("Proceeding with a simplified preprocessing function...")
else:
    if failed_resources:
        print(f"Error downloading NLTK resources: {', '.join(failed_resources)}")
        print("Proceeding with a simplified preprocessing function...")
    else:
        print("NLTK resources downloaded successfully!")

# Load the dataset
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
df = pd.read_csv(r"C:\Users\adity\Downloads\arxiv_data.csv")
print(f"Dataset loaded with {len(df)} papers")

# Define a simpler preprocessing function that doesn't rely on NLTK resources
def simple_preprocess_text(text):
    """Normalize raw text for vectorization without using any NLTK resources.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, and then all runs of digits are removed.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            a NaN produced by an empty CSV cell) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    cleaned = text.lower()
    # Order matters only for readability: punctuation first, then digit runs.
    for pattern in (r'[^\w\s]', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Clean the title and abstract columns with the same normalizer.
print("Preprocessing data...")
df['processed_title'], df['processed_summary'] = (
    df[column].apply(simple_preprocess_text) for column in ('titles', 'summaries')
)

# A single text field (title, one space, summary) gives the vectorizer both
# sources of vocabulary for each paper.
df['combined_features'] = df['processed_title'].add(' ').add(df['processed_summary'])

# Extract terms as text (they appear to be in list format)
def extract_terms(term_str):
    """Flatten a stringified list of category terms into space-separated text.

    The ``terms`` column stores values such as ``"['cs.CV', 'cs.LG']"``. Such a
    value is parsed with ``ast.literal_eval`` (which only accepts Python
    literals, so cell content can never execute code the way ``eval`` would)
    and its items are joined with single spaces.

    Args:
        term_str: Raw cell value from the ``terms`` column.

    Returns:
        - ``''`` when ``term_str`` is not a string;
        - ``term_str`` unchanged when it is not wrapped in ``[...]``, or when
          it cannot be parsed into a list of strings;
        - otherwise the list items joined by a single space.
    """
    if not isinstance(term_str, str):
        return ''
    if not (term_str.startswith('[') and term_str.endswith(']')):
        return term_str

    try:
        return ' '.join(literal_eval(term_str))
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Best effort: unparseable input or non-string items fall back to the
        # raw text so the row still contributes something to the features.
        return term_str

# Flatten each paper's stringified term list into plain space-separated text.
df['terms_text'] = df['terms'].apply(extract_terms)

# Add terms to the features
# Final text per paper: cleaned title + cleaned summary + category terms.
df['features'] = df['combined_features'] + ' ' + df['terms_text']

# Create TF-IDF vectors
# The vectorizer is fitted on the paper texts here and reused later to
# transform user queries; max_features=5000 caps the vocabulary size.
print("Creating TF-IDF vectors...")
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['features'])
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Function to get paper recommendations
def get_paper_recommendations(user_query, top_n=5):
    """Rank the papers in ``df`` by TF-IDF cosine similarity to a query.

    The query is cleaned with ``simple_preprocess_text``, projected with the
    already-fitted ``tfidf_vectorizer`` and compared against every row of
    ``tfidf_matrix``.

    Args:
        user_query: Free-text description of the research interest.
        top_n: Number of papers to return.

    Returns:
        A 2-tuple ``(recommendations, scores)``: the ``titles``, ``summaries``
        and ``terms`` columns of the best-matching rows of ``df``, most similar
        first, and the matching similarity scores in the same order.
    """
    query_vector = tfidf_vectorizer.transform([simple_preprocess_text(user_query)])

    # One similarity value per paper, as a flat 1-D array.
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # argsort is ascending, so reverse it and keep the leading top_n positions.
    top_indices = similarity_scores.argsort()[::-1][:top_n]

    output_columns = ['titles', 'summaries', 'terms']
    return df.iloc[top_indices][output_columns], similarity_scores[top_indices]

# Print a formatted list of recommendations for a query
def recommend_papers(user_query, top_n=5):
    """Print a human-readable report of the best matches for a query.

    Args:
        user_query: Free-text description of the research interest.
        top_n: Number of recommendations to request.

    Returns:
        The recommendations DataFrame produced by ``get_paper_recommendations``.
    """
    recommendations, scores = get_paper_recommendations(user_query, top_n)

    print(f"\nRecommendations for query: '{user_query}'\n")

    divider = "-" * 80
    ranked_rows = zip(scores, recommendations.iterrows())
    for rank, (score, (_, paper)) in enumerate(ranked_rows, start=1):
        print(f"Recommendation {rank}: (Similarity: {score:.4f})")
        print(f"Title: {paper['titles']}")
        # Only the first 200 characters of the abstract are shown.
        print(f"Summary: {paper['summaries'][:200]}...")
        print(f"Terms: {paper['terms']}")
        print(divider)

    return recommendations

# Test the recommendation system
# Smoke test: print the matches for one sample query using the default top_n.
# The returned DataFrame is discarded here; only the printed report is used.
print("\nTesting recommendation system...")
user_query = "deep learning for computer vision"
recommend_papers(user_query)

# Interactive recommendation function
def interactive_recommendation():
    """Run a prompt loop that prints recommendations until the user exits.

    Each round asks for a research interest and for how many recommendations
    to show, then delegates to ``recommend_papers``. Entering ``exit`` (any
    letter case, surrounding whitespace ignored) ends the loop.

    A count that is not an integer, or that is smaller than 1, falls back to
    the default of 5. Zero or negative counts are rejected because the ranking
    slice in ``get_paper_recommendations`` would otherwise return either
    nothing or almost the entire dataset.
    """
    default_count = 5
    while True:
        query = input("\nEnter your research interest (or 'exit' to quit): ")
        if query.strip().lower() == 'exit':
            break

        raw_count = input("How many recommendations do you want? (default: 5): ")
        try:
            n = int(raw_count)
        except ValueError:
            # Empty or non-numeric answer: use the advertised default.
            n = default_count
        if n < 1:
            n = default_count

        recommend_papers(query, n)

# Tell the reader how to start the interactive loop instead of starting it
# here; the call below is left commented out, so running this cell never
# blocks on input().
print("\nTo use interactive mode, run: interactive_recommendation()")
# interactive_recommendation()

NLTK resources downloaded successfully!
Dataset loaded with 51774 papers
Preprocessing data...
Creating TF-IDF vectors...
TF-IDF matrix shape: (51774, 5000)

Testing recommendation system...

Recommendations for query: 'deep learning for computer vision'

Recommendation 1: (Similarity: 0.5520)
Title: Are object detection assessment criteria ready for maritime computer vision?
Summary: Maritime vessels equipped with visible and infrared cameras can complement
other conventional sensors for object detection. However, application of
computer vision techniques in maritime domain receiv...
Terms: ['cs.CV']
--------------------------------------------------------------------------------
Recommendation 2: (Similarity: 0.5520)
Title: Are object detection assessment criteria ready for maritime computer vision?
Summary: Maritime vessels equipped with visible and infrared cameras can complement
other conventional sensors for object detection. However, application of
computer vision techniques in 

In [19]:
# Evaluate with synthetic queries - simplified version
def evaluate_model_with_synthetic_queries(num_samples=50, top_n=10, seed=None):
    """Estimate retrieval quality using queries synthesized from paper titles.

    A random sample of papers is drawn from ``df``. For each one, up to three
    random words of its title form a query, and the function checks whether
    the paper itself appears among the ``top_n`` recommendations.

    Args:
        num_samples: Maximum number of papers to sample (capped at ``len(df)``).
        top_n: Number of recommendations requested per query.
        seed: Optional integer seed for reproducible sampling. With ``None``
            the global ``np.random`` state is used, as before.

    Returns:
        A dict with:
        - ``hit_rate``: fraction of queries whose source paper was retrieved;
        - ``mean_position``: mean 1-based rank of the source paper over the
          queries where it was found (NaN when it was never found);
        - ``mean_similarity``: mean over queries of the mean similarity score;
        - ``all_positions``: per-query rank, ``top_n + 1`` meaning "not found";
        - ``all_similarities``: per-query mean similarity score.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    num_samples = min(num_samples, len(df))
    test_indices = rng.choice(len(df), num_samples, replace=False)
    test_df = df.iloc[test_indices]

    def extract_keywords(title, n=3):
        # Missing titles (e.g. NaN from an empty CSV cell) have no words to sample.
        if not isinstance(title, str):
            return ''
        words = title.split()
        if len(words) <= n:
            return title
        return ' '.join(rng.choice(words, n, replace=False))

    test_queries = test_df['titles'].apply(extract_keywords).tolist()

    def mean_or_nan(values):
        # np.mean([]) emits a RuntimeWarning; return NaN explicitly instead.
        return np.mean(values) if len(values) else float('nan')

    not_found_position = top_n + 1
    hit_rates = []          # 1 when the source paper was retrieved, else 0
    avg_positions = []      # 1-based rank of the source paper per query
    avg_similarities = []   # mean similarity of the returned papers per query

    print("Evaluating model with synthetic queries...")
    for original_idx, query in zip(test_indices, test_queries):
        # get_paper_recommendations returns (recommendations, scores). A
        # variant that also returns the ranked positions as a third item is
        # accepted too, which is what this function originally assumed.
        recommendations, similarity_scores, *extra = get_paper_recommendations(query, top_n=top_n)
        if extra:
            top_indices = np.asarray(extra[0])
        else:
            # Map the returned row labels back to positional indices so they
            # are comparable with test_indices (requires a unique df index).
            top_indices = df.index.get_indexer(recommendations.index)
        avg_similarities.append(mean_or_nan(similarity_scores))

        matches = np.flatnonzero(top_indices == original_idx)
        if matches.size:
            hit_rates.append(1)
            avg_positions.append(int(matches[0]) + 1)
        else:
            hit_rates.append(0)
            avg_positions.append(not_found_position)

    found_positions = [p for p in avg_positions if p <= top_n]

    return {
        'hit_rate': mean_or_nan(hit_rates),
        'mean_position': mean_or_nan(found_positions),
        'mean_similarity': mean_or_nan(avg_similarities),
        'all_positions': avg_positions,
        'all_similarities': avg_similarities
    }

# Visualize similarity distribution
def plot_similarity_distribution(similarities):
    """Save a histogram (with a smoothed density curve) of similarity scores.

    Implemented with matplotlib and numpy only, because ``sns`` is not
    imported by any visible cell of this notebook.

    Args:
        similarities: Iterable of numeric similarity scores. Non-finite values
            (NaN/inf) are ignored.

    Returns:
        The file name of the saved figure, ``'similarity_distribution.png'``.
    """
    values = np.asarray(similarities, dtype=float)
    values = values[np.isfinite(values)]

    fig, ax = plt.subplots(figsize=(10, 6))
    if values.size:
        _, bin_edges, _ = ax.hist(values, bins='auto', alpha=0.6, edgecolor='black')

        # Gaussian kernel density estimate with Scott's-rule bandwidth,
        # rescaled from density to counts so it overlays the histogram.
        spread = values.std(ddof=1) if values.size > 1 else 0.0
        if spread > 0:
            bandwidth = spread * values.size ** (-1 / 5)
            grid = np.linspace(bin_edges[0], bin_edges[-1], 200)
            z = (grid[:, None] - values[None, :]) / bandwidth
            density = np.exp(-0.5 * z ** 2).sum(axis=1) / (
                values.size * bandwidth * np.sqrt(2 * np.pi)
            )
            bin_width = bin_edges[1] - bin_edges[0]
            ax.plot(grid, density * values.size * bin_width)

    ax.set_title('Distribution of Similarity Scores')
    ax.set_xlabel('Similarity Score')
    ax.set_ylabel('Frequency')
    fig.savefig('similarity_distribution.png')
    plt.close(fig)
    return 'similarity_distribution.png'

# Visualize performance metrics
def plot_performance_metrics(eval_results):
    """Save a two-bar chart of hit rate and mean rank.

    Args:
        eval_results: Mapping with ``'hit_rate'`` and ``'mean_position'`` keys.

    Returns:
        The file name of the saved figure, ``'performance_metrics.png'``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    hit_rate = eval_results['hit_rate']
    mean_position = eval_results['mean_position']

    labels = ['Hit Rate', 'Mean Position\n(lower is better)']
    # The rank is capped at 10 and divided by 10 so both bars share a 0-1 axis.
    heights = [hit_rate, min(10, mean_position) / 10]

    ax.bar(labels, heights, color=['#5DA5DA', '#FAA43A'])
    ax.set_title('Model Performance Metrics')
    ax.set_ylabel('Score')
    ax.set_ylim(0, 1)

    # Annotate each bar with the raw (un-normalized) metric value.
    captions = (f"{hit_rate:.2f}", f"{mean_position:.1f}")
    for x_pos, (height, caption) in enumerate(zip(heights, captions)):
        ax.text(x_pos, height + 0.05, caption, ha='center')

    fig.savefig('performance_metrics.png')
    plt.close(fig)
    return 'performance_metrics.png'

# Plot rank distribution
def plot_rank_distribution(positions):
    """Save a bar chart of how often each rank within the top 10 occurred.

    Implemented with matplotlib and numpy only, because ``sns`` is not
    imported by any visible cell of this notebook. The emptiness check runs
    before any figure is created, so no figure is left open when there is
    nothing to plot.

    Args:
        positions: Iterable of 1-based ranks; values above 10 are ignored.

    Returns:
        ``'rank_distribution.png'`` when at least one rank is within the top
        10, otherwise ``None`` (and no file is written).
    """
    positions_in_top10 = [p for p in positions if p <= 10]
    if not positions_in_top10:
        return None

    ranks, counts = np.unique(positions_in_top10, return_counts=True)

    fig, ax = plt.subplots(figsize=(10, 6))
    # String labels make the x axis categorical: only ranks that occurred are shown.
    ax.bar([str(rank) for rank in ranks], counts)
    ax.set_title('Distribution of Recommendation Ranks')
    ax.set_xlabel('Rank Position')
    ax.set_ylabel('Frequency')
    fig.savefig('rank_distribution.png')
    plt.close(fig)
    return 'rank_distribution.png'

# Extract and visualize term distribution
def analyze_term_distribution():
    """Chart which category terms dominate the recommendations for fixed queries.

    For each of five built-in queries the top 10 recommendations are fetched
    and their ``terms`` lists are parsed and counted. Two figures are saved:
    the 15 most common terms overall, and the 5 most common terms per query.

    Term lists are parsed with ``ast.literal_eval`` rather than ``eval`` so
    that cell content can never execute code. Colors come from matplotlib
    colormaps because ``sns`` is not imported by any visible cell.

    Returns:
        A 3-tuple ``('term_distribution.png', 'query_term_distribution.png',
        term_counts)`` where ``term_counts`` is a list of ``(term, count)``
        pairs for the 15 most common terms (empty when no terms were parsed).
    """
    test_queries = [
        "deep learning",
        "computer vision",
        "natural language processing",
        "reinforcement learning",
        "graph neural networks"
    ]

    all_recommended_terms = []
    query_term_dict = {}

    for query in test_queries:
        query_terms = []
        # Only the recommendations frame is needed; star-unpacking tolerates
        # both the 2-tuple return and a variant with extra trailing items.
        recommendations, *_ = get_paper_recommendations(query, top_n=10)
        for term_str in recommendations['terms']:
            is_list_literal = (
                isinstance(term_str, str)
                and term_str.startswith('[')
                and term_str.endswith(']')
            )
            if not is_list_literal:
                continue
            try:
                parsed = literal_eval(term_str)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                # Best effort: skip rows whose term list cannot be parsed.
                continue
            terms = [term for term in parsed if isinstance(term, str)]
            all_recommended_terms.extend(terms)
            query_terms.extend(terms)
        query_term_dict[query] = Counter(query_terms).most_common(5)

    # Count term frequencies
    term_counts = Counter(all_recommended_terms).most_common(15)

    # Visualize overall distribution
    fig, ax = plt.subplots(figsize=(12, 6))
    if term_counts:  # zip(*[]) cannot be unpacked into two names
        terms, counts = zip(*term_counts)
        bar_colors = plt.get_cmap('viridis')(np.linspace(0, 1, len(terms)))
        ax.barh(terms, counts, color=bar_colors)
    ax.set_title('Most Common Terms in Recommendations')
    ax.set_xlabel('Frequency')
    fig.tight_layout()
    fig.savefig('term_distribution.png')
    plt.close(fig)

    # Visualize per-query distribution: one stacked panel per query.
    fig, axes = plt.subplots(len(query_term_dict), 1, figsize=(14, 10), squeeze=False)
    for ax, (query, query_terms) in zip(axes[:, 0], query_term_dict.items()):
        if query_terms:
            terms, counts = zip(*query_terms)
            bar_colors = plt.get_cmap('Set2')(np.arange(len(terms)))
            ax.barh(terms, counts, color=bar_colors)
            ax.set_title(f'Top Terms for Query: "{query}"')
            ax.set_xlabel('Frequency')

    fig.tight_layout()
    fig.savefig('query_term_distribution.png')
    plt.close(fig)

    return 'term_distribution.png', 'query_term_distribution.png', term_counts


# Test the recommendation system
# This repeats the sample-query smoke test from the earlier cell. The
# evaluation and plotting helpers defined above in this cell are only
# defined here, not invoked.
print("\nTesting recommendation system with a sample query...")
user_query = "deep learning for computer vision"
recommend_papers(user_query)

# Reminder of how to start the prompt loop manually.
print("\nTo use interactive mode, run: interactive_recommendation()")


Testing recommendation system with a sample query...

Recommendations for query: 'deep learning for computer vision'

Recommendation 1: (Similarity: 0.5520)
Title: Are object detection assessment criteria ready for maritime computer vision?
Summary: Maritime vessels equipped with visible and infrared cameras can complement
other conventional sensors for object detection. However, application of
computer vision techniques in maritime domain receiv...
Terms: ['cs.CV']
--------------------------------------------------------------------------------
Recommendation 2: (Similarity: 0.5520)
Title: Are object detection assessment criteria ready for maritime computer vision?
Summary: Maritime vessels equipped with visible and infrared cameras can complement
other conventional sensors for object detection. However, application of
computer vision techniques in maritime domain receiv...
Terms: ['cs.CV']
--------------------------------------------------------------------------------
Recommendatio