# Setup

In [1]:
# Silence every warning for the rest of the session (this also hides
# deprecation and convergence warnings emitted by the libraries below).
import warnings
warnings.filterwarnings('ignore')

import os
import pickle
import json
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
import tensorflow_hub as hub
# NOTE(review): this binding of `cosine_similarity` is overwritten by the
# sklearn import of the same name further down, so it is never the one in use.
from tensorflow.keras.losses import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
# This import rebinds `cosine_similarity`; from here on the name refers to
# sklearn.metrics.pairwise.cosine_similarity.
from sklearn.metrics.pairwise import cosine_similarity
import nltk
# Fetch the 'punkt' tokenizer data at import time (no-op when already present).
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Default figure size and font size for all matplotlib plots in the notebook.
plt.rcParams['figure.figsize'] = (8,5)
plt.rcParams['font.size'] = 14




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\verma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Loading Data

In [2]:
# Paths and filter settings used by this cell, gathered in one place.
CACHE_FILE = 'preprocessed_data_embeddings.pkl'
FILE = 'arxiv-metadata-oai-snapshot.json'
CUTOFF_YEAR = 2019  # keep only papers whose update_date year is strictly greater
FIELDS = ["submitter", "authors", "title", "doi", "categories", "abstract", "update_date"]


def get_data():
    """Lazily yield the lines of FILE; each line is parsed as one JSON document."""
    # JSON text is UTF-8; without an explicit encoding, open() falls back to
    # the platform default (e.g. cp1252 on Windows), which can mis-decode or
    # fail on non-ASCII characters.
    with open(FILE, encoding='utf-8') as f:
        for line in f:
            yield line


def word_count(x):
    """Return the number of whitespace-separated tokens in the string x."""
    return len(x.split())


if os.path.exists(CACHE_FILE):
    # Load preprocessed data.
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # load a cache that this notebook itself produced.
    with open(CACHE_FILE, 'rb') as f:
        preprocessed_data = pickle.load(f)
    df = preprocessed_data['df']
    tfidf_matrix = preprocessed_data['tfidf_matrix']
    tfidf_vectorizer = preprocessed_data['tfidf_vectorizer']
else:
    records = []
    skipped = 0
    for raw_line in get_data():
        paper = json.loads(raw_line)
        try:
            # update_date is split on '-' and its first component is the year.
            if int(paper['update_date'].split('-')[0]) > CUTOFF_YEAR:
                # The whole record is built before it is appended, so a
                # missing field can never leave the columns with unequal
                # lengths (which would make DataFrame construction fail).
                records.append({field: paper[field] for field in FIELDS})
        except (KeyError, ValueError, AttributeError, TypeError):
            # Malformed record (missing key, null or unparsable date): skip
            # it, but keep count so the data loss is visible.
            skipped += 1

    if skipped:
        print(f"Skipped {skipped} malformed record(s) while reading {FILE}")

    # Passing `columns` fixes the column order and keeps the expected columns
    # present even when no record passed the filter.
    df = pd.DataFrame(records, columns=FIELDS)
    del records  # release the intermediate list once the frame exists

    # Simple size features of each abstract: characters and whitespace tokens.
    df['length'] = df['abstract'].str.len()
    df['word_count'] = df['abstract'].apply(word_count)

    # Fit TF-IDF on the abstracts; the fitted vectorizer and the resulting
    # matrix are both needed later to score a query against the corpus.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['abstract'])

    # Cache everything so later runs take the fast branch above.
    preprocessed_data = {
        'df': df, 'tfidf_vectorizer': tfidf_vectorizer, 'tfidf_matrix': tfidf_matrix}
    with open(CACHE_FILE, 'wb') as f:
        pickle.dump(preprocessed_data, f)

In [7]:
# Ask the user for a topic and how many recommendations to show.
topic_of_interest = input("Enter your topic of interest: ")
resultsno = int(input("Enter the number of results you want: "))
if resultsno < 1:
    # A slice of [-0:] would select the *entire* corpus and a negative count
    # would slice it arbitrarily, so reject both up front.
    raise ValueError("Number of results must be a positive integer")

# Lowercase and tokenise the query, then re-join the tokens so the vectorizer
# receives a single space-separated string.
processed_topic = word_tokenize(topic_of_interest.lower())
processed_topic_str = " ".join(processed_topic)

# Project the query into the fitted TF-IDF space and score it against every
# abstract. `cosine_similarity` must be the sklearn.metrics.pairwise function
# (the last import of that name in the setup cell).
user_tfidf = tfidf_vectorizer.transform([processed_topic_str])
cosine_similarities = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

# Positions of the highest-scoring abstracts, best first.
top_indices = cosine_similarities.argsort()[-resultsno:][::-1]
# Drop zero-score entries: they share no vocabulary with the query, so
# listing them as recommendations would be arbitrary.
top_indices = top_indices[cosine_similarities[top_indices] > 0]

# Display top recommendations.
if top_indices.size == 0:
    print(f"No papers matched {topic_of_interest!r}.")
else:
    print(f"Top Recommendations for {topic_of_interest}:")
    for rank, idx in enumerate(top_indices):
        # argsort yields positions, so look titles up positionally with .iloc
        # rather than by index label.
        print(rank, df['title'].iloc[idx])
        print()


Top Recommendations for watermarking:
New Evaluation Metrics Capture Quality Degradation due to LLM
  Watermarking

A Survey of Text Watermarking in the Era of Large Language Models

ItoV: Efficiently Adapting Deep Learning-based Image Watermarking to
  Video Watermarking

Knowledge-Free Black-Box Watermark and Ownership Proof for Image
  Classification Neural Networks

Cryptographic switching functions for multiplicative watermarking in
  cyber-physical systems

Reversible Watermarking in Deep Convolutional Neural Networks for
  Integrity Authentication

A survey of deep neural network watermarking techniques

Publicly Detectable Watermarking for Language Models

