In [14]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [15]:
df = pd.read_csv(r"C:\Users\adity\Downloads\dblp-v10.csv")
df.head()  # Display the first few rows

Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
0,"In this paper, a robust 3D triangular mesh wat...","['S. Ben Jabra', 'Ezzeddine Zagrouba']",50,"['09cb2d7d-47d1-4a85-bfe5-faa8221e644b', '10aa...",A new approach of 3D watermarking based on ima...,international symposium on computers and commu...,2008,4ab3735c-80f1-472d-b953-fa0557fed28b
1,We studied an autoassociative neural network w...,"['Joaquín J. Torres', 'Jesús M. Cortés', 'Joaq...",50,"['4017c9d2-9845-4ad2-ad5b-ba65523727c5', 'b118...",Attractor neural networks with activity-depend...,Neurocomputing,2007,4ab39729-af77-46f7-a662-16984fb9c1db
2,It is well-known that Sturmian sequences are t...,"['Genevi eve Paquin', 'Laurent Vuillon']",50,"['1c655ee2-067d-4bc4-b8cc-bc779e9a7f10', '2e4e...",A characterization of balanced episturmian seq...,Electronic Journal of Combinatorics,2007,4ab3a4cf-1d96-4ce5-ab6f-b3e19fc260de
3,One of the fundamental challenges of recognizi...,"['Yaser Sheikh', 'Mumtaz Sheikh', 'Mubarak Shah']",221,"['056116c1-9e7a-4f9b-a918-44eb199e67d6', '05ac...",Exploring the space of a human action,international conference on computer vision,2005,4ab3a98c-3620-47ec-b578-884ecf4a6206
4,This paper generalizes previous optimal upper ...,"['Efraim Laksman', 'Håkan Lennerstad', 'Magnus...",0,"['01a765b8-0cb3-495c-996f-29c36756b435', '5dbc...",Generalized upper bounds on the minimum distan...,Ima Journal of Mathematical Control and Inform...,2015,4ab3b585-82b4-4207-91dd-b6bce7e27c4e


In [16]:
df.shape

(1000000, 8)

In [17]:
print(df.isnull().sum())


abstract      172467
authors            2
n_citation         0
references    124417
title              0
venue         177755
year               0
id                 0
dtype: int64


In [18]:
df.dropna(subset=['authors'], inplace=True)  # Drop rows where authors are missing
df['abstract'].fillna("Unknown", inplace=True)
df['venue'].fillna("Unknown", inplace=True)
df['references'].fillna("[]", inplace=True)


In [19]:
df.shape

(999998, 8)

In [20]:
df = df.dropna(subset=["abstract"])
df = df[df["n_citation"] > 10]
df = df.sample(n=100000, random_state=42)


In [21]:
df.shape


(100000, 8)

In [22]:
df.head()

Unnamed: 0,abstract,authors,n_citation,references,title,venue,year,id
184805,As mobile robots are taking on more and more o...,"['Mansur R. Kabuka', 'Alvaro E. Arenas']",155,"['4494f0d5-7bb5-429a-99ba-ecc26c614041', 'acf1...",Position verification of a mobile robot using ...,international conference on robotics and autom...,1987,88117b8f-4698-40d9-8a9d-2584a46adedb
151238,The NAR online Molecular Biology Database Coll...,['Michael Y. Galperin'],415,"['031ee1f7-92be-41e4-8139-8566b48cd2d2', '04de...",The Molecular Biology Database Collection: 200...,Nucleic Acids Research,2004,7cf709fa-600a-47b7-bf29-90e73109f432
78501,When several models are proposed for one and t...,"['Brecht Donckels', 'Dirk De Pauw', 'Peter Van...",14,[],A Kernel-Based Method to Determine Optimal Sam...,Journal of Computational Chemistry,2009,64b9ab33-7ba4-4901-bc01-9354d1699a8a
86318,Several types of industrial Real-Time Ethernet...,"['Paolo Ferrari', 'Alessandra Flammini', 'Stef...",50,"['1d7f6dab-aca5-4f78-b759-043553a70f4d', '21ba...",On the Seamless Interconnection of IEEE1588-Ba...,IEEE Transactions on Industrial Informatics,2010,675680c0-ff8b-4702-9a0d-f29c1c7643b5
373597,Purpose – The purpose of this paper is to desc...,"['Dimitri V. Zarzhitsky', 'Diana F. Spears', '...",50,"['03ad315a-1710-4fd0-a4da-df8ad197a423', '08d8...",Experimental studies of swarm robotic chemical...,International Journal of Intelligent Computing...,2010,c6c32b3c-1131-4229-b719-8a99155f4cf3


In [23]:
# Combine 'title' and 'abstract' into a single text column for better recommendations
df['combined_text'] = df['title'] + " " + df['abstract']

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(df['combined_text'])

In [24]:
def recommend_papers(query, top_n=3):
    # Transform the user's query
    query_vec = vectorizer.transform([query])
    
    # Compute cosine similarity between the query and all papers
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix)
    
    # Get indices of the most similar papers
    similar_indices = cosine_sim.argsort()[0][-top_n:][::-1]
    
    # Prepare recommendations based on similarity
    recommendations = []
    for idx in similar_indices:
        paper = df.iloc[idx]
        recommendations.append({
            'title': paper['title'],
            'authors': paper['authors'],
            'citations': paper['n_citation'],
            'year': paper['year'],
            'venue': paper['venue'],
            'id': paper['id']
        })
    return recommendations

In [27]:
# Function to calculate the cosine similarity of the top recommended papers
def calculate_similarity_accuracy(query, top_n=3):
    # Transform the user's query
    query_vec = vectorizer.transform([query])
    
    # Compute cosine similarity between the query and all papers
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix)
    
    # Get indices of the most similar papers
    similar_indices = cosine_sim.argsort()[0][-top_n:][::-1]
    
    # Calculate the average cosine similarity of top_n papers
    avg_similarity = cosine_sim[0][similar_indices].mean()
    
    return avg_similarity

# Ask the user for a query
query = input("Enter the research paper or topic of interest: ")

# Calculate similarity-based accuracy
accuracy = calculate_similarity_accuracy(query)

# Display the accuracy and recommended papers
print(f"\nModel Accuracy (Cosine Similarity): {accuracy * 100:.2f}%\n")

recommended_papers = recommend_papers(query)

if recommended_papers:
    print("\nRecommended Papers:\n")
    for paper in recommended_papers:
        print(f"Title: {paper['title']}")
        print(f"Authors: {', '.join(paper['authors'])}")
        print(f"Citations: {paper['citations']}")
        print(f"Year: {paper['year']}")
        print(f"Venue: {paper['venue']}")
        print(f"ID: {paper['id']}")
        print("-" * 50)
else:
    print("No papers found for the given query.")



Model Accuracy (Cosine Similarity): 44.86%


Recommended Papers:

Title: Question Answering Systems in biology and medicine – the time is now
Authors: [, ', J, o, n, a, t, h, a, n,  , D, .,  , W, r, e, n, ', ]
Citations: 18
Year: 2011
Venue: Bioinformatics
ID: f831505d-b92a-4659-b501-b5997f10dddd
--------------------------------------------------
Title: Speculations on Biology, Information and Complexity
Authors: [, ', G, r, e, g, o, r, y,  , J, .,  , C, h, a, i, t, i, n, ', ]
Citations: 50
Year: 2006
Venue: Bulletin of The European Association for Theoretical Computer Science
ID: 13182ab7-2ee8-4f27-a7d6-aed2dbdb2323
--------------------------------------------------
Title: Algorithms on Strings, Trees, and Sequences: Computer Science and Computational Biology
Authors: [, ', S, u, s, a, n,  , H, o, l, m, e, s, ', ,,  , ', D, a, n,  , G, u, s, f, i, e, l, d, ', ]
Citations: 50
Year: 1999
Venue: Journal of the American Statistical Association
ID: 5cf1a54c-78cb-406f-a108-2734fd8e79b1
---