# In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Load the paper records from the Excel workbook into a DataFrame
papers_df = pd.read_excel(io='test.xlsx')

# Step 2: Define Scoring Criteria
# Each key names one component of a paper's score; the value is the
# multiplier applied to that component when the overall score is computed.
weights = dict(relevance=0.7, author_count=0.3)

# Step 3: Assign Weights (already done above in the `weights` dict)

# Step 4: Calculate Scores

# Tokenize words and remove stopwords
# Fetch the NLTK data used by this script: the Punkt tokenizer models back
# word_tokenize() and the stopwords corpus backs stopwords.words().
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from the 'punkt_tab' resource
# instead of the pickled 'punkt' models; fetch both so word_tokenize()
# works regardless of which NLTK version is installed.
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = stopwords.words('english')

# Calculate abstract or title similarity using cosine similarity
def calculate_relevance_score(reference_text, target_texts):
    """Score how similar each target text is to a reference text.

    The reference and all targets are vectorized together with TF-IDF
    (NLTK word tokenization, English stop words removed), and the cosine
    similarity between the reference vector and every target vector is
    returned.

    Args:
        reference_text: The text every target is compared against.
        target_texts: Iterable of texts to score (list, tuple, Series, ...).

    Returns:
        A 1-D NumPy array with one similarity value per target, in input
        order. The array is empty when there are no targets, and all zeros
        when none of the texts contains a usable (non-stop-word) token.

    Raises:
        ValueError: Propagated from the vectorizer for invalid input other
            than an empty vocabulary.
    """
    # Materialize first: `[x] + target_texts` only concatenates for a real
    # list (a tuple raises, a Series/ndarray would broadcast the addition).
    targets = list(target_texts)
    if not targets:
        # cosine_similarity() rejects a matrix with zero rows.
        return np.zeros(0)

    # token_pattern is ignored whenever a custom tokenizer is supplied;
    # passing None avoids the warning scikit-learn emits about that.
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        tokenizer=word_tokenize,
        token_pattern=None,
    )

    # Convert reference and target texts into TF-IDF vectors
    try:
        tfidf_matrix = vectorizer.fit_transform([reference_text] + targets)
    except ValueError as err:
        # Every text was blank or consisted only of stop words: nothing can
        # be compared, so report "no similarity" instead of crashing.
        if "empty vocabulary" not in str(err):
            raise
        return np.zeros(len(targets))

    # Row 0 is the reference; the remaining rows are the targets.
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:]).flatten()

def calculate_author_count_score(authors):
    """Return the author-count component of a paper's score.

    Placeholder implementation: ``authors`` is currently ignored and every
    paper receives the same constant score. Replace the body with the
    desired author-count logic.
    """
    placeholder_score = 1.0
    return placeholder_score

# Replace NaN values with empty strings
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated and does not
# update papers_df when pandas Copy-on-Write is active.
# astype(str) additionally guards against non-text cells (e.g. a purely
# numeric title), which the text tokenizer cannot process.
papers_df['abstract'] = papers_df['abstract'].fillna('').astype(str)
papers_df['title'] = papers_df['title'].fillna('').astype(str)

# Score every paper against the whole collection.
# The column-to-list conversions are identical for every paper, so they are
# done once here rather than once per loop iteration.
all_abstracts = papers_df['abstract'].tolist()
all_titles = papers_df['title'].tolist()

# NOTE(review): each calculate_relevance_score() call re-fits TF-IDF on the
# full corpus, and the targets include the paper itself, so the mean below
# contains one self-similarity term. Kept as-is to preserve the scores.
scores = []
for abstract, title, authors in zip(all_abstracts, all_titles, papers_df['authors']):
    # Calculate relevance score based on abstract similarity
    abstract_scores = calculate_relevance_score(abstract, all_abstracts)

    # Calculate relevance score based on title similarity
    title_scores = calculate_relevance_score(title, all_titles)

    # Calculate the overall relevance score (average of both mean similarities)
    relevance_score = (abstract_scores.mean() + title_scores.mean()) / 2

    # Calculate the overall score as the weighted sum of both components
    author_score = calculate_author_count_score(authors)
    scores.append(relevance_score * weights['relevance'] +
                  author_score * weights['author_count'])

# Assign the column in one step so 'score' exists even when the sheet has
# no rows (a later sort on 'score' would otherwise raise KeyError).
papers_df['score'] = scores

# Step 5: Sort and Rank
# Destination workbook for the ranked results
output_file = 'similarity_rank.xlsx'

# Highest-scoring papers first
ranked_papers = papers_df.sort_values(by='score', ascending=False)

# Write the ranking to Excel without the DataFrame index column
ranked_papers.to_excel(output_file, index=False)

print("Results exported successfully to", output_file)