# In [2]:
import networkx as nx
import pandas as pd

from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rouge import Rouge

import sys
import nltk
# Raise the interpreter's recursion limit to one million frames.
# NOTE(review): presumably needed by a deeply recursive routine used further
# down (e.g. scoring very long texts) — confirm. The Python docs warn that a
# limit that is too high for the platform can crash the interpreter instead
# of raising RecursionError, so lower this once the need is understood.
sys.setrecursionlimit(10**6)

# In [10]:
# Read the two dataset files. Forward slashes are accepted on Windows and
# POSIX alike and, unlike a bare backslash, never start an escape sequence.
df1 = pd.read_csv('data/data3.csv')
df2 = pd.read_csv('data/data7.csv')

# Stack the two frames on top of each other (row-wise).
data = pd.concat([df1, df2], axis=0)

# Keep only rows that have both a title and an abstract, then renumber the
# rows 0..n-1 because the concatenation leaves duplicate index labels.
data = data.dropna(subset=['title', 'abstract']).reset_index(drop=True)

# Limit the experiment to the first 10,000 rows. The explicit copy makes
# `data` an independent frame, so columns added to it later are not written
# into a slice of the concatenated frame.
data = data.head(10000).copy()

# Preprocessing
def preprocess_text(text):
    """Split *text* into a list of sentences.

    Args:
        text: The raw abstract. Only ``str`` values are tokenised.

    Returns:
        The sentences produced by ``sent_tokenize`` for a string input, or an
        empty list for any non-string value.
    """
    # Guard clause: anything that is not a string has no sentences.
    if not isinstance(text, str):
        return []
    return sent_tokenize(text)

# Tokenise every abstract into its list of sentences.
data['preprocessed_abstract'] = data['abstract'].apply(preprocess_text)

# Drop rows whose abstract produced no sentences, then renumber the rows so
# the index is again a gap-free 0..n-1 range; otherwise a lookup by row
# number could hit a label that the filter has just removed.
has_sentences = data['preprocessed_abstract'].apply(len) > 0
data = data[has_sentences].reset_index(drop=True)

# Sentence-level TextRank: rank the sentences *within* each abstract, keep the
# best-ranked ones as an extractive summary, and score that summary against
# the paper title with ROUGE.

# Fraction of an abstract's sentences to include in its summary (adjust as needed).
summary_ratio = 0.3


def _summarize_abstract(sentences, ratio):
    """Build an extractive summary of a single abstract.

    Sentences are embedded with TF-IDF, connected in a graph weighted by
    their pairwise cosine similarity, and ranked with PageRank.

    Args:
        sentences: The abstract as a list of sentence strings.
        ratio: Fraction of the sentences to keep. At least one sentence is
            always kept so that a non-empty abstract never yields an empty
            summary.

    Returns:
        The selected sentences joined by single spaces, in descending
        PageRank order (or in their original order when no ranking was
        needed or possible).
    """
    keep = max(1, int(len(sentences) * ratio))
    if len(sentences) <= keep:
        # Nothing to rank: every sentence ends up in the summary anyway.
        return ' '.join(sentences)

    try:
        sentence_vectors = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # scikit-learn raises ValueError when the vocabulary is empty (no
        # sentence contains a usable token); fall back to the leading
        # sentences instead of aborting the whole run.
        return ' '.join(sentences[:keep])

    similarity_matrix = cosine_similarity(sentence_vectors)
    graph = nx.from_numpy_array(similarity_matrix)

    # nx.pagerank returns a dict {node: score}; the nodes are the sentence
    # positions 0..n-1, so sorting the keys by their score ranks the sentences.
    scores = nx.pagerank(graph)
    top_sentences = sorted(scores, key=scores.get, reverse=True)[:keep]
    return ' '.join(sentences[idx] for idx in top_sentences)


# Calculate the summary and ROUGE score for each data point.
rouge = Rouge()  # Built once instead of once per row
rouge_scores = []

# Iterate over the column values directly so the result does not depend on
# what the DataFrame's index labels happen to be.
for i, (sentences, title) in enumerate(zip(data['preprocessed_abstract'], data['title'])):
    print(i)  # Progress indicator
    summary = _summarize_abstract(sentences, summary_ratio)
    rouge_scores.append(rouge.get_scores(summary, title))

# Calculate the average ROUGE-1 F1 score; NaN marks "no rows were scored"
# instead of failing with a division by zero.
if rouge_scores:
    average_rouge_1_f1_score = (
        sum(score[0]['rouge-1']['f'] for score in rouge_scores) / len(rouge_scores)
    )
else:
    average_rouge_1_f1_score = float('nan')

print("Average ROUGE-1 F1 Score:", average_rouge_1_f1_score)
