In [1]:
import pandas as pd
import networkx as nx
from sklearn.preprocessing import normalize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from rouge import Rouge

In [2]:

# Load the two source datasets.
# Forward slashes are used instead of backslashes: in 'data\data3.csv' the '\d' is
# an unrecognised escape sequence (newer Python versions warn about it) and the
# backslash is only treated as a directory separator on Windows.
df1 = pd.read_csv('data/data3.csv')
df2 = pd.read_csv('data/data7.csv')

# Stack the frames vertically, drop rows that lack a title or an abstract, and
# renumber the rows. Chaining avoids in-place mutation, so re-running the cell
# always starts again from the freshly read frames.
data = (
    pd.concat([df1, df2], axis=0)
    .dropna(subset=['title', 'abstract'])
    .reset_index(drop=True)
)

# `df` is kept as an alias of the combined frame
df = data
abstracts = df['abstract'].tolist()
titles = df['title'].tolist()

# Initialize ROUGE scorer
rouge = Rouge()

# Preprocess the text data
def preprocess_text(text):
    """Preprocessing hook applied to every abstract and title.

    No cleaning step is implemented yet, so the input is handed back
    unchanged.

    Args:
        text: The raw text to preprocess.

    Returns:
        The same object that was passed in.
    """
    cleaned = text
    return cleaned

# Run every abstract and every title through the preprocessing hook
abstracts = list(map(preprocess_text, abstracts))
titles = list(map(preprocess_text, titles))

# Seed the results list with the rows of an existing summary_scores.csv;
# start with an empty list when that file is not there
try:
    existing_results = pd.read_csv('summary_scores.csv')
except FileNotFoundError:
    results = []
else:
    results = existing_results.to_dict(orient='records')

# Build the text graph
def build_text_graph(sentences):
    """Build an undirected sentence graph weighted by word overlap.

    Each sentence becomes a node keyed by its position in `sentences`, with the
    text stored in the node's ``sentence`` attribute. Every pair of sentences
    that shares at least one word is joined by an edge whose ``weight`` is the
    Jaccard similarity of their lower-cased, whitespace-split word sets.
    Previously no edges were added at all, so ranking the graph could never
    distinguish one sentence from another.

    Args:
        sentences: Sequence of sentence texts.

    Returns:
        A ``networkx.Graph`` with one node per sentence. A single sentence (or
        sentences with no words in common) yields a graph without edges.
    """
    graph = nx.Graph()

    # Tokenise each sentence once; str() guards against non-string entries
    token_sets = []
    for i, sentence in enumerate(sentences):
        graph.add_node(i, sentence=sentence)
        token_sets.append(set(str(sentence).lower().split()))

    # Connect every pair of sentences that have words in common
    for i in range(len(token_sets)):
        for j in range(i + 1, len(token_sets)):
            union = token_sets[i] | token_sets[j]
            if not union:
                # Both sentences are empty: similarity is undefined, add no edge
                continue
            similarity = len(token_sets[i] & token_sets[j]) / len(union)
            if similarity > 0:
                graph.add_edge(i, j, weight=similarity)

    return graph

# Apply the PageRank algorithm
def apply_pagerank(graph):
    """Score the nodes of `graph` with networkx's PageRank, default settings.

    Args:
        graph: The graph to rank.

    Returns:
        The result of ``nx.pagerank(graph)``.
    """
    return nx.pagerank(graph)

# Extract the top-ranked sentences as the summary
def extract_summary(sentences, scores, k=1):
    """Join the `k` highest-scoring sentences into one summary string.

    Args:
        sentences: List of sentence texts, indexed by the keys of `scores`.
        scores: Mapping of sentence index -> score.
        k: Number of sentences to keep (slice semantics, default 1).

    Returns:
        The selected sentences, ordered from highest to lowest score and
        separated by single spaces. Sentences with equal scores keep the
        order in which their keys appear in `scores`.
    """
    # Order the sentence indices from best to worst score
    by_rank = list(scores)
    by_rank.sort(key=lambda idx: scores[idx], reverse=True)

    # Look up the text of the top-k indices and glue them together
    picked = (sentences[idx] for idx in by_rank[:k])
    return ' '.join(picked)






In [4]:
# Iterate over each abstract and generate summary
# Score each abstract's generated summary against its title with ROUGE-1 F
MAX_ABSTRACTS = 2000                 # upper bound on abstracts scored in one run
RESULTS_CSV = 'summary_scores.csv'   # file the scores are written to as the loop runs
csv_columns = None                   # column order of the CSV, fixed by the first write

for i, (abstract, title) in enumerate(zip(abstracts[:MAX_ABSTRACTS], titles[:MAX_ABSTRACTS])):
    # NOTE(review): the whole abstract is passed as ONE "sentence", so the graph
    # has a single node and the generated summary is the abstract itself.
    # Split the abstract into sentences here if an extractive summary is intended.
    text_graph = build_text_graph([abstract])

    # Rank the graph's nodes and keep the top-ranked sentence as the summary
    scores = apply_pagerank(text_graph)
    generated_summary = extract_summary([abstract], scores)

    # ROUGE-1 F of summary vs. title; 0.0 when either text is empty or scoring fails
    rouge_score = 0.0
    if generated_summary and title:
        try:
            rouge_scores = rouge.get_scores(generated_summary, title)
            rouge_score = rouge_scores[0]['rouge-1']['f']
        except Exception:
            # `except Exception` (not a bare `except`) so that Ctrl-C /
            # KeyboardInterrupt still stops the loop instead of being
            # recorded as a zero score.
            rouge_score = 0.0

    # NOTE(review): 'Index' restarts at 1 on every run, even when `results`
    # already holds rows loaded from an earlier run.
    result = {'Index': i + 1, 'ROUGE Score': rouge_score}
    results.append(result)
    print(result)

    # Persist progress after every abstract so an interrupted run keeps its scores.
    # The first iteration rewrites the file with everything collected so far
    # (header included); later iterations append ONLY the new row. Appending the
    # full list each time, as before, duplicated every earlier row in the file.
    if i == 0:
        first_frame = pd.DataFrame(results)
        csv_columns = list(first_frame.columns)
        first_frame.to_csv(RESULTS_CSV, index=False, mode='w', header=True)
    else:
        pd.DataFrame([result], columns=csv_columns).to_csv(
            RESULTS_CSV, index=False, mode='a', header=False
        )

# Build the frame after the loop so it exists even when nothing was processed
results_df = pd.DataFrame(results)
if 'ROUGE Score' in results_df.columns:
    average_rouge_score = results_df['ROUGE Score'].mean()
else:
    # No scored rows at all: there is no meaningful average
    average_rouge_score = float('nan')

# Report the average score over all collected results
result = {'Index': 'Average', 'ROUGE Score': average_rouge_score}
print(result)

{'Index': 1, 'ROUGE Score': 0.014388488515087245}
{'Index': 2, 'ROUGE Score': 0.013605440908880677}
{'Index': 3, 'ROUGE Score': 0.03149606167524341}
{'Index': 4, 'ROUGE Score': 0.03846153599297353}
{'Index': 5, 'ROUGE Score': 0.13157894470914133}
{'Index': 6, 'ROUGE Score': 0.10958903981985363}
{'Index': 7, 'ROUGE Score': 0.19512194926531828}
{'Index': 8, 'ROUGE Score': 0.16260162425540353}
{'Index': 9, 'ROUGE Score': 0.08955223755290712}
{'Index': 10, 'ROUGE Score': 0.07476635314874666}
{'Index': 11, 'ROUGE Score': 0.0312499957031256}
{'Index': 12, 'ROUGE Score': 0.0591715964160919}
{'Index': 13, 'ROUGE Score': 0.19999999722222223}
{'Index': 14, 'ROUGE Score': 0.03149606128092266}
{'Index': 15, 'ROUGE Score': 0.0294117630968859}
{'Index': 16, 'ROUGE Score': 0.030534349865392517}
{'Index': 17, 'ROUGE Score': 0.010869562869211284}
{'Index': 18, 'ROUGE Score': 0.22018348324215137}
{'Index': 19, 'ROUGE Score': 0.09467455499597353}
{'Index': 20, 'ROUGE Score': 0.0624999983007813}
{'Index':

KeyboardInterrupt: 