In [1]:
import datetime,os
import pandas as pd
from transformers import pipeline

# Summarization model: produces the short summary generated for each article.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Zero-shot classification model: scores each article against the candidate
# labels defined further down.
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")


# Today's date as YYYY-MM-DD; it is embedded in both the input and output file names.
today_str = datetime.date.today().isoformat()

# Load today's input CSV into a DataFrame
csv_path = f"./Data/cyber_gov_{today_str}.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Candidate labels for zero-shot classification
candidate_labels = ["cyber security", "business", "finance", "technology"]

# Per-row results. Each list ends up with exactly one entry per DataFrame row
# so it can be assigned back to the DataFrame as a column afterwards.
summaries = []
final_labels = []

# Plain-list copies of the source columns; later code assigns them back to df.
URL = df['URL'].tolist()
DATE = df['Date'].tolist()

# Summarize and classify every article, in row order
for content in df['Content']:
    # pandas reads an empty CSV cell as NaN (a float), which neither pipeline
    # accepts. Record placeholders for such rows instead of aborting the run,
    # keeping the result lists aligned with the DataFrame.
    if pd.isna(content) or not str(content).strip():
        summaries.append("")
        final_labels.append(None)
        continue
    content = str(content)

    # truncation=True clips inputs longer than the model's maximum input
    # length rather than raising an error on long articles.
    summary_result = summarizer(
        content,
        max_length=30,
        min_length=10,
        do_sample=False,
        truncation=True,
    )
    summaries.append(summary_result[0]['summary_text'])

    # The zero-shot pipeline returns its labels sorted from most to least
    # likely, so the first entry is the highest-scoring label.
    classification_result = classifier(content, candidate_labels)
    final_labels.append(classification_result['labels'][0])

# Output columns, in the order they should appear in the final file, mapped to
# the per-row values collected above.
output_columns = {
    'Summary': summaries,
    'URL': URL,
    'Date': DATE,
    'Final Label': final_labels,
}
for column_name, column_values in output_columns.items():
    df[column_name] = column_values

# Keep only the output columns and drop everything else (e.g. the raw content)
df = df.loc[:, list(output_columns)]

store_csv_path = f"./Data/{today_str}_final.csv"

# Append mode creates the file when it is missing. The header row is written
# only in that case, so appending to an existing file never repeats it.
output_already_exists = os.path.exists(store_csv_path)
df.to_csv(
    path_or_buf=store_csv_path,
    mode="a",
    index=False,
    header=not output_already_exists,
)

print(f"Summarization and classification results have been added to '{today_str}_final.csv'")


Your max_length is set to 30, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)
Your max_length is set to 30, but your input_length is only 25. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=12)


Summarization and classification results have been added to '2024-08-20_final.csv'
