In [None]:
# Mount the user's Google Drive at /content/drive inside the Colab runtime.
# The call is interactive: Colab asks for authorization the first time it runs.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import pandas as pd
from transformers import pipeline
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Make sure to install these dependencies or include installation commands in your Colab notebook
!pip install transformers nltk

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the data
data = pd.read_csv("/content/drive/MyDrive/GoogleColab/web social media anlysis and visualization/api_news_articles.csv")

# Text-cleaning helper applied to an article before it is summarized
def preprocess_text(text):
    """Normalize raw article text into a space-joined string of lemmatized tokens.

    The text goes through these steps, in order:
      1. non-ASCII characters are dropped;
      2. truncation placeholders such as "[+4592 chars]" are removed;
      3. everything is lowercased;
      4. every character other than ASCII letters, whitespace and . , ! ? is removed;
      5. the result is tokenized with ``word_tokenize``;
      6. English stopwords are filtered out;
      7. each remaining token is lemmatized with ``WordNetLemmatizer``.

    Args:
        text: Raw article text. Any non-string value (for example ``None`` or
            the NaN that stands in for a missing cell) is treated as empty.

    Returns:
        The cleaned tokens joined by single spaces, or ``""`` when nothing is
        left after cleaning.
    """
    # Missing values are not strings; report them as "no content" instead of
    # letting re.sub raise a TypeError.
    if not isinstance(text, str):
        return ""

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Remove placeholders like "[+4592 chars]"
    text = re.sub(r"\[\+\d+\schars\]", "", text)
    # Lowercasing
    text = text.lower()
    # Remove non-alphabetic characters, excluding basic punctuation.
    # The flags MUST be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing them positionally silently capped the
    # number of substitutions at 258 and applied no flags at all.
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text, flags=re.I | re.A)

    # Nothing but whitespace left: skip tokenization entirely.
    if not text.strip():
        return ""

    # Tokenization
    tokens = word_tokenize(text)
    # Removing stopwords (set gives O(1) membership tests)
    stop_words = set(stopwords.words('english'))
    # Lemmatization of the surviving tokens, then re-join
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

# Select the 13th article (row label 12, which is the 13th row under the
# default 0-based index that read_csv assigns).
article = None          # stays None when the row/column cannot be found
cleaned_article = ""    # stays empty when there is nothing to summarize
try:
    article = data.loc[12, 'content']
except (KeyError, IndexError):
    # Label-based .loc lookups signal a missing row label or column with
    # KeyError; IndexError is kept for backward compatibility.
    print("The specified index is out of range. Please check your dataset.")
else:
    # An empty CSV cell is presumably loaded as NaN (a float), which cannot be
    # cleaned as text; only preprocess real strings.
    if isinstance(article, str):
        cleaned_article = preprocess_text(article)

# Initialize the summarization pipeline.
# The model and revision are pinned to the ones the pipeline previously
# selected by default, so results stay reproducible if the default changes.
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    revision="a4f8f3e",
)

# Perform summarization
if cleaned_article:  # Check if the article contains text after preprocessing
    summary = summarizer(cleaned_article, max_length=150, min_length=30, do_sample=False)
    if summary:
        # The pipeline returns a list with one dict per input text
        summarized_text = summary[0]['summary_text']
        print("Summarized Text: ", summarized_text)
    else:
        print("No summary was generated. Please check the content of the article.")
else:
    print("No content to summarize.")

# Print the original article for comparison (if it exists)
if article is not None:
    print("Original Article: ", article)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Your max_length is set to 150, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


Summarized Text:   google annual developer conference fast approaching . come announcement new device showcase company software year come . google pixel fold will be unveiled at the Google annual conference .
Original Article:  Google’s annual developer conference is fast approaching, and with it comes announcements of new devices that will showcase the company’s software for years to come. The Google Pixel Fold 2, which ma… [+4592 chars]
