In [17]:
!pip install requests beautifulsoup4 pandas nltk afinn transformers

Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
                                              0.0/8.5 MB ? eta -:--:--
                                              0.1/8.5 MB 1.1 MB/s eta 0:00:08
     ------                                   1.4/8.5 MB 14.9 MB/s eta 0:00:01
     ---------------                          3.4/8.5 MB 23.8 MB/s eta 0:00:01
     -----------------------                  4.9/8.5 MB 26.2 MB/s eta 0:00:01
     ------------------------------           6.5/8.5 MB 27.8 MB/s eta 0:00:01
     ---------------------------------------  8.5/8.5 MB 30.4 MB/s eta 0:00:01
     ---------------------------------------  8.5/8.5 MB 30.4 MB/s eta 0:00:01
     ---------------------------------------- 8.5/8.5 MB 26.0 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.21.4-py3-none-any.whl (346 kB)
                                              0.0/346.4 kB ? eta -:--:--
     -------------

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import download
from afinn import Afinn

In [4]:
# Step 1: Collect data
# Scrape news headlines for the query from Bing News, page by page, until the
# target number of titles is reached, then store them in a DataFrame and a CSV.

# Initialize variables
titles = []  # to store news titles
page_number = 0  # starting parameter for Bing news pagination
num_articles = 30  # target number of news articles
max_pages = 20  # upper bound on page requests so the loop always terminates
request_timeout = 10  # seconds to wait for a response before giving up
query = "ChatGPT and Diffusion models".replace(" ", "+")

# Continue scraping while the number of collected titles is less than the
# target number, but never issue more than max_pages requests.
pages_fetched = 0
while len(titles) < num_articles and pages_fetched < max_pages:
    # Construct the URL, including the logic for pagination
    url = f'https://www.bing.com/news/search?q={query}&form=QBNH&first={page_number}'
    response = requests.get(url, timeout=request_timeout)
    response.raise_for_status()  # surface HTTP errors instead of parsing an error page
    soup = BeautifulSoup(response.text, 'html.parser')
    pages_fetched += 1

    # Find all news items on the current page
    news_items = soup.find_all('div', class_='t_s', limit=30)

    # Extract titles from news items
    titles_before = len(titles)
    for item in news_items:
        title = item.find('a', class_='title')
        if title:
            titles.append(title.text)
        if len(titles) >= num_articles:
            break  # Stop if the target number or more is reached

    # A page that contributed nothing means further requests will not help
    # (no more results, changed markup, or a blocked request); without this
    # check the loop would spin forever.
    if len(titles) == titles_before:
        break

    # Prepare to scrape the next page
    page_number += 10

if len(titles) < num_articles:
    print(f'Warning: collected only {len(titles)} of {num_articles} requested titles')

# Convert the collected titles to a DataFrame
df = pd.DataFrame({'Title': titles})

# Display the contents of the DataFrame
print(df)

# Save to a CSV file, if needed
df.to_csv('chatgpt_diffusion_models_news.csv', index=False)


                                                Title
0   ChatGPT: 5 changes I'd like to see in the near...
1   New AI test measures how fast robots can respo...
2   ChatGPT rival from Stable Diffusion creators j...
3   Why is AI so bad at spelling? Because image ge...
4            OpenAI Sora: Everything you need to know
5   Artificial intelligence boosts super-resolutio...
6   Microsoft’s Copilot AI set to operate locally ...
7                       Why is AI so bad at spelling?
8   Key Stable Diffusion Researchers Leave Stabili...
9   FLock.io raises $6M for decentralized blockcha...
10        What is Midjourney AI and how does it work?
11  Microsoft’s Copilot AI set to operate locally ...
12            The Best Free AI Art Generators, Ranked
13  These Horror Stories Prove That AI Could Pose ...
14  What is Suno? The viral AI song generator expl...
15  AI is coming to the iPhone–and it could change...
16  The next phase of AI is here — and it's not lo...
17  Sora AI: What is it? How

In [5]:
# Step 2: Preprocessing
# Make sure the NLTK resources needed below are available locally
for nltk_resource in ('stopwords', 'wordnet'):
    download(nltk_resource)

# Load the titles saved by the collection step
file_path = 'chatgpt_diffusion_models_news.csv'
news_df = pd.read_csv(file_path)

# English stop-word set (other languages can be added as needed) and the
# WordNet lemmatizer used by the preprocessing function
stop_words = set(stopwords.words('english'))
lemmer = WordNetLemmatizer()

# Define a text preprocessing function
def preprocess(x):
    """Normalize a headline into a space-separated string of lemmatized tokens.

    Punctuation is replaced by spaces, digit runs are removed, tokens are
    lowercased, tokens found in the module-level ``stop_words`` set are
    dropped, and the remaining tokens are lemmatized with the module-level
    ``lemmer``.

    Parameters
    ----------
    x : str
        Raw title text. Any non-string value (for example the NaN that
        ``pd.read_csv`` yields for an empty cell, or ``None``) is treated
        as an empty title.

    Returns
    -------
    str
        The cleaned title; an empty string when nothing remains.
    """
    # re.sub raises TypeError on non-strings, so missing titles are mapped to ''
    if not isinstance(x, str):
        return ''
    x = re.sub(r'[^\w\s]', ' ', x)  # Replace punctuation with spaces
    x = re.sub(r'\d+', '', x)  # Remove numbers
    # Lowercase each token once, drop stop words, then lemmatize
    lowered = (w.lower() for w in x.split())
    return ' '.join(lemmer.lemmatize(w) for w in lowered if w not in stop_words)

# Clean every title and store the result in a new column
news_df['title_clean'] = [preprocess(raw_title) for raw_title in news_df['Title']]

# Show the titles before and after processing side by side
news_df.loc[:, ['Title', 'title_clean']]



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xyzen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\xyzen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Title,title_clean
0,ChatGPT: 5 changes I'd like to see in the near...,chatgpt change like see near future
1,New AI test measures how fast robots can respo...,new ai test measure fast robot respond user co...
2,ChatGPT rival from Stable Diffusion creators j...,chatgpt rival stable diffusion creator launche...
3,Why is AI so bad at spelling? Because image ge...,ai bad spelling image generator actually readi...
4,OpenAI Sora: Everything you need to know,openai sora everything need know
5,Artificial intelligence boosts super-resolutio...,artificial intelligence boost super resolution...
6,Microsoft’s Copilot AI set to operate locally ...,microsoft copilot ai set operate locally futur...
7,Why is AI so bad at spelling?,ai bad spelling
8,Key Stable Diffusion Researchers Leave Stabili...,key stable diffusion researcher leave stabilit...
9,FLock.io raises $6M for decentralized blockcha...,flock io raise decentralized blockchain ai tra...


In [6]:
# Step 3: Lexicon-based sentiment with AFINN
# Build the AFINN scorer with emoticon support switched on
afinn = Afinn(emoticons=True)

# Score each cleaned title ('title_clean' column) and collect the scores in a list
afinn_scores = list(map(afinn.score, news_df['title_clean']))

# Attach the scores to the DataFrame as a new column
news_df['afinn_sentiment'] = afinn_scores

# Show the cleaned titles next to their sentiment scores
print(news_df[['title_clean', 'afinn_sentiment']])


                                          title_clean  afinn_sentiment
0                 chatgpt change like see near future              2.0
1   new ai test measure fast robot respond user co...              0.0
2   chatgpt rival stable diffusion creator launche...              3.0
3   ai bad spelling image generator actually readi...             -3.0
4                    openai sora everything need know              0.0
5   artificial intelligence boost super resolution...              6.0
6   microsoft copilot ai set operate locally futur...              0.0
7                                     ai bad spelling             -3.0
8   key stable diffusion researcher leave stabilit...              1.0
9   flock io raise decentralized blockchain ai tra...              0.0
10                                 midjourney ai work              0.0
11  microsoft copilot ai set operate locally futur...              0.0
12                  best free ai art generator ranked              4.0
13    

In [7]:
# Step 4: Transformer-based sentiment with DistilBERT
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizer,
    TextClassificationPipeline,
)

# Pre-trained checkpoint; the tokenizer and the model are loaded from the same name
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# Bundle the model and tokenizer into a text-classification pipeline
distilbert_pipeline = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
)

# Define a function to get the sentiment score
def sentiment_score_distilbert(text):
    """Return the pipeline's score for ``text``, rounded to 4 decimal places.

    The value is the ``'score'`` entry of the first prediction returned by
    the module-level ``distilbert_pipeline``.
    """
    top_prediction = distilbert_pipeline(text)[0]
    return round(top_prediction['score'], 4)

# Define a function to get the sentiment label
def sentiment_label_distilbert(text):
    """Return the label the pipeline predicts for ``text``.

    The value is the ``'label'`` entry of the first prediction returned by
    the module-level ``distilbert_pipeline``.
    """
    top_prediction = distilbert_pipeline(text)[0]
    return top_prediction['label']

# Run each cleaned title through the pipeline exactly once and derive both
# columns from that single prediction, so model inference is not repeated
# separately for the score and for the label.
distilbert_results = [distilbert_pipeline(text)[0] for text in news_df['title_clean']]

# 'score' belongs to the predicted 'label' (it is not a signed polarity),
# so the two columns must be read together.
news_df['distilbert_score'] = [round(result['score'], 4) for result in distilbert_results]
news_df['distilbert_label'] = [result['label'] for result in distilbert_results]

# View the DataFrame including the sentiment scores and labels
news_df[['title_clean', 'afinn_sentiment', 'distilbert_score', 'distilbert_label']]


Unnamed: 0,title_clean,afinn_sentiment,distilbert_score,distilbert_label
0,chatgpt change like see near future,2.0,0.6954,NEGATIVE
1,new ai test measure fast robot respond user co...,0.0,0.8759,NEGATIVE
2,chatgpt rival stable diffusion creator launche...,3.0,0.9916,NEGATIVE
3,ai bad spelling image generator actually readi...,-3.0,0.9996,NEGATIVE
4,openai sora everything need know,0.0,0.9811,POSITIVE
5,artificial intelligence boost super resolution...,6.0,0.9973,POSITIVE
6,microsoft copilot ai set operate locally futur...,0.0,0.977,NEGATIVE
7,ai bad spelling,-3.0,0.9997,NEGATIVE
8,key stable diffusion researcher leave stabilit...,1.0,0.9922,NEGATIVE
9,flock io raise decentralized blockchain ai tra...,0.0,0.9119,NEGATIVE


In [9]:
# Save the combined results relative to the working directory (the same
# convention as the scraped-titles CSV) rather than a machine-specific
# absolute path, so the notebook runs on any machine.
news_df.to_csv('sentiment_and_theme_analysis.csv', index=False)