In [None]:
%pip install newsapi-python
%pip install --upgrade transformers
%pip install bert-for-sequence-classification
from newsapi import NewsApiClient
import json
import torch
from wordcloud import WordCloud
import pandas as pd
import os
print(torch.__version__)

In [None]:

# Function to fetch articles from API or load from local file
def get_articles(cache_path='articles.json', api_key=None):
    """Return the NewsAPI response for the Trump/Harris query, cached on disk.

    If ``cache_path`` exists, the JSON stored there is returned and no network
    request is made. Otherwise the articles are requested through
    ``NewsApiClient.get_everything`` and the response is written to
    ``cache_path`` before being returned.

    Args:
        cache_path: Location of the local JSON cache file.
        api_key: NewsAPI key. When omitted it is read from the ``NEWSAPI_KEY``
            environment variable, so the secret never lives in the notebook.

    Returns:
        The decoded JSON cache, or the object returned by ``get_everything``.

    Raises:
        RuntimeError: If a fetch is needed and no API key is available.
    """
    if os.path.exists(cache_path):
        # Load from local file if it exists
        with open(cache_path, 'r', encoding='utf-8') as json_file:
            articles = json.load(json_file)
        print("Loaded articles from local file.")
        return articles

    # Resolve the credential only when a network call is actually required.
    api_key = api_key or os.environ.get('NEWSAPI_KEY')
    if not api_key:
        raise RuntimeError(
            "No NewsAPI key available: pass api_key=... or set the "
            "NEWSAPI_KEY environment variable."
        )

    # Fetch from API if local file does not exist
    newsapi = NewsApiClient(api_key=api_key)
    articles = newsapi.get_everything(q='Trump OR Harris',
                                      from_param='2024-09-19',
                                      to='2024-10-01',
                                      language='en',
                                      sort_by='relevancy',
                                      page_size=2)
    # Save to local file so later runs do not hit the API again
    with open(cache_path, 'w', encoding='utf-8') as json_file:
        json.dump(articles, json_file, indent=4)
    print("Articles fetched from API and saved locally.")

    return articles

# Load the article payload (from the local cache when present, else the API).
articles = get_articles()

# Show the body, summary and link of every returned article, one per line.
for article in articles['articles']:
    print(*(article[field] for field in ('content', 'description', 'url')))

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import word_tokenize

# Download each NLTK resource exactly once:
#   vader_lexicon -> SentimentIntensityAnalyzer
#   punkt         -> word_tokenize / sent_tokenize
#   punkt_tab     -> tokenizer tables that newer NLTK releases look up instead
#                    of 'punkt'; nltk.download only reports (does not raise)
#                    if a resource cannot be fetched
#   stopwords     -> stopwords.words('english')
#   wordnet, omw-1.4 -> WordNetLemmatizer
for resource in ('vader_lexicon', 'punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Set membership keeps the stop-word filter in preprocess() cheap.
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalise a piece of article text for the downstream analysis.

    The text is lower-cased and tokenized, non-alphabetic tokens are dropped,
    the remaining tokens are lemmatized, and lemmas found in the module-level
    ``stop_words`` set are removed.

    Args:
        text: Raw text, or ``None``/empty. The ``content`` and ``description``
            fields passed in by this notebook may be null in an API payload.

    Returns:
        The surviving lemmas joined by single spaces; ``''`` when ``text`` is
        missing or empty.
    """
    if not text:
        # Missing field: nothing to tokenize, and None has no .lower().
        return ''

    # One lemmatizer per call instead of one per token.
    lemmatizer = WordNetLemmatizer()
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalpha()
    )
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

# One flat list of cleaned texts: every article body first, then every
# article description (so the list is twice as long as the article list).
clean_article = [
    preprocess(entry[field])
    for field in ('content', 'description')
    for entry in articles['articles']
]

In [None]:
# Initialize Sentiment Intensity Analyzer
sid = SentimentIntensityAnalyzer()

# Define lists of keywords for Trump and Kamala Harris.
# split_sentence_by_candidate() compares these against individual tokens, so
# only the single-word entries can ever match; the multi-word entries are kept
# for reference. 'harris' is listed on its own because otherwise text that
# mentions only "Harris" would never be attributed to Kamala Harris.
trump_keywords = ['trump', 'donald trump', 'president trump']
kamala_keywords = ['kamala', 'harris', 'kamala harris', 'vice president harris']

def split_sentence_by_candidate(sentence):
    """
    Partition a sentence into text runs attributed to Trump or to Kamala.

    The sentence is lower-cased and tokenized. Tokens accumulate in a running
    phrase; when a token matching the *other* candidate's keyword list shows
    up, the running phrase is handed to the candidate that was active and a
    new phrase starts with that token. Tokens seen before the first keyword
    stay in the first phrase. A sentence with no keyword yields two empty
    lists.

    Returns a ``(trump_phrases, kamala_phrases)`` tuple of lists of strings.
    """
    buckets = {"trump": [], "kamala": []}
    pending = []      # tokens of the phrase currently being built
    active = None     # candidate the pending phrase belongs to, if any

    for token in word_tokenize(sentence.lower()):
        # Trump keywords take precedence, mirroring an if/elif check.
        if token in trump_keywords:
            mentioned = "trump"
        elif token in kamala_keywords:
            mentioned = "kamala"
        else:
            mentioned = None

        if mentioned is not None:
            # Switching candidates closes the phrase of the previous one.
            if active is not None and active != mentioned:
                buckets[active].append(" ".join(pending))
                pending = []
            active = mentioned

        pending.append(token)

    # Whatever is left belongs to the last candidate mentioned.
    if active is not None:
        buckets[active].append(" ".join(pending))

    return buckets["trump"], buckets["kamala"]

def calculate_sentiment_for_candidate_parts(parts):
    """
    Add up the compound polarity score of every text fragment in ``parts``.

    Each fragment is scored with the module-level analyzer ``sid``; the
    result is 0 when ``parts`` is empty.
    """
    return sum(sid.polarity_scores(fragment)['compound'] for fragment in parts)

def calculate_article_sentiment(article):
    """
    Score one article separately for Trump and for Kamala.

    The lower-cased article is split into sentences; each sentence is divided
    into per-candidate fragments and the fragment scores are accumulated.

    Returns a ``(trump_total_score, kamala_total_score)`` tuple.
    """
    # Running totals in the same order split_sentence_by_candidate returns
    # its fragment lists: (trump, kamala).
    running = (0, 0)

    for sentence in sent_tokenize(article.lower()):
        per_candidate = split_sentence_by_candidate(sentence)
        running = tuple(
            total + calculate_sentiment_for_candidate_parts(parts)
            for total, parts in zip(running, per_candidate)
        )

    return running

# Score every cleaned text and keep one record per text for later analysis.
articles_data = []
for article in clean_article:
    trump_score, kamala_score = calculate_article_sentiment(article)

    # Positive => the Trump fragments scored higher; negative => Kamala's did.
    combined_score = trump_score - kamala_score

    print(
        f"Article: {article}",
        f"Trump Score: {trump_score}, Kamala Harris Score: {kamala_score}",
        f"Combined Score (Trump - Kamala): {combined_score}\n",
        sep="\n",
    )
    articles_data.append(dict(
        article=article,
        trump_score=trump_score,
        kamala_score=kamala_score,
        combined_score=combined_score,
    ))



In [None]:

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments

# The tokenizer and the classifier are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# num_labels=3 requests a classification head with three output classes.
model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=3)

In [None]:
from nltk.corpus import wordnet as wn
import pandas as pd

# Read the tab-separated lexicon (word, emotion, association) and reshape it
# to one row per word with one column per emotion; 'word' ends up as the
# first column after reset_index().
nrc = (
    pd.read_csv(
        'NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
        sep='\t',
        names=["word", "emotion", "association"],
    )
    .pivot(index='word', columns='emotion', values='association')
    .reset_index()
)

def get_emotions(text):
    """Tally lexicon emotion associations over the tokens of ``text``.

    The text is lower-cased and tokenized. For every token present in the
    ``word`` column of the module-level ``nrc`` table, that word's value in
    each emotion column is added to the running total for that emotion.

    Returns a dict mapping each emotion column of ``nrc`` to its total.
    """
    # Every column after 'word' is an emotion; all totals start at zero.
    totals = dict.fromkeys(nrc.columns[1:], 0)
    known_words = nrc['word'].values

    for token in word_tokenize(text.lower()):
        if token not in known_words:
            continue
        # First (only expected) lexicon row for this token, minus 'word'.
        row = nrc.loc[nrc['word'] == token].iloc[0, 1:]
        for emotion in row.index:
            totals[emotion] += row[emotion]

    return totals

# Print each cleaned text next to its emotion tallies.
for article in clean_article:
    emotions = get_emotions(article)
    print(f"{article} {emotions}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sentiments = ['positive', 'negative', 'neutral']

# Score every cleaned text once and reuse the compound scores for all three
# buckets, instead of re-running the analyzer for each bucket.
compound_scores = [sid.polarity_scores(s)['compound'] for s in clean_article]

# NOTE(review): the cut-offs are +/-0.5; VADER's own guidance commonly uses
# +/-0.05 - confirm the wider neutral band is intentional.
counts = [
    sum(score > 0.5 for score in compound_scores),          # positive
    sum(score < -0.5 for score in compound_scores),         # negative
    sum(-0.5 <= score <= 0.5 for score in compound_scores), # neutral
]

sns.barplot(x=sentiments, y=counts)
plt.title('Sentiment Distribution for Trump and Harris News Articles')
plt.show()


In [None]:
# One row per record collected in articles_data.
df = pd.DataFrame(articles_data)

# (Re)derive the Trump-minus-Kamala score from the two per-candidate columns.
df['combined_score'] = df['trump_score'].sub(df['kamala_score'])

# First rows, to inspect the structure.
print(df.head())

# Summary statistics for the numeric score columns.
print(df.describe())

# Row count of the frame (one row per entry of articles_data).
print(f"Total number of articles: {len(df)}")

# A positive combined score leans Trump, a negative one leans Kamala;
# rows with a score of exactly 0 are counted for neither.
trump_favored = df['combined_score'].gt(0).sum()
kamala_favored = df['combined_score'].lt(0).sum()
print(f"Articles favoring Trump: {trump_favored}")
print(f"Articles favoring Kamala: {kamala_favored}")

# Overlaid distributions of the two per-candidate sentiment scores.
plt.figure(figsize=(10, 6))
for score_column, colour, legend_label in (
    ('trump_score', 'blue', 'Trump Sentiment'),
    ('kamala_score', 'red', 'Kamala Sentiment'),
):
    sns.histplot(df[score_column], color=colour, label=legend_label, kde=True, bins=20)
plt.title('Sentiment Score Distribution for Trump and Kamala Harris')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Distribution of the combined score (Trump - Kamala).
plt.figure(figsize=(10, 6))
sns.histplot(df['combined_score'], color='green', kde=True, bins=20)
plt.title('Combined Sentiment Score (Trump - Kamala)')
plt.xlabel('Combined Score')
plt.ylabel('Frequency')
plt.show()

# Function to create word cloud
def plot_wordcloud(text, title):
    """Render a word cloud of ``text`` in a new figure headed by ``title``.

    Args:
        text: The text whose words are drawn.
        title: Title shown above the figure.

    Returns:
        None. When ``text`` is empty or whitespace-only, nothing is drawn and
        a short notice is printed instead.
    """
    # WordCloud.generate() raises ValueError when there are no words to draw,
    # which happens here when no row has a non-zero score for a candidate.
    if not text or not text.strip():
        print(f"No text available for word cloud: {title}")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # the axes carry no meaning for an image
    plt.title(title)
    plt.show()

# Concatenate the texts whose score for the candidate is non-zero, then draw
# one word cloud per candidate.
trump_articles = " ".join(df.loc[df['trump_score'].ne(0), 'article'])
kamala_articles = " ".join(df.loc[df['kamala_score'].ne(0), 'article'])

plot_wordcloud(trump_articles, "Word Cloud for Trump Articles")
plot_wordcloud(kamala_articles, "Word Cloud for Kamala Harris Articles")


# Word count of each cleaned text (whitespace-separated tokens).
df['article_length'] = [len(text.split()) for text in df['article']]

# Both candidates' scores against text length, on shared axes.
plt.figure(figsize=(10, 6))
for score_column, legend_label, colour in (
    ('trump_score', 'Trump Score', 'blue'),
    ('kamala_score', 'Kamala Score', 'red'),
):
    sns.scatterplot(x='article_length', y=score_column, data=df, label=legend_label, color=colour)
plt.title('Sentiment Scores vs Article Length')
plt.xlabel('Article Length (Word Count)')
plt.ylabel('Sentiment Score')
plt.legend()
plt.show()


# Text of the row holding the highest / lowest score for each candidate
# (idxmax / idxmin return the first such row label).
most_positive_trump = df.loc[df['trump_score'].idxmax(), 'article']
most_negative_trump = df.loc[df['trump_score'].idxmin(), 'article']
most_positive_kamala = df.loc[df['kamala_score'].idxmax(), 'article']
most_negative_kamala = df.loc[df['kamala_score'].idxmin(), 'article']

print(
    f"Most positive article for Trump: {most_positive_trump}",
    f"Most negative article for Trump: {most_negative_trump}",
    f"Most positive article for Kamala: {most_positive_kamala}",
    f"Most negative article for Kamala: {most_negative_kamala}",
    sep="\n",
)
