### Sentiment Labelling

#### Only the news headlines are labelled, because the sentiment analysis is performed on headlines alone

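Evaluate three lexicon-based labellers (VADER, Afinn, TextBlob) by comparing their predictions against a pre-labelled headline dataset taken from a Git repository (`positivum-dataset.csv`).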

In [1]:
import pandas as pd

# Read the pre-labelled headline CSV and preview its first five rows
df = pd.read_csv('positivum-dataset.csv')
df.head()

Unnamed: 0,title,url,classification,date,publisher
0,Attacks leave a Jewish community on edge as le...,https://www.theguardian.com/us-news/2019/dec/2...,1,2019-12-29 19:00:45,The Guardian
1,US military carries out 'defensive strikes' in...,https://www.theguardian.com/us-news/2019/dec/2...,0,2019-12-29 21:16:24,The Guardian
2,Rebecca Long-Bailey makes opening pitch for La...,https://www.theguardian.com/politics/2019/dec/...,1,2019-12-29 22:30:49,The Guardian
3,"Vaughan Oliver, celebrated 4AD graphic designe...",https://www.theguardian.com/music/2019/dec/29/...,0,2019-12-29 21:31:55,The Guardian
4,'Remarkable' high as Scottish temperature reco...,https://www.theguardian.com/uk-news/2019/dec/2...,1,2019-12-29 22:37:24,The Guardian


In [2]:
# Number of labelled headlines contributed by each publisher
publisher_counts = df['publisher'].value_counts()
print(publisher_counts)

publisher
The Guardian    9180
BBC             9164
CNN             8973
Name: count, dtype: int64


Clean the labelled dataset (keep CNN headlines only)

In [3]:
# Reduce the labelled corpus to CNN headlines with human-readable labels.
# Written as a single chain so every step operates on a fresh frame: the
# previous version assigned into df['classification'] on a boolean-filtered
# slice, which triggers pandas' SettingWithCopyWarning.
# NOTE: this cell rebinds `df`, so it cannot be re-run without reloading the
# CSV first (the 'publisher' column no longer exists after the first run).
df = (
    df.loc[df['publisher'] == 'CNN', ['title', 'classification']]
    # 1 -> 'positive', 0 -> 'negative'; any other value maps to NaN and is dropped below
    .assign(classification=lambda cnn: cnn['classification'].map({1: 'positive', 0: 'negative'}))
    .rename(columns={'title': 'headline', 'classification': 'original_sentiment'})
    .dropna()
    .drop_duplicates()
)

In [4]:
# (rows, columns) of the cleaned CNN frame after dropping NaNs and duplicates
df.shape

(8819, 2)

Vader

In [5]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# VADER sentiment analyzer; a single instance is read as a global by the helper functions in this cell
analyzer = SentimentIntensityAnalyzer()

def classify_sentiment(text):
    """Label `text` using VADER's compound score.

    Returns 'positive' when the compound score is >= 0.05, 'negative' when it
    is <= -0.05, and 'neutral' for anything in between.
    """
    compound_score = analyzer.polarity_scores(text)['compound']
    if compound_score >= 0.05:
        return 'positive'
    if compound_score <= -0.05:
        return 'negative'
    return 'neutral'

def calculate_sentiment_score(text):
    """Return the 'compound' entry of VADER's polarity scores for `text`."""
    return analyzer.polarity_scores(text)['compound']

# Build the VADER result frame: a copy of `df` plus the predicted label and the compound score
vader_df = df.assign(
    vader_sentiment=lambda frame: frame['headline'].apply(classify_sentiment),
    sentiment_score=lambda frame: frame['headline'].apply(calculate_sentiment_score),
)

Afinn

In [6]:
from afinn import Afinn

# Afinn sentiment analyzer; a single instance is read as a global by the helper functions in this cell
afinn = Afinn()

def classify_sentiment(text):
    """Label `text` by the sign of its Afinn score.

    Returns 'positive' for a score above zero, 'negative' for a score below
    zero, and 'neutral' otherwise.
    """
    afinn_score = afinn.score(text)
    return 'positive' if afinn_score > 0 else 'negative' if afinn_score < 0 else 'neutral'

def calculate_sentiment_score(text):
    """Return the Afinn score of `text`, exactly as produced by `afinn.score`."""
    afinn_score = afinn.score(text)
    return afinn_score

# Build the Afinn result frame: a copy of `df` plus the predicted label and the raw Afinn score
afinn_df = df.assign(
    afinn_sentiment=lambda frame: frame['headline'].apply(classify_sentiment),
    sentiment_score=lambda frame: frame['headline'].apply(calculate_sentiment_score),
)

Textblob

In [7]:
from textblob import TextBlob

def classify_sentiment(text):
    """Label `text` by the sign of its TextBlob polarity.

    Returns 'positive' for polarity above zero, 'negative' for polarity below
    zero, and 'neutral' otherwise.
    """
    polarity_value = TextBlob(text).sentiment.polarity
    if polarity_value > 0:
        return 'positive'
    if polarity_value < 0:
        return 'negative'
    return 'neutral'
    
def calculate_sentiment_score(text):
    """Return the TextBlob sentiment polarity of `text`."""
    return TextBlob(text).sentiment.polarity

# Build the TextBlob result frame: a copy of `df` plus the predicted label and the polarity score
textblob_df = df.assign(
    textblob_sentiment=lambda frame: frame['headline'].apply(classify_sentiment),
    sentiment_score=lambda frame: frame['headline'].apply(calculate_sentiment_score),
)

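Create an agreement column (`new_column`): 1 when the predicted sentiment agrees with the original label, 0 otherwise. Because the original labels are only positive or negative, a neutral prediction is counted as positive.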

In [8]:
# Map each method's DataFrame to its predicted-sentiment column and the reference-label column
dataframes = {
    'vader_df': {'df': vader_df, 'sentiment_column': 'vader_sentiment', 'label_column': 'original_sentiment'},
    'afinn_df': {'df': afinn_df, 'sentiment_column': 'afinn_sentiment', 'label_column': 'original_sentiment'},
    'textblob_df': {'df': textblob_df, 'sentiment_column': 'textblob_sentiment', 'label_column': 'original_sentiment'}
}

# Add `new_column` to each frame in place: 1 where prediction and label agree, 0 otherwise.
# The loop variable is deliberately not named `df`: at notebook top level that would rebind
# the global `df` (the cleaned labelled dataset) to `textblob_df`, so re-running an earlier
# `df.copy()` cell would silently copy the wrong frame.
for df_name, info in dataframes.items():
    frame = info['df']
    sentiment_col = info['sentiment_column']
    label_col = info['label_column']

    # check and ensure columns exist
    if sentiment_col in frame.columns and label_col in frame.columns:
        predicted = frame[sentiment_col]
        reference = frame[label_col]

        # 'neutral' is grouped with 'positive': the two sides agree when both are
        # non-negative or both are 'negative'. Any other value (e.g. NaN) counts as 0.
        non_negative = ['positive', 'neutral']
        agrees = (
            (predicted.isin(non_negative) & reference.isin(non_negative))
            | (predicted.eq('negative') & reference.eq('negative'))
        )

        # Vectorised replacement for the row-wise apply; int64 matches the 1/0 ints it produced
        # and, unlike apply(axis=1), this also works on an empty frame.
        frame['new_column'] = agrees.astype('int64')

        print(f"Processing complete for {df_name}. DataFrame has been updated.")
    else:
        print(f"Columns {sentiment_col} or {label_col} not found in {df_name}.")


Processing complete for vader_df. DataFrame has been updated.
Processing complete for afinn_df. DataFrame has been updated.
Processing complete for textblob_df. DataFrame has been updated.


Count how often each method agrees with the original label (1 = agreement, 0 = disagreement)

In [9]:
# Frames to summarise, each with the name of its agreement column
dataframes = {
    'vader_df': {'df': vader_df, 'new_column': 'new_column'},
    'afinn_df': {'df': afinn_df, 'new_column': 'new_column'},
    'textblob_df': {'df': textblob_df, 'new_column': 'new_column'}
}

# Print the agree (1) / disagree (0) counts per method.
# The loop variable is deliberately not named `df`: at notebook top level that would rebind
# the global `df` (the cleaned labelled dataset) to the last frame in the loop.
for df_name, info in dataframes.items():
    frame = info['df']
    new_col = info['new_column']

    # check and ensure the new_column exists
    if new_col in frame.columns:
        # count occurrences of each unique value in the new_column
        category_counts = frame[new_col].value_counts()

        # total number of rows counted (value_counts skips NaN by default)
        total_occurrences = category_counts.sum()

        print(f"\nResults for {df_name}:")
        print(category_counts)
        print("Total occurrences:", total_occurrences)
    else:
        print(f"Column {new_col} not found in {df_name}.")



Results for vader_df:
new_column
1    5762
0    3057
Name: count, dtype: int64
Total occurrences: 8819

Results for afinn_df:
new_column
1    5896
0    2923
Name: count, dtype: int64
Total occurrences: 8819

Results for textblob_df:
new_column
1    4913
0    3906
Name: count, dtype: int64
Total occurrences: 8819


### Evaluation 

#### Vader 

1 = 5762

0 = 3057

5762/8819 = 0.6534

= 65%

#### Afinn

1 = 5896

0 = 2923

5896/8819 = 0.6686

= 67%

#### Textblob

1 = 4913

0 = 3906

4913/8819 = 0.5571

= 56%

Apply Afinn, the method with the highest agreement above (67%), to the original dataset (`cnn_news_articles_final_cleaned.csv`)

In [10]:
# Load the CNN article dataset; this rebinds `df`, replacing the frame used in the cells above
df = pd.read_csv('cnn_news_articles_final_cleaned.csv')

from afinn import Afinn

# Afinn sentiment analyzer; read as a global by the two helper functions defined in this cell
afinn = Afinn()

def classify_sentiment(text):
    """Map the Afinn score of `text` to a label.

    A score above zero gives 'positive', a score below zero gives 'negative',
    and anything else gives 'neutral'.
    """
    headline_score = afinn.score(text)
    if headline_score > 0:
        return 'positive'
    if headline_score < 0:
        return 'negative'
    return 'neutral'

def calculate_sentiment_score(text):
    """Return the value of `afinn.score(text)` unchanged."""
    headline_score = afinn.score(text)
    return headline_score
    
# Attach the Afinn label and score to every headline
df = df.assign(
    sentiment=lambda articles: articles['headline'].apply(classify_sentiment),
    sentiment_score=lambda articles: articles['headline'].apply(calculate_sentiment_score),
)

# Write the enriched frame back to the same CSV it was loaded from (the input file is overwritten)
df.to_csv('cnn_news_articles_final_cleaned.csv', index=False)

In [11]:
# Distribution of the Afinn labels assigned to the headlines in `df`
df['sentiment'].value_counts()

sentiment
neutral     16863
negative    14666
positive    10159
Name: count, dtype: int64