Evaluate sentiment methods (VADER, AFINN, TextBlob, K-means) by comparing their predictions with the labelled headline dataset from Git

In [37]:
import pandas as pd

# Read the labelled headline CSV and preview its first rows
labelled_csv = 'labelled_news_headline.csv'
df = pd.read_csv(labelled_csv)
df.head()  # head() defaults to the first 5 rows

Unnamed: 0,title,url,classification,date,publisher,afinn_sentiment
0,Attacks leave a Jewish community on edge as le...,https://www.theguardian.com/us-news/2019/dec/2...,1,29-12-19 19:00,The Guardian,negative
1,US military carries out 'defensive strikes' in...,https://www.theguardian.com/us-news/2019/dec/2...,0,29-12-19 21:16,The Guardian,negative
2,Rebecca Long-Bailey makes opening pitch for La...,https://www.theguardian.com/politics/2019/dec/...,1,29-12-19 22:30,The Guardian,positive
3,"Vaughan Oliver, celebrated 4AD graphic designe...",https://www.theguardian.com/music/2019/dec/29/...,0,29-12-19 21:31,The Guardian,positive
4,'Remarkable' high as Scottish temperature reco...,https://www.theguardian.com/uk-news/2019/dec/2...,1,29-12-19 22:37,The Guardian,positive


In [38]:
# Number of headlines per publisher, printed as plain text
publisher_counts = df['publisher'].value_counts()
print(publisher_counts)

The Guardian    9180
BBC             9164
CNN             8973
Name: publisher, dtype: int64


Clean labelled dataset (Git)

In [39]:
import pandas as pd

# Build the CNN-only evaluation set: keep the CNN rows, decode the numeric
# label into a sentiment string, keep and rename the two columns used by the
# later cells, and drop exact duplicate (headline, label) rows.
file_path = 'labelled_news_headline.csv'
output_path = 'CNN_labelled_news_headline.csv'

# NOTE(review): assumes classification 1 means positive and 0 means negative —
# confirm against the dataset's own documentation. Any other value maps to NaN.
label_names = {1: 'positive', 0: 'negative'}

raw = pd.read_csv(file_path)

# Selecting rows and columns in one .loc call and decoding with .assign builds
# a new frame at every step, so the label decode never writes into a filtered
# slice of the raw data (the original assignment triggered SettingWithCopyWarning).
df = (
    raw.loc[raw['publisher'] == 'CNN', ['title', 'classification']]
    .assign(classification=lambda frame: frame['classification'].map(label_names))
    .rename(columns={'title': 'headline', 'classification': 'label_sentiment'})
    .drop_duplicates()
)

# save new CSV file (without the index column)
df.to_csv(output_path, index=False)

print(f"Filtered data saved to {output_path}")

Filtered data saved to CNN_labelled_news_headline.csv


In [42]:
# (rows, columns) of the current `df`
df.shape

(8206, 2)

Vader

In [43]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# When enabled it would label each headline with VADER's compound score
# (>= 0.05 -> positive, <= -0.05 -> negative, otherwise neutral) and write
# CNN_labelled_news_headline_vader.csv.
# NOTE(review): presumably the Vader figures in the Evaluation section came from
# an earlier run of this cell — confirm, or re-enable it so they are reproducible.

# import pandas as pd

# # load the dataset
# df = pd.read_csv('CNN_labelled_news_headline.csv')

# import pandas as pd
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# # VADER sentiment analyzer
# analyzer = SentimentIntensityAnalyzer()

# # classify sentiment
# def classify_sentiment(text):
#     # sentiment scores
#     scores = analyzer.polarity_scores(text)
#     # compound score
#     compound = scores['compound']
#     # sentiment based on the compound score
#     if compound >= 0.05:
#         return 'positive'
#     elif compound <= -0.05:
#         return 'negative'
#     else:
#         return 'neutral'
    
# df['vader_sentiment'] = df['headline'].apply(classify_sentiment)

# df.to_csv('CNN_labelled_news_headline_vader.csv', index=False)


Afinn

In [46]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# When enabled it would label each headline by the sign of its AFINN score
# (> 0 -> positive, < 0 -> negative, 0 -> neutral) and write
# CNN_labelled_news_headline_afinn.csv.
# NOTE(review): presumably the Afinn figures in the Evaluation section came from
# an earlier run of this cell — confirm, or re-enable it so they are reproducible.

# import pandas as pd

# # load the dataset
# df = pd.read_csv('CNN_labelled_news_headline.csv')

# from afinn import Afinn

# # afinn sentiment analyzer
# afinn = Afinn()

# # classify sentiment
# def classify_sentiment(text):
#     # sentiment score
#     score = afinn.score(text)
#     # sentiment based on the score
#     if score > 0:
#         return 'positive'
#     elif score < 0:
#         return 'negative'
#     else:
#         return 'neutral'
    
# df['afinn_sentiment'] = df['headline'].apply(classify_sentiment)

# df.to_csv('CNN_labelled_news_headline_afinn.csv', index=False)

Textblob

In [49]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# When enabled it would label each headline by the sign of its TextBlob polarity
# (> 0 -> positive, < 0 -> negative, 0 -> neutral) and write
# CNN_labelled_news_headline_textblob.csv.
# NOTE(review): presumably the Textblob figures in the Evaluation section came from
# an earlier run of this cell — confirm, or re-enable it so they are reproducible.

# import pandas as pd

# #load the dataset
# df = pd.read_csv('CNN_labelled_news_headline.csv')

# from textblob import TextBlob

# # classify sentiment
# def classify_sentiment(text):
#     analysis = TextBlob(text)
#     polarity = analysis.sentiment.polarity
    
#     if polarity > 0:
#         return 'positive'
#     elif polarity < 0:
#         return 'negative'
#     else:
#         return 'neutral'
    
# df['textblob_sentiment'] = df['headline'].apply(classify_sentiment)

# df.to_csv('CNN_labelled_news_headline_textblob.csv', index=False)


Kmeans Clustering

In [54]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Cluster the CNN headlines with K-means on TF-IDF features and store each
# headline's cluster, translated to a sentiment string, in `kmeans_sentiment`.

# load dataset
df = pd.read_csv('CNN_labelled_news_headline.csv')

headlines = df['headline'].tolist()

# TF-IDF vectorizer, vocabulary capped at 1000 terms
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(headlines)

# k-means clustering: one cluster per intended class (negative / positive / neutral)
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)

# NOTE(review): K-means cluster ids carry no meaning, so this fixed
# id -> sentiment table is arbitrary; nothing here ties cluster 0 to negative
# headlines. The resulting agreement with the labels should be read with that
# in mind — consider deriving the mapping from the data before trusting it.
inverse_mapping = {0: 'negative', 1: 'positive', 2: 'neutral'}

# kmeans.labels_ already holds one integer cluster id per headline, so the ids
# are mapped directly: no LabelEncoder round-trip and no temporary column.
df['kmeans_sentiment'] = pd.Series(kmeans.labels_, index=df.index).map(inverse_mapping)

df.to_csv('CNN_labelled_news_headline_kmeans.csv', index=False)





Create agreement column (1 = K-means sentiment matches the label, 0 = mismatch)

In [55]:
import pandas as pd

# Flag, for every headline, whether the K-means sentiment agrees with the label.
# Input and output paths are the same, so the K-means CSV is rewritten in place
# with one extra column.
input_file = 'CNN_labelled_news_headline_kmeans.csv'  # Replace with your input CSV file path
output_file = 'CNN_labelled_news_headline_kmeans.csv'  # Replace with the desired output CSV file path

data = pd.read_csv(input_file)

# 'neutral' is grouped with 'positive' on both sides; the comparison is an
# exact (case-sensitive) string match.
non_negative = ['positive', 'neutral']
predicted = data['kmeans_sentiment']
labelled = data['label_sentiment']

# Vectorised masks replace the row-by-row apply(axis=1); missing values match
# neither group, exactly as in the row-wise version.
both_non_negative = predicted.isin(non_negative) & labelled.isin(non_negative)
both_negative = predicted.eq('negative') & labelled.eq('negative')

# 1 = prediction and label fall in the same group, 0 = they do not
data['new_column'] = (both_non_negative | both_negative).astype(int)

# Save the modified data
data.to_csv(output_file, index=False)

print("Processing complete. New CSV file saved as", output_file)


Processing complete. New CSV file saved as CNN_labelled_news_headline_kmeans.csv


Count matching and non-matching predictions

In [56]:
import pandas as pd

# Reload the scored file written by the previous step
data = pd.read_csv('CNN_labelled_news_headline_kmeans.csv')

# Tally how many rows carry each value of 'new_column'
category_counts = data['new_column'].value_counts()

# Grand total across all tallied values
total_occurrences = category_counts.sum()

# Report the tally, then the total on its own line
print(category_counts, f"Total occurrences: {total_occurrences}", sep="\n")

1    4132
0    4074
Name: new_column, dtype: int64
Total occurrences: 8206


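### Evaluation

For each method, 1 = number of headlines whose predicted sentiment agrees with the label, 0 = number that disagree; accuracy = (count of 1) / 8206 headlines.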

### Vader 

1 = 5677

0 = 2529

5677/8206 = 0.6918

= 69%

### Afinn

1 = 5896

0 = 2310

5896/8206 = 0.7185

= 72%

### Textblob

1 = 4729

0 = 3477

4729/8206 = 0.5763

= 58%

### Kmeans

1 = 4132

0 = 4074

4132/8206 = 0.5035

= 50%
