# Import Required Libraries

In [1]:
# pip install spacy

In [2]:
# python -m spacy download en_core_web_sm

In [3]:
import pandas as pd
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [4]:
# Load the spaCy 'en_core_web_sm' model; extract_entities() below runs every
# description through this pipeline to obtain its named entities
nlp = spacy.load("en_core_web_sm")

In [5]:
# Initialize the VADER SentimentIntensityAnalyzer once; calculate_sentiment()
# below reuses this single instance for every row
analyzer = SentimentIntensityAnalyzer()

# Load Dataset

In [6]:
# Load the tab-separated CNN_Editors.tsv file into a pandas DataFrame
df = pd.read_csv('CNN_Editors.tsv', delimiter='\t')
df.head()

Unnamed: 0,Year,Description,Conference,Title,Author Names
0,2022,Here's a look at how the pandemic reshaped peo...,business,"Two years later, remote work has changed milli...",Kathryn Vasel
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,"Paul R. La Monica, Business"
2,2022,"As crude prices surge, oil companies are rakin...",business,Stocks week ahead: Big Oil rakes in billions a...,"Chris Isidore, Business"
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,"Matt Egan, Business"
4,2022,The convenience of digital payments to both co...,business,Opinion: Technology is transforming the nature...,"As told to by Chris Wellisz, International Mon..."


# Preprocess dataset

In [7]:
# Remove the original 'Author Names' column (the article authors); the same
# column name is reused further down for the entities extracted from 'Description'
df.drop(columns=['Author Names'], inplace=True)
df.head()

Unnamed: 0,Year,Description,Conference,Title
0,2022,Here's a look at how the pandemic reshaped peo...,business,"Two years later, remote work has changed milli..."
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN
2,2022,"As crude prices surge, oil companies are rakin...",business,Stocks week ahead: Big Oil rakes in billions a...
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...
4,2022,The convenience of digital payments to both co...,business,Opinion: Technology is transforming the nature...


## Extract named entities

In [8]:
def extract_entities(text):
    """Return the named entities found in *text* as one ';'-joined string.

    Args:
        text: The text to analyse. Anything that is not a ``str`` (for
            example the float NaN pandas uses for a missing cell, or
            ``None``) is treated as having no entities.

    Returns:
        The entity texts in document order joined by ';', or an empty
        string when there are no entities or *text* is not a string.
    """
    # The spaCy pipeline only accepts text; guard so that a missing
    # description yields an empty result instead of aborting the whole apply().
    if not isinstance(text, str):
        return ''
    doc = nlp(text)
    return ';'.join(ent.text for ent in doc.ents)

# Run extract_entities on every 'Description' and store the ';'-joined result
# in a new 'Author Names' column (the column name is reused for the entities)
df['Author Names'] = df['Description'].apply(extract_entities)
df.head()

Unnamed: 0,Year,Description,Conference,Title,Author Names
0,2022,Here's a look at how the pandemic reshaped peo...,business,"Two years later, remote work has changed milli...",
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,March;this March
2,2022,"As crude prices surge, oil companies are rakin...",business,Stocks week ahead: Big Oil rakes in billions a...,
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,Russia;Ukraine;the International Energy Agency...
4,2022,The convenience of digital payments to both co...,business,Opinion: Technology is transforming the nature...,


In [9]:
# Flag the rows whose 'Author Names' value is either NaN or an empty string
is_blank = df['Author Names'].isna() | df['Author Names'].eq('')
empty_or_nan_count = is_blank.sum()

# Every remaining row holds at least one extracted entity
non_empty_count = len(df) - empty_or_nan_count

print(f"Empty or NaN rows: {empty_or_nan_count}")
print(f"Non-empty rows: {non_empty_count}")

Empty or NaN rows: 1072
Non-empty rows: 36871


In [10]:
# Remove all the rows that are empty in the 'Author Names' column.
# The comparison is vectorized instead of a per-row lambda, and .copy() makes
# the result an independent DataFrame so that the column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['Author Names'] != ''].copy()
df.head()

Unnamed: 0,Year,Description,Conference,Title,Author Names
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,March;this March
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,Russia;Ukraine;the International Energy Agency...
5,2022,"Inflation is everywhere: grocery stores, gas s...",business,Inflation is everywhere. Except your cell phon...,one
6,2022,Burger King is trying to suspend its operation...,business,Burger King partner 'refuses' to close 800 Rus...,Burger King;Russia;800
7,2022,The White House spent much of Friday frustrate...,business,White House 'appalled' at Axios over Ukraine a...,The White House;Friday;Axios;Ukraine


## Sentiment analysis

In [11]:
# Function to calculate normalized sentiment score
def calculate_sentiment(text):
    """Return the VADER compound score of *text* rescaled to [0, 1].

    The module-level ``analyzer`` supplies the polarity scores; only the
    'compound' entry is used.
    """
    compound = analyzer.polarity_scores(text)['compound']
    # Shift by one and halve to normalize the compound score to range [0, 1]
    return (compound + 1) / 2

# Score every 'Description' with calculate_sentiment and keep the normalized
# value in a new 'Sentiment Score' column
df['Sentiment Score'] = df['Description'].apply(calculate_sentiment)
df.head()

Unnamed: 0,Year,Description,Conference,Title,Author Names,Sentiment Score
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,March;this March,0.18755
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,Russia;Ukraine;the International Energy Agency...,0.1596
5,2022,"Inflation is everywhere: grocery stores, gas s...",business,Inflation is everywhere. Except your cell phon...,one,0.8062
6,2022,Burger King is trying to suspend its operation...,business,Burger King partner 'refuses' to close 800 Rus...,Burger King;Russia;800,0.11415
7,2022,The White House spent much of Friday frustrate...,business,White House 'appalled' at Axios over Ukraine a...,The White House;Friday;Axios;Ukraine,0.4742


In [12]:
# Function to categorize sentiment
def categorize_sentiment(score):
    """Map a normalized sentiment score to a label.

    Scores below 0.4 are 'Negative', scores from 0.4 up to and including
    0.6 are 'Neutral', and everything else is 'Positive'.
    """
    if score < 0.4:
        return 'Negative'
    # Reaching this point means the score is not below 0.4, so only the
    # upper bound of the neutral band still needs checking.
    if score <= 0.6:
        return 'Neutral'
    return 'Positive'

# Label every row as Negative / Neutral / Positive from its 'Sentiment Score'
# and store the label in a new 'Sentiment' column
df['Sentiment'] = df['Sentiment Score'].apply(categorize_sentiment)
df.head()

Unnamed: 0,Year,Description,Conference,Title,Author Names,Sentiment Score,Sentiment
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,March;this March,0.18755,Negative
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,Russia;Ukraine;the International Energy Agency...,0.1596,Negative
5,2022,"Inflation is everywhere: grocery stores, gas s...",business,Inflation is everywhere. Except your cell phon...,one,0.8062,Positive
6,2022,Burger King is trying to suspend its operation...,business,Burger King partner 'refuses' to close 800 Rus...,Burger King;Russia;800,0.11415,Negative
7,2022,The White House spent much of Friday frustrate...,business,White House 'appalled' at Axios over Ukraine a...,The White House;Friday;Axios;Ukraine,0.4742,Neutral


## Split important words

In [13]:
# Expand the ';'-separated 'Author Names' string into one column per entity
important_words_split = df['Author Names'].str.split(';', expand=True)

# Label the new columns Author Names1, Author Names2, Author Names3, ... (1-based)
important_words_split.columns = [
    f'Author Names{position}'
    for position in range(1, important_words_split.shape[1] + 1)
]

# Swap the combined 'Author Names' column for the per-entity columns
df = pd.concat(
    [df.drop(columns=['Author Names']), important_words_split],
    axis=1,
)

In [14]:
# Preview the frame after the 'Author Names' column was split into numbered columns
df.head()

Unnamed: 0,Year,Description,Conference,Title,Sentiment Score,Sentiment,Author Names1,Author Names2,Author Names3,Author Names4,...,Author Names6,Author Names7,Author Names8,Author Names9,Author Names10,Author Names11,Author Names12,Author Names13,Author Names14,Author Names15
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,0.18755,Negative,March,this March,,,...,,,,,,,,,,
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,0.1596,Negative,Russia,Ukraine,the International Energy Agency,Friday,...,,,,,,,,,,
5,2022,"Inflation is everywhere: grocery stores, gas s...",business,Inflation is everywhere. Except your cell phon...,0.8062,Positive,one,,,,...,,,,,,,,,,
6,2022,Burger King is trying to suspend its operation...,business,Burger King partner 'refuses' to close 800 Rus...,0.11415,Negative,Burger King,Russia,800,,...,,,,,,,,,,
7,2022,The White House spent much of Friday frustrate...,business,White House 'appalled' at Axios over Ukraine a...,0.4742,Neutral,The White House,Friday,Axios,Ukraine,...,,,,,,,,,,


## Melt

In [15]:
# Separate the numbered entity columns from the columns that describe the article
entity_columns = [name for name in df.columns if name.startswith('Author Names')]
article_columns = [name for name in df.columns if not name.startswith('Author Names')]

# Melt to one row per (article, entity) pair, then discard the NaN
# placeholders left for articles with fewer entities than there are columns
df_melted = df.melt(
    id_vars=article_columns,
    value_vars=entity_columns,
    var_name='Author Names Column',
    value_name='Author Names',
).dropna(subset=['Author Names'])
df_melted.head()

Unnamed: 0,Year,Description,Conference,Title,Sentiment Score,Sentiment,Author Names Column,Author Names
0,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,0.18755,Negative,Author Names1,March
1,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,0.1596,Negative,Author Names1,Russia
2,2022,"Inflation is everywhere: grocery stores, gas s...",business,Inflation is everywhere. Except your cell phon...,0.8062,Positive,Author Names1,one
3,2022,Burger King is trying to suspend its operation...,business,Burger King partner 'refuses' to close 800 Rus...,0.11415,Negative,Author Names1,Burger King
4,2022,The White House spent much of Friday frustrate...,business,White House 'appalled' at Axios over Ukraine a...,0.4742,Neutral,Author Names1,The White House


In [16]:
# Drop the columns that are not exported: 'Description', 'Author Names Column',
# 'Sentiment Score' and the original 'Conference'
df_melted = df_melted.drop(columns=['Description', 'Author Names Column', 'Sentiment Score', 'Conference'])
df_melted.head()

Unnamed: 0,Year,Title,Sentiment,Author Names
0,2022,Why March is so volatile for stocks - CNN,Negative,March
1,2022,Oil 'emergency': Work from home and drive slow...,Negative,Russia
2,2022,Inflation is everywhere. Except your cell phon...,Positive,one
3,2022,Burger King partner 'refuses' to close 800 Rus...,Negative,Burger King
4,2022,White House 'appalled' at Axios over Ukraine a...,Neutral,The White House


In [17]:
# Rename the column 'Sentiment' to 'Conference'; the original 'Conference'
# column was dropped in the previous step, so the name is free again
df_melted.rename(columns={'Sentiment': 'Conference'}, inplace=True)

In [18]:
# Preview the final layout: Year, Title, Conference (sentiment label), Author Names (entity)
df_melted.head()

Unnamed: 0,Year,Title,Conference,Author Names
0,2022,Why March is so volatile for stocks - CNN,Negative,March
1,2022,Oil 'emergency': Work from home and drive slow...,Negative,Russia
2,2022,Inflation is everywhere. Except your cell phon...,Positive,one
3,2022,Burger King partner 'refuses' to close 800 Rus...,Negative,Burger King
4,2022,White House 'appalled' at Axios over Ukraine a...,Neutral,The White House


## Aggregate

# Save Preprocessed dataset

In [19]:
# Export the DataFrame to a new tab-separated .tsv file, leaving out the row index
df_melted.to_csv('CNN_Sentiment2.tsv', sep='\t', index=False)