# Import Required Libraries

In [1]:
# pip install spacy

In [2]:
# python -m spacy download en_core_web_sm

In [3]:
import pandas as pd
import spacy

In [4]:
# Build the spaCy pipeline once at module level; extract_entities() below reads its `doc.ents`
nlp = spacy.load(name="en_core_web_sm")

# Load Dataset

In [5]:
# Read the tab-separated CNN_Editors.tsv file into a pandas DataFrame
df = pd.read_csv('CNN_Editors.tsv', sep='\t')
df.head()

Unnamed: 0,Year,Description,Conference,Title,Author Names
0,2022,Here's a look at how the pandemic reshaped peo...,business,"Two years later, remote work has changed milli...",Kathryn Vasel
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,"Paul R. La Monica, Business"
2,2022,"As crude prices surge, oil companies are rakin...",business,Stocks week ahead: Big Oil rakes in billions a...,"Chris Isidore, Business"
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,"Matt Egan, Business"
4,2022,The convenience of digital payments to both co...,business,Opinion: Technology is transforming the nature...,"As told to by Chris Wellisz, International Mon..."


# Preprocess dataset

In [6]:
# Remove the original 'Author Names' column (a column with the same name is
# rebuilt from the extracted entities further down).
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: a second run no longer raises KeyError
# because the column is already gone.
df = df.drop(columns=['Author Names'], errors='ignore')
df.head()

Unnamed: 0,Year,Description,Conference,Title
0,2022,Here's a look at how the pandemic reshaped peo...,business,"Two years later, remote work has changed milli..."
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN
2,2022,"As crude prices surge, oil companies are rakin...",business,Stocks week ahead: Big Oil rakes in billions a...
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...
4,2022,The convenience of digital payments to both co...,business,Opinion: Technology is transforming the nature...


## Extract named entities

In [7]:
# Function to extract named entities from a text
# def extract_entities(text):
#     doc = nlp(text)
#     return [(ent.text, ent.label_) for ent in doc.ents]
def extract_entities(text):
    """Return the named entities spaCy finds in ``text`` as one ';'-joined string.

    Parameters
    ----------
    text : str
        Raw text to analyse. Any value that is not a string (for example the
        float NaN pandas uses for a missing cell, or None) and the empty
        string are treated as containing no entities.

    Returns
    -------
    str
        The ``ent.text`` of every entity in ``doc.ents``, in the order spaCy
        reports them, separated by ';'. An empty string when there is nothing
        to analyse or no entity is found.
    """
    # Guard before touching the pipeline: handing a non-string such as NaN to
    # `nlp` raises and would abort the whole `.apply` over the column.
    if not isinstance(text, str) or not text:
        return ''
    doc = nlp(text)
    return ';'.join(ent.text for ent in doc.ents)

# Run entity extraction on every 'Description' value and store the
# ';'-joined result per row.
# NOTE(review): the result is stored under the previously dropped
# 'Author Names' header — presumably to match a schema expected downstream;
# confirm before renaming.
entity_strings = df['Description'].apply(extract_entities)
df['Author Names'] = entity_strings
df.head()

Unnamed: 0,Year,Description,Conference,Title,Author Names
0,2022,Here's a look at how the pandemic reshaped peo...,business,"Two years later, remote work has changed milli...",
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,March;this March
2,2022,"As crude prices surge, oil companies are rakin...",business,Stocks week ahead: Big Oil rakes in billions a...,
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,Russia;Ukraine;the International Energy Agency...
4,2022,The convenience of digital payments to both co...,business,Opinion: Technology is transforming the nature...,


In [8]:
# Flag rows whose 'Author Names' value is missing (NaN) or an empty string.
# The two conditions can never both hold for one row, so summing the combined
# mask gives the same total as adding the two separate counts.
author_col = df['Author Names']
is_blank = author_col.isna() | author_col.eq('')
empty_or_nan_count = is_blank.sum()

# Every remaining row carries at least one extracted entity
non_empty_count = len(df) - empty_or_nan_count

print(f"Empty or NaN rows: {empty_or_nan_count}")
print(f"Non-empty rows: {non_empty_count}")

Empty or NaN rows: 1072
Non-empty rows: 36871


In [9]:
# Keep only the rows that have at least one extracted entity in 'Author Names'.
# Both '' and NaN are dropped, matching the "empty or NaN" definition used by
# the count in the previous cell (the old `x != ''` test let NaN rows through).
# Vectorized comparisons replace the per-row lambda.
has_entities = df['Author Names'].notna() & df['Author Names'].ne('')
df = df[has_entities]
df.head()

Unnamed: 0,Year,Description,Conference,Title,Author Names
1,2022,March Madness isn't just for college basketbal...,business,Why March is so volatile for stocks - CNN,March;this March
3,2022,Governments around the world must consider dra...,business,Oil 'emergency': Work from home and drive slow...,Russia;Ukraine;the International Energy Agency...
5,2022,"Inflation is everywhere: grocery stores, gas s...",business,Inflation is everywhere. Except your cell phon...,one
6,2022,Burger King is trying to suspend its operation...,business,Burger King partner 'refuses' to close 800 Rus...,Burger King;Russia;800
7,2022,The White House spent much of Friday frustrate...,business,White House 'appalled' at Axios over Ukraine a...,The White House;Friday;Axios;Ukraine


## Split important words

## Melt

In [10]:
# Drop the 'Description' column now that its entities have been extracted;
# 'Author Names' is kept because it holds the extraction result.
# errors='ignore' makes the cell safe to re-run: a second run finds the column
# already gone and leaves the frame unchanged instead of raising KeyError.
df = df.drop(columns=['Description'], errors='ignore')
df.head()

Unnamed: 0,Year,Conference,Title,Author Names
1,2022,business,Why March is so volatile for stocks - CNN,March;this March
3,2022,business,Oil 'emergency': Work from home and drive slow...,Russia;Ukraine;the International Energy Agency...
5,2022,business,Inflation is everywhere. Except your cell phon...,one
6,2022,business,Burger King partner 'refuses' to close 800 Rus...,Burger King;Russia;800
7,2022,business,White House 'appalled' at Axios over Ukraine a...,The White House;Friday;Axios;Ukraine


## Aggregate

# Save Preprocessed dataset

In [11]:
# Write the preprocessed frame to CNN2.tsv as tab-separated text; the pandas
# row index is not written out.
df.to_csv(path_or_buf='CNN2.tsv', index=False, sep='\t')