# Text cleaning

In [1]:
import requests
import time
import tqdm
import pandas as pd
import os
import json
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
import re
import nltk

# Importing articles from ABC News and CNN and merging them into one dataset

In [2]:
# Locations of the scraped article dumps, one CSV per news outlet
abc_csv_path = '/Users/astakettel/Desktop/ISDS/GitHub/ISDS-sentiment-analysis/Webscraping/ABC_full_data.csv'
cnn_csv_path = '/Users/astakettel/Desktop/ISDS/GitHub/ISDS-sentiment-analysis/Webscraping/CNN_full_data.csv'

# Read each dump into its own DataFrame
data_ABC = pd.read_csv(abc_csv_path)
data_CNN = pd.read_csv(cnn_csv_path)

In [3]:
# CNN: keep only the columns we need, then rename them to the shared schema
# (title / url / date / article_text / source)
cnn_columns_to_keep = ['headline', 'url', 'lastModifiedDate', 'article_text', 'source']
cnn_renames = {'headline': 'title', 'lastModifiedDate': 'date'}
data_CNN = data_CNN[cnn_columns_to_keep].rename(columns=cnn_renames)

# ABC: only the column names differ (capitalisation), so a rename is enough
abc_renames = {'URL': 'url', 'Title': 'title', 'Date': 'date'}
data_ABC = data_ABC.rename(columns=abc_renames)

In [None]:
# Formatting the Date columns to be in the same format: 

In [4]:
# CNN dates also carry a time-of-day part. Strip everything from the first 'T'
# through the last 'Z' so that only the calendar date is left before parsing.
cnn_dates_without_time = data_CNN['date'].str.replace(r'T.*Z', '', regex=True)

# Parse both outlets' dates to datetime; values that cannot be parsed become NaT
data_CNN['date'] = pd.to_datetime(cnn_dates_without_time, errors='coerce')
data_ABC['date'] = pd.to_datetime(data_ABC['date'], errors='coerce')

In [5]:
# Stack the ABC rows on top of the CNN rows to get one combined DataFrame
article_frames = [data_ABC, data_CNN]
data = pd.concat(article_frames)

In [6]:
# Order the articles chronologically by their 'date' column (oldest first)
data = data.sort_values('date', ascending=True)

## What is your document?

Our document is the article as a whole: all text in the article, excluding image descriptions and author tags.

## Preprocessing
- Clean text: ignore/remove any unwanted characters: casing, HTML markup, non-words, etc. (maybe also emoticons?)
- Tokenization and stop-words
- Stemming and lemmatization

***Removing NA values*** on the column "article_text"

In [7]:
# Discard articles that have no text at all, then renumber the rows from 0
data = data.dropna(subset=['article_text']).reset_index(drop=True)

***Cleaning the text***

In [8]:
def cleaner(document):
    """Normalise the raw text of one article.

    The text is lower-cased and then passed through a fixed sequence of regex
    substitutions (order matters: each one works on the previous result).

    Parameters
    ----------
    document : str
        Raw article text.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so runs of spaces can
        remain where tags or symbols were removed.
    """
    substitutions = (
        (r'<[^>]*>', ' '),      # HTML tags -> a single space
        (r'[^\w\s&$€%]', ''),   # drop everything except word chars, whitespace, &, $, € and %
        # Drop the literal "&151" -- presumably what is left of an "&#151;"
        # entity once '#' and ';' were stripped by the previous step (confirm).
        (r'&151', ''),
    )

    cleaned = document.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean every article: lower-case it and strip markup and punctuation
data['article_text'] = data['article_text'].apply(cleaner)

# Checking for duplicates:
# A bare expression in the middle of a cell is evaluated and then thrown away
# (only the *last* expression of a cell is displayed), so report the number of
# duplicated titles and texts explicitly.
n_duplicate_titles = data['title'].duplicated().sum()
n_duplicate_texts = data['article_text'].duplicated().sum()
print(f"Rows with an already-seen title: {n_duplicate_titles}")
print(f"Rows with an already-seen article text: {n_duplicate_texts}")

# Removing the duplicate articles:
# first by title, then by (cleaned) text; the first occurrence is kept each
# time, and the index is renumbered from 0 afterwards.
data = data.drop_duplicates(subset=['title']).reset_index(drop=True)
data = data.drop_duplicates(subset=['article_text']).reset_index(drop=True)

***Tokenization***
- Splitting the articles into meaningful elements to prepare for analysis. In our case we need to split the articles into words, as these are what will be used for classifying sentiment.

In [9]:
# Tokenise: break each article's text on whitespace and store the resulting
# list of tokens in a new "words" column
article_tokens = data['article_text'].str.split()
data['words'] = article_tokens

***Removing stop words***
- These are words that occur very often and probably bear no useful information about the text

In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop-word list; `stop` stays a list, a set copy is used for fast lookups
stop = stopwords.words('english')
stop_lookup = set(stop)

# Each entry of data['words'] is the token list of one article, so the
# filtering has to happen *inside* each list. (Comparing a whole list against
# the stop words, as the previous comprehension did, is always "not in", so
# nothing was ever removed.)
# NOTE(review): the cleaning step strips apostrophes, so contractions such as
# "don't" appear as "dont" in the text and will not match the stop-word list.
data['words'] = data['words'].apply(
    lambda tokens: [token for token in tokens if token not in stop_lookup]
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/astakettel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


***Saving as a CSV file***

In [11]:
# Persist the cleaned, tokenised articles (the row index is not written)
cleaned_csv_path = '/Users/astakettel/Desktop/ISDS/GitHub/ISDS-sentiment-analysis/EN_cleaned.csv'
data.to_csv(cleaned_csv_path, index=False)

# Using the VADER lexicon for sentiment scores

## Computing sentiment scores from -1 to 1 and binary 

In [12]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon (nltk reports "already up-to-date" if it is present)
nltk.download('vader_lexicon')

# A single analyser instance, shared by every call to get_sentiment below
analyser = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """Return the VADER 'compound' sentiment score for ``text``.

    Parameters
    ----------
    text : str
        The text to score.

    Returns
    -------
    The value stored under the 'compound' key of ``analyser.polarity_scores``.
    """
    scores = analyser.polarity_scores(text)
    return scores['compound']


# Score every article and keep the result in a new 'sentiment' column
data['sentiment'] = data['article_text'].map(get_sentiment)

# Show the first few rows as a quick sanity check
preview_columns = ['article_text', 'sentiment']
print(data[preview_columns].head())

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/astakettel/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


                                        article_text  sentiment
0    the end of the tv writers strike cleared the...     0.9851
1    the end of the tv writers strike cleared the...     0.9892
2    if the complex deal comes together in time a...     0.9674
3    the companies which were ready to announce a...     0.7829
4    delta air lines and northwest airlines on mo...     0.9684


In [13]:
# Binary sentiment label derived from the compound score:
#   score > 0  -> 'positive'
#   otherwise  -> 'negative'  (zero, negative and missing scores)
# This is one vectorised comparison instead of a per-row `.loc[i, ...]` loop.
# The loop was only correct while the index was exactly 0..n-1 (any other index
# makes `.loc` assignment append new rows) and was slow on large frames.
is_positive = data['sentiment'].gt(0)
data['pos/neg'] = is_positive.map({True: 'positive', False: 'negative'})

## Saving as csv

In [14]:
# Persist the articles together with their sentiment score and label
# (the row index is not written)
sentiment_csv_path = '/Users/astakettel/Desktop/ISDS/GitHub/ISDS-sentiment-analysis/Webscraping/EN_clean_sent.csv'
data.to_csv(sentiment_csv_path, index=False)