# Data Preprocessing
> Author: Chandan Rao (chandankuma4@iisc.ac.in)

In [65]:
import json
import torch
import pandas as pd
from datetime import datetime
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Load the dataset
df = pd.read_csv('../Dataset/newsapi/raw/NewsDataset.csv')

In [67]:
# Extracting the name value from the source column
df['source'] = df['source'].apply(lambda x: eval(x)["name"] if isinstance(x, str) else x)

df['source']

0                   Forbes
1                   Forbes
2                 Livemint
3             BusinessLine
4        Business Standard
               ...        
12142         BusinessLine
12143         BusinessLine
12144         BusinessLine
12145         BusinessLine
12146         BusinessLine
Name: source, Length: 12147, dtype: object

In [68]:
# Converting 'publishedAt' to datetime and extracting the date and time
df['publishedAt'] = pd.to_datetime(df['publishedAt']).dt.tz_convert(None)

In [69]:
# Removing microseconds from the datetime
df['publishedAt'] = df['publishedAt'].dt.floor('s')

In [70]:
df.publishedAt.head()

0   2024-10-22 16:00:08
1   2024-10-09 22:03:27
2   2024-10-11 05:50:52
3   2024-10-09 19:40:00
4   2024-10-14 13:41:19
Name: publishedAt, dtype: datetime64[ns]

In [71]:
# Remove duplicate rows based on all columns
df.drop_duplicates(inplace=True)

In [72]:
# Retain only the source, author, title, description, publishedAt, url, and content columns
df_filtered = df[['source','author', 'title', 'description', 'publishedAt', 'url', 'content','search_query']]
df_filtered.head()

Unnamed: 0,source,author,title,description,publishedAt,url,content,search_query
0,Forbes,"John Kang, Forbes Staff, \n John Kang, Forbes ...",Hyundai Motor India Shares Slump As Trading Be...,The South Korean car maker’s $3.3 billion shar...,2024-10-22 16:00:08,https://www.forbes.com/sites/johnkang/2024/10/...,Hyundai Motor India managing director Unsoo Ki...,Tata Motors March 2024
1,Forbes,"Gloria Haraito, Forbes Staff, \n Gloria Harait...",What’s Driving The Son Of India’s Richest Woma...,"Amid India’s EV push, steel magnate Sajjan Jin...",2024-10-09 22:03:27,https://www.forbes.com/sites/gloriaharaito/202...,Sajjan (left) and Parth Jindal.\nJSW Group\nTh...,Tata Motors March 2024
2,Livemint,George Skaria,Tata’s next challenge: Leadership void at Trusts,"With Ratan Tata’s passing, the lack of a clear...",2024-10-11 05:50:52,https://www.livemint.com/opinion/ratan-tata-de...,With the passing of Ratan Naval Tata (1937-202...,Tata Motors March 2024
3,BusinessLine,,Ratan Tata over the years - I,"Following the demise of Ratan Tata, here are s...",2024-10-09 19:40:00,https://www.thehindubusinessline.com/multimedi...,"Following the demise of Ratan Tata, here are s...",Tata Motors March 2024
4,Business Standard,Dev Chatterjee,"Tata Capital, Tata Motors Finance merger recei...",Tata Capital-Tata Motors Finance Merger: Throu...,2024-10-14 13:41:19,https://www.business-standard.com/companies/ne...,"Through this merger, Tata Capital aims to attr...",Tata Motors March 2024


In [73]:
# Rename the search_query field to companyName and author to authors
df_filtered.rename(columns={'search_query': 'companyName','author':'authors', 'content':'summary'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.rename(columns={'search_query': 'companyName','author':'authors', 'content':'summary'}, inplace=True)


In [74]:
df_filtered.head()

Unnamed: 0,source,authors,title,description,publishedAt,url,summary,companyName
0,Forbes,"John Kang, Forbes Staff, \n John Kang, Forbes ...",Hyundai Motor India Shares Slump As Trading Be...,The South Korean car maker’s $3.3 billion shar...,2024-10-22 16:00:08,https://www.forbes.com/sites/johnkang/2024/10/...,Hyundai Motor India managing director Unsoo Ki...,Tata Motors March 2024
1,Forbes,"Gloria Haraito, Forbes Staff, \n Gloria Harait...",What’s Driving The Son Of India’s Richest Woma...,"Amid India’s EV push, steel magnate Sajjan Jin...",2024-10-09 22:03:27,https://www.forbes.com/sites/gloriaharaito/202...,Sajjan (left) and Parth Jindal.\nJSW Group\nTh...,Tata Motors March 2024
2,Livemint,George Skaria,Tata’s next challenge: Leadership void at Trusts,"With Ratan Tata’s passing, the lack of a clear...",2024-10-11 05:50:52,https://www.livemint.com/opinion/ratan-tata-de...,With the passing of Ratan Naval Tata (1937-202...,Tata Motors March 2024
3,BusinessLine,,Ratan Tata over the years - I,"Following the demise of Ratan Tata, here are s...",2024-10-09 19:40:00,https://www.thehindubusinessline.com/multimedi...,"Following the demise of Ratan Tata, here are s...",Tata Motors March 2024
4,Business Standard,Dev Chatterjee,"Tata Capital, Tata Motors Finance merger recei...",Tata Capital-Tata Motors Finance Merger: Throu...,2024-10-14 13:41:19,https://www.business-standard.com/companies/ne...,"Through this merger, Tata Capital aims to attr...",Tata Motors March 2024


In [75]:
# Remove the last two words from the companyName field values
df_filtered.loc[:, 'companyName'] = df_filtered['companyName'].apply(lambda x: ' '.join(x.split()[:-2]))

In [76]:
df_filtered.head()

Unnamed: 0,source,authors,title,description,publishedAt,url,summary,companyName
0,Forbes,"John Kang, Forbes Staff, \n John Kang, Forbes ...",Hyundai Motor India Shares Slump As Trading Be...,The South Korean car maker’s $3.3 billion shar...,2024-10-22 16:00:08,https://www.forbes.com/sites/johnkang/2024/10/...,Hyundai Motor India managing director Unsoo Ki...,Tata Motors
1,Forbes,"Gloria Haraito, Forbes Staff, \n Gloria Harait...",What’s Driving The Son Of India’s Richest Woma...,"Amid India’s EV push, steel magnate Sajjan Jin...",2024-10-09 22:03:27,https://www.forbes.com/sites/gloriaharaito/202...,Sajjan (left) and Parth Jindal.\nJSW Group\nTh...,Tata Motors
2,Livemint,George Skaria,Tata’s next challenge: Leadership void at Trusts,"With Ratan Tata’s passing, the lack of a clear...",2024-10-11 05:50:52,https://www.livemint.com/opinion/ratan-tata-de...,With the passing of Ratan Naval Tata (1937-202...,Tata Motors
3,BusinessLine,,Ratan Tata over the years - I,"Following the demise of Ratan Tata, here are s...",2024-10-09 19:40:00,https://www.thehindubusinessline.com/multimedi...,"Following the demise of Ratan Tata, here are s...",Tata Motors
4,Business Standard,Dev Chatterjee,"Tata Capital, Tata Motors Finance merger recei...",Tata Capital-Tata Motors Finance Merger: Throu...,2024-10-14 13:41:19,https://www.business-standard.com/companies/ne...,"Through this merger, Tata Capital aims to attr...",Tata Motors


In [77]:
# Get base url

from urllib.parse import urlparse

def get_base_url(url):
    parsed_url = urlparse(url)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
    return base_url

In [78]:
# Applying the function to the 'url' column
df_filtered['source_url'] = df_filtered['url'].apply(get_base_url)

In [79]:
df_filtered.head()

Unnamed: 0,source,authors,title,description,publishedAt,url,summary,companyName,source_url
0,Forbes,"John Kang, Forbes Staff, \n John Kang, Forbes ...",Hyundai Motor India Shares Slump As Trading Be...,The South Korean car maker’s $3.3 billion shar...,2024-10-22 16:00:08,https://www.forbes.com/sites/johnkang/2024/10/...,Hyundai Motor India managing director Unsoo Ki...,Tata Motors,https://www.forbes.com
1,Forbes,"Gloria Haraito, Forbes Staff, \n Gloria Harait...",What’s Driving The Son Of India’s Richest Woma...,"Amid India’s EV push, steel magnate Sajjan Jin...",2024-10-09 22:03:27,https://www.forbes.com/sites/gloriaharaito/202...,Sajjan (left) and Parth Jindal.\nJSW Group\nTh...,Tata Motors,https://www.forbes.com
2,Livemint,George Skaria,Tata’s next challenge: Leadership void at Trusts,"With Ratan Tata’s passing, the lack of a clear...",2024-10-11 05:50:52,https://www.livemint.com/opinion/ratan-tata-de...,With the passing of Ratan Naval Tata (1937-202...,Tata Motors,https://www.livemint.com
3,BusinessLine,,Ratan Tata over the years - I,"Following the demise of Ratan Tata, here are s...",2024-10-09 19:40:00,https://www.thehindubusinessline.com/multimedi...,"Following the demise of Ratan Tata, here are s...",Tata Motors,https://www.thehindubusinessline.com
4,Business Standard,Dev Chatterjee,"Tata Capital, Tata Motors Finance merger recei...",Tata Capital-Tata Motors Finance Merger: Throu...,2024-10-14 13:41:19,https://www.business-standard.com/companies/ne...,"Through this merger, Tata Capital aims to attr...",Tata Motors,https://www.business-standard.com


In [80]:
df_filtered.head()

Unnamed: 0,source,authors,title,description,publishedAt,url,summary,companyName,source_url
0,Forbes,"John Kang, Forbes Staff, \n John Kang, Forbes ...",Hyundai Motor India Shares Slump As Trading Be...,The South Korean car maker’s $3.3 billion shar...,2024-10-22 16:00:08,https://www.forbes.com/sites/johnkang/2024/10/...,Hyundai Motor India managing director Unsoo Ki...,Tata Motors,https://www.forbes.com
1,Forbes,"Gloria Haraito, Forbes Staff, \n Gloria Harait...",What’s Driving The Son Of India’s Richest Woma...,"Amid India’s EV push, steel magnate Sajjan Jin...",2024-10-09 22:03:27,https://www.forbes.com/sites/gloriaharaito/202...,Sajjan (left) and Parth Jindal.\nJSW Group\nTh...,Tata Motors,https://www.forbes.com
2,Livemint,George Skaria,Tata’s next challenge: Leadership void at Trusts,"With Ratan Tata’s passing, the lack of a clear...",2024-10-11 05:50:52,https://www.livemint.com/opinion/ratan-tata-de...,With the passing of Ratan Naval Tata (1937-202...,Tata Motors,https://www.livemint.com
3,BusinessLine,,Ratan Tata over the years - I,"Following the demise of Ratan Tata, here are s...",2024-10-09 19:40:00,https://www.thehindubusinessline.com/multimedi...,"Following the demise of Ratan Tata, here are s...",Tata Motors,https://www.thehindubusinessline.com
4,Business Standard,Dev Chatterjee,"Tata Capital, Tata Motors Finance merger recei...",Tata Capital-Tata Motors Finance Merger: Throu...,2024-10-14 13:41:19,https://www.business-standard.com/companies/ne...,"Through this merger, Tata Capital aims to attr...",Tata Motors,https://www.business-standard.com


In [81]:
# Remove all rows that contain any null value

# Display the count of null values
print(df_filtered.isnull().sum())

# Remove all rows with any null values
df_cleaned = df_filtered.dropna()

# Display the cleaned DataFrame
print(df_cleaned)

source           0
authors        216
title           24
description      1
publishedAt      0
url              0
summary          0
companyName      0
source_url       0
dtype: int64
                  source                                            authors  \
0                 Forbes  John Kang, Forbes Staff, \n John Kang, Forbes ...   
1                 Forbes  Gloria Haraito, Forbes Staff, \n Gloria Harait...   
2               Livemint                                      George Skaria   
4      Business Standard                                     Dev Chatterjee   
5               Livemint                                      Nikita Prasad   
...                  ...                                                ...   
12142       BusinessLine                        KS Badri Narayanan, Team BL   
12143       BusinessLine                        KS Badri Narayanan, Team BL   
12144       BusinessLine                        KS Badri Narayanan, Team BL   
12145       BusinessLine  

In [82]:
# Verify the count of null values
df_cleaned.isnull().sum()

source         0
authors        0
title          0
description    0
publishedAt    0
url            0
summary        0
companyName    0
source_url     0
dtype: int64

In [83]:
# function computes the compound sentiment score for a given piece of text using the VADER sentiment analysis tool,
# which is useful for quickly assessing whether the sentiment of the text is positive, negative, or neutral.

def get_vader_score(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment = analyzer.polarity_scores(text)
    return sentiment["compound"]

In [85]:
# Add a new column vader to the DataFrame, which contains the sentiment scores generated by the get_vader_score function.
# The function presumably calculates sentiment analysis using the VADER (Valence Aware Dictionary and sEntiment Reasoner) method,
# which provides a score indicating the sentiment of the text in the content column

df_cleaned['summary_vader'] = df_cleaned['summary'].apply(get_vader_score)

df_cleaned['description_vader'] = df_cleaned['description'].apply(get_vader_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['summary_vader'] = df_cleaned['summary'].apply(get_vader_score)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['description_vader'] = df_cleaned['description'].apply(get_vader_score)


In [86]:
df_cleaned.head()

Unnamed: 0,source,authors,title,description,publishedAt,url,summary,companyName,source_url,summary_vader,description_vader
0,Forbes,"John Kang, Forbes Staff, \n John Kang, Forbes ...",Hyundai Motor India Shares Slump As Trading Be...,The South Korean car maker’s $3.3 billion shar...,2024-10-22 16:00:08,https://www.forbes.com/sites/johnkang/2024/10/...,Hyundai Motor India managing director Unsoo Ki...,Tata Motors,https://www.forbes.com,0.0,0.296
1,Forbes,"Gloria Haraito, Forbes Staff, \n Gloria Harait...",What’s Driving The Son Of India’s Richest Woma...,"Amid India’s EV push, steel magnate Sajjan Jin...",2024-10-09 22:03:27,https://www.forbes.com/sites/gloriaharaito/202...,Sajjan (left) and Parth Jindal.\nJSW Group\nTh...,Tata Motors,https://www.forbes.com,0.7783,0.4767
2,Livemint,George Skaria,Tata’s next challenge: Leadership void at Trusts,"With Ratan Tata’s passing, the lack of a clear...",2024-10-11 05:50:52,https://www.livemint.com/opinion/ratan-tata-de...,With the passing of Ratan Naval Tata (1937-202...,Tata Motors,https://www.livemint.com,0.0772,0.6486
4,Business Standard,Dev Chatterjee,"Tata Capital, Tata Motors Finance merger recei...",Tata Capital-Tata Motors Finance Merger: Throu...,2024-10-14 13:41:19,https://www.business-standard.com/companies/ne...,"Through this merger, Tata Capital aims to attr...",Tata Motors,https://www.business-standard.com,0.4939,0.4939
5,Livemint,Nikita Prasad,Ratan Tata passes away at 86: Top business tyc...,"Ratan Tata breathed his last on Wednesday, Oct...",2024-10-09 19:46:32,https://www.livemint.com/companies/people/rata...,"Ratan Tata passes away:Ratan Tata, chairman em...",Tata Motors,https://www.livemint.com,-0.3182,-0.1779


In [87]:
df_cleaned.shape

(11537, 11)

In [None]:
# Save the pre-processed file
df_cleaned.to_csv('../Dataset/newsapi/preprocessed/company_news_preprocessed.csv', index=False)