# Data Preprocessing

## Environment Setup

In [None]:
# Install the packages listed in ../requirements.txt into the environment of the running kernel.
%pip install -r '../requirements.txt'

In [None]:
import json
import glob
import torch
import pandas as pd
from datetime import datetime
from transformers import pipeline
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
import nltk
# Download NLTK's 'popular' resource collection.
# NOTE(review): none of the cells visible in this notebook call nltk directly — confirm this download is still needed.
nltk.download('popular')

In [None]:
# check if torch finds cuda
# (being the last expression of the cell, the boolean result is displayed as the cell output)
torch.cuda.is_available()

In [None]:
# CONSTANTS
# Run on the GPU when PyTorch can see a CUDA device, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Raw scraper dump that this notebook run preprocesses.
FILENAME = '../Dataset/scraper/raw/rawdata_1731148570.344731.json'

## Data Processing

In [None]:
# Shared summarization pipeline, built once and placed on DEVICE.
# No model name is passed, so transformers picks its default checkpoint for the task.
summarizer = pipeline(task="summarization", device=DEVICE)
def get_summary(text):
    """Return a machine-generated summary of ``text``.

    The text is passed to the module-level ``summarizer`` pipeline with
    truncation enabled, a summary length bounded to 50-300 tokens, and
    low-temperature sampling (``do_sample=True, temperature=0.3``), so
    repeated calls on the same text may produce slightly different output.

    Args:
        text: Article body to summarize.

    Returns:
        The ``summary_text`` of the first pipeline result, or ``''`` when
        ``text`` is not a string or contains only whitespace.
    """
    # Missing values (None / NaN) and blank strings carry nothing to
    # summarize; return an empty summary instead of handing them to the model.
    if not isinstance(text, str) or not text.strip():
        return ''
    summary = summarizer(text, truncation=True, max_length=300, min_length=50, do_sample=True, temperature=0.3)
    return summary[0]['summary_text']

In [None]:
# Lazily created, shared analyzer: constructing SentimentIntensityAnalyzer
# for every row is wasted work when this function is applied to a column.
_vader_analyzer = None


def get_vader_score(text):
    """Return the VADER ``compound`` sentiment score of ``text``.

    Args:
        text: Text to score. Surrounding whitespace is ignored.

    Returns:
        The ``compound`` entry of the analyzer's polarity scores, or ``0``
        when ``text`` is not a string or is empty after stripping.
    """
    global _vader_analyzer
    if not isinstance(text, str):
        return 0
    text = text.strip()
    # Check emptiness after stripping so whitespace-only input is neutral too.
    if not text:
        return 0
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()
    sentiment = _vader_analyzer.polarity_scores(text)
    return sentiment["compound"]

In [None]:
# %%script false --no-raise-error
# Load the raw scraper dump into a DataFrame.
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's default locale encoding.
with open(FILENAME, 'r', encoding='utf-8') as f:
    data = json.load(f)
article_df = pd.DataFrame(data)
article_df['datetime'] = pd.to_datetime(article_df['datetime'])
# Rows without a timestamp cannot be ordered chronologically later on, so drop them.
article_df.dropna(subset=['datetime'], inplace=True)
article_df.head()

In [None]:
# %%script false --no-raise-error
# Summarize every article body once, then score both the generated summary
# and the original description with VADER.
summaries = article_df['text'].apply(get_summary)
article_df['summary'] = summaries
article_df['summary_vader'] = summaries.apply(get_vader_score)
descriptions = article_df['description']
article_df['description_vader'] = descriptions.apply(get_vader_score)

In [None]:
# %%script false --no-raise-error
# Drop the bulky columns, order chronologically and save the result.
article_df.drop(columns=['metadata', 'text'], inplace=True)
article_df.sort_values('datetime', inplace=True)
# Build the output path piece by piece. A blanket replace of 'raw' would also
# rewrite the 'rawdata_' file prefix and yield 'preprocesseddata_*.xlsx',
# which the merge step's 'articledf_*.xlsx' glob would never find.
output_path = (
    FILENAME
    .replace('/raw/', '/preprocessed/')
    .replace('rawdata_', 'articledf_')
    .replace('.json', '.xlsx')
)
article_df.to_excel(output_path, index=False)
article_df.head()

In [None]:
# Read each preprocessed file and concatenate them into a single DataFrame
# Sort the file list: glob order is arbitrary, and with duplicates the first
# occurrence is the one that is kept.
excel_files = sorted(glob.glob("../Dataset/scraper/preprocessed/articledf_*.xlsx"))
df_list = [pd.read_excel(file, parse_dates=['datetime']) for file in excel_files]
article_df = pd.concat(df_list, ignore_index=True)
# drop_duplicates returns a new frame unless inplace=True; without it the
# de-duplicated result is silently thrown away.
article_df.drop_duplicates(subset=['url'], inplace=True)
article_df.sort_values('datetime', inplace=True)
article_df.to_excel("../Dataset/scraper/preprocessed/market_article_df.xlsx", index=False)