:)

In [1]:
import json
import os
import pandas as pd
import numpy as np
import re
import spacy
from sklearn.utils import resample
import matplotlib.pyplot as plt

# Function to read JSON files
def read_json_files(dir: str, fname_filter=None):
    """Load every top-level ``.json`` file in ``dir`` and return the parsed contents.

    Parameters
    ----------
    dir : str
        Directory to scan. Sub-directories are not descended into. A directory
        that does not exist produces an empty list, because ``os.walk`` yields
        nothing for it.
    fname_filter : str, optional
        When given (and non-empty), only files whose name starts with this
        prefix are read. When omitted, every ``.json`` file in ``dir`` is read.

    Returns
    -------
    list
        One parsed JSON value per matching file, ordered by file name.

    Raises
    ------
    json.JSONDecodeError
        If a matching file does not contain valid JSON.
    """
    # Only the first tuple produced by os.walk is needed: it describes the
    # top-level directory. The default covers the "nothing to walk" case.
    _, _, fnames = next(os.walk(dir), (dir, [], []))

    # The '.json' suffix is required whether or not a prefix was supplied, so
    # unrelated files in the directory are never passed to json.load.
    # Sorting makes the result order independent of the filesystem's listing order.
    json_fnames = sorted(
        name for name in fnames
        if name.endswith('.json') and (not fname_filter or name.startswith(fname_filter))
    )

    json_data = []
    for fname in json_fnames:
        # Explicit UTF-8 so decoding does not depend on the platform's locale.
        with open(os.path.join(dir, fname), 'r', encoding='utf-8') as json_file:
            json_data.append(json.load(json_file))
    return json_data

# Read the raw input: top-level files under ./data selected with the 'data' name filter
json_data = read_json_files(dir='./data', fname_filter='data')
def parse_vantage_api_data(json_data):
    """Flatten a list of parsed API responses into one DataFrame row per feed item.

    Parameters
    ----------
    json_data : iterable of dict
        Parsed JSON payloads. Each payload is expected to carry a ``'feed'``
        list whose items contain the keys ``'title'``, ``'time_published'``,
        ``'summary'`` and ``'overall_sentiment_label'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``time_published``, ``summary`` and
        ``overall_sentiment_label``, in that order. The columns are present
        even when no item was found, so downstream column access does not fail
        on empty input.

    Raises
    ------
    KeyError
        If a feed item lacks one of the four expected keys.
    """
    columns = ['title', 'time_published', 'summary', 'overall_sentiment_label']
    rows = []
    for json_dict in json_data:
        # A payload with no (or a null) 'feed' contributes no rows instead of
        # aborting the whole load. NOTE(review): presumably this is what
        # error / rate-limit responses from the API look like — confirm.
        for item in json_dict.get('feed') or []:
            rows.append({column: item[column] for column in columns})
    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(rows, columns=columns)

# Build the article table and append a "news" column: title, one space, then summary
data_df = parse_vantage_api_data(json_data).assign(
    news=lambda frame: frame["title"].str.cat(frame["summary"], sep=' ')
)

# Preprocess the data for sentiment classification
# spaCy pipeline used for text processing, loaded with 'ner' and 'parser' disabled
nlp = spacy.load(
    'en_core_web_sm',
    disable=['ner', 'parser'],
)

def cleaning(doc):
    """Return the space-joined lemmas of the tokens in ``doc`` worth keeping.

    A token is kept only when it is not flagged as a stop word (``is_stop``)
    and is purely alphabetic (``is_alpha``). An input with no surviving token
    produces the empty string.
    """
    kept_lemmas = []
    for token in doc:
        # Skip stop words first, then anything that is not purely alphabetic.
        if token.is_stop or not token.is_alpha:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

# Coarse normalisation before spaCy: every run of characters other than ASCII
# letters and apostrophes becomes a single space, then the text is lowercased.
# Kept as a generator so nlp.pipe does not need a fully materialised list.
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in data_df['news'])
cleaned_txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=2500, n_process=-1)]

# cleaning() returns '' when no token survives its filter. dropna() does not
# treat '' as missing, and an empty CSV field is read back by pandas as NaN, so
# empty texts are stored as None here and removed by the dropna below.
data_df['cleaned_text'] = [text if text else None for text in cleaned_txt]

# Ordinal encoding of the sentiment label; a label not listed here maps to NaN
# and the row is dropped below.
data_df['label'] = data_df['overall_sentiment_label'].map({
    'Bearish': 0,
    'Somewhat-Bearish': 1,
    'Neutral': 2,
    'Somewhat-Bullish': 3,
    'Bullish': 4,
})
# Reassign rather than mutate in place so re-running the cell stays predictable.
data_df = data_df.dropna(subset=['cleaned_text', 'label'])

# Save cleaned data to CSV
data_df[['cleaned_text', 'label']].to_csv('./data/cleaned_data_for_modeling.csv', index=False)


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [2]:
# Run the external script alpha_vantage_log_reg.py; the classification report below is its output.
# NOTE(review): presumably the script trains on ./data/cleaned_data_for_modeling.csv written by the previous cell — confirm.
%run alpha_vantage_log_reg.py

              precision    recall  f1-score   support

           0       1.00      0.01      0.03        69
           1       0.47      0.09      0.16       696
           2       0.63      0.69      0.66      6899
           3       0.60      0.67      0.63      6572
           4       0.57      0.18      0.27      1281

    accuracy                           0.61     15517
   macro avg       0.65      0.33      0.35     15517
weighted avg       0.61      0.61      0.59     15517

