In [None]:
# This notebook shows how to create a signals dataset from a .csv file

In [None]:
# not needed if news_signals is already installed
# you might see a pip version error but it's grand, don't worry
!pip install -q news_signals

In [1]:
import argparse
import json
import logging
from pathlib import Path

import pandas as pd
import arrow

from news_signals import newsapi
from news_signals.signals import AylienSignal
from news_signals.signals_dataset import SignalsDataset, generate_dataset, reduce_aylien_story
from news_signals.dataset_transformations import get_dataset_transform
from news_signals.log import create_logger

In [None]:
# create a trial account here: https://aylien.com/news-api-signup
# then go to https://app.aylien.com/dashboard to get your credentials

# Note: it's MUCH better to set these as environment variables if you know how, and to delete this cell,
# so that API keys aren't accidentally passed around.
# We include this cell for convenience -- please be very careful not to share your keys.
# NEWSAPI_APP_ID='<set-this>'
# NEWSAPI_APP_KEY='<set-this>'
# newsapi.set_headers(NEWSAPI_APP_ID, NEWSAPI_APP_KEY)

In [2]:
# Grab an example CSV of entities and load it into a dataframe.
csv_url = "https://raw.githubusercontent.com/AYLIEN/news-signals-datasets/main/resources/test/nasdaq100.small.csv"

# Read data from URL
entity_data = pd.read_csv(csv_url)

# Keep a local copy: the dataset-generation cell below takes its input as a file path.
csv_path = 'example_csv_data.csv'
# index=False: don't write the DataFrame's row index into the file as an extra unnamed column
entity_data.to_csv(csv_path, index=False)
entity_data

Unnamed: 0,Wikidata ID,Wikidata Label
0,Q1024454,CSX Corporation
1,Q1055390,Cerner
2,Q1092571,Cintas
3,Q11463,Adobe
4,Q1155668,NXP Semiconductors
5,Q1383669,Exelon
6,Q14772,Baidu
7,Q15109865,Dexcom
8,Q1545076,GARANT
9,Q17081612,Moderna


In [3]:
# Directory the generated dataset is written to; the loading cell further down reads from it.
output_dataset_path = Path('example_signals_dataset')

# Settings for this example run.
config = {
    'gcs_bucket': None,
    'start': '2023-10-01',
    'end': '2023-10-10',
    'stories_per_day': 10,
    'name_field': 'Wikidata Label',
    'id_field': None,
    'surface_form_field': 'Wikidata Label',
    'overwrite': False
}

dataset = generate_dataset(
    # where to read the entities from and where to write the dataset
    input=Path(csv_path),
    output_dataset_dir=output_dataset_path,
    gcs_bucket=config['gcs_bucket'],
    overwrite=config['overwrite'],
    # time range: the date strings are parsed by arrow and passed on as datetime objects
    start=arrow.get(config['start']).datetime,
    end=arrow.get(config['end']).datetime,
    stories_per_day=config['stories_per_day'],
    # names of the CSV columns to use for each entity
    name_field=config['name_field'],
    id_field=config['id_field'],
    surface_form_field=config['surface_form_field'],
    # story post-processing and output handling
    post_process_story=reduce_aylien_story,
    compress=True,
    delete_tmp_files=True,
)

  0%|                                                                                                                                                                              | 0/10 [00:00<?, ?it/s]

2024-01-25 14:14:48,935 news_signals.signals_dataset INFO: signal exists already, skipping to next
2024-01-25 14:14:48,936 news_signals.signals_dataset INFO: signal exists already, skipping to next
2024-01-25 14:14:48,937 news_signals.signals_dataset INFO: signal exists already, skipping to next
2024-01-25 14:14:48,938 news_signals.signals_dataset INFO: signal exists already, skipping to next
2024-01-25 14:14:48,939 news_signals.signals_dataset INFO: signal exists already, skipping to next
2024-01-25 14:14:48,939 news_signals.signals_dataset INFO: signal exists already, skipping to next
2024-01-25 14:14:48,940 news_signals.signals_dataset INFO: signal exists already, skipping to next
2024-01-25 14:14:48,941 news_signals.signals_dataset INFO: signal exists already, skipping to next
2024-01-25 14:14:48,941 news_signals.signals_dataset INFO: signal exists already, skipping to next
2024-01-25 14:14:48,942 news_signals.signals_dataset INFO: signal exists already, skipping to next


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 1329.25it/s]

2024-01-25 14:14:48,943 news_signals.signals_dataset INFO: Found decompressed dataset at example_signals_dataset, not decompressing again.





2024-01-25 14:14:49,404 news_signals.signals_dataset INFO: Saved compressed dataset to example_signals_dataset.tar.gz


In [None]:
# Load the dataset from the same path that was passed to generate_dataset
# as `output_dataset_dir`; this rebinds `dataset` to the loaded object.
dataset = SignalsDataset.load(output_dataset_path)

In [None]:
# Plot the loaded dataset.
# NOTE(review): presumably this draws the signals in the dataset together -- confirm against the news_signals docs.
dataset.plot()

In [None]:
# Collect the dataset's signals into a list ordered by signal name,
# then display the names in that order.
signals = list(dataset.signals.values())
signals.sort(key=lambda sig: sig.name)
[sig.name for sig in signals]

In [None]:
# Pick the first signal from the name-sorted `signals` list built in the previous cell,
# print its name, and plot it.
signal = signals[0]
print(f'Signal Name: {signal.name}')
signal.plot()

In [None]:
# Show the last 3 rows of the signal's feeds dataframe
# (the next cell iterates over its 'stories' column).
signal.feeds_df.tail(3)

In [None]:
# Walk the 'stories' column entry by entry and print the title of every story,
# in the order the entries and their stories appear.
story_titles = (
    story['title']
    for day_stories in signal.feeds_df['stories']
    for story in day_stories
)
for title in story_titles:
    print(title)