In [None]:
# Symlink the mounted dataset into the current working directory under the name `stocknet`.
# NOTE(review): the loading cells below read from /content/stocknet-dataset/..., not from
# this `stocknet` link — confirm the link name/location is what was intended.
!ln -s /mnt/datasets/stocknet-dataset stocknet

In [None]:
import pandas as pd
import os
import json
import re

In [None]:

# Directory of raw price files; each "<TICKER>.csv" is expected to have a "Date" column.
directory = "/content/stocknet-dataset/price/raw"

# Maps ticker symbol (file name without the .csv suffix) -> price frame indexed by Date.
stock_data = {}

# os.listdir returns entries in arbitrary order; sorting keeps the load order --
# and the row order of anything later concatenated from stock_data -- reproducible.
for filename in sorted(os.listdir(directory)):
    # splitext removes only the trailing extension, unlike str.replace which
    # would strip every ".csv" occurrence inside the name.
    ticker, extension = os.path.splitext(filename)
    if extension != ".csv":
        continue
    filepath = os.path.join(directory, filename)
    stock_data[ticker] = pd.read_csv(filepath, parse_dates=["Date"]).set_index("Date")

# Sanity check on one ticker; a bare last expression gets the notebook's rich display.
stock_data["RDS-B"].head()

In [None]:
# Tag each per-ticker frame with its symbol, stack the frames vertically, and
# turn the Date index back into a regular column to get one long table.
labelled_frames = [
    prices.assign(Ticker=symbol) for symbol, prices in stock_data.items()
]
tidy_df = pd.concat(labelled_frames, axis=0).reset_index()

In [None]:
# Display the combined long-format price table for a visual check.
tidy_df

In [None]:

# Root of the tweet files: one sub-folder per ticker, each holding tweet files.
root_dir = "/content/stocknet-dataset/tweet/preprocessed"

# Explicit schema so tweet_df has these columns even when no tweet could be read
# (otherwise the created_at conversion below would raise KeyError on an empty frame).
TWEET_COLUMNS = ["ticker", "text", "created_at", "user_id"]


def _read_tweet_file(filepath):
    """Return the tweet objects stored in ``filepath`` as a list.

    The file is first parsed as a single JSON document; if that fails it is
    re-read as JSON Lines (one JSON object per non-blank line).

    Raises:
        OSError, UnicodeDecodeError: if the file cannot be opened or decoded.
        json.JSONDecodeError: if a line of the JSON Lines fallback is invalid.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        try:
            return [json.load(f)]
        except json.JSONDecodeError:
            # Rewind: json.load already consumed the stream.
            f.seek(0)
            return [json.loads(line) for line in f if line.strip()]


all_tweets = []

# Sorted listings make the row order of tweet_df reproducible; os.listdir alone
# returns entries in arbitrary order.
for ticker in sorted(os.listdir(root_dir)):
    subfolder = os.path.join(root_dir, ticker)
    if not os.path.isdir(subfolder):
        continue
    for file in sorted(os.listdir(subfolder)):
        filepath = os.path.join(subfolder, file)
        if not os.path.isfile(filepath):
            continue
        try:
            for tweet in _read_tweet_file(filepath):
                # "text" is joined with spaces, i.e. it is treated as a list of tokens.
                flat_text = " ".join(tweet.get("text", []))
                all_tweets.append({
                    "ticker": ticker,
                    "text": flat_text,
                    "created_at": tweet.get("created_at"),
                    "user_id": tweet.get("user_id_str")
                })
        except Exception as e:
            # Deliberate best-effort: report the unreadable file and carry on.
            print(f"Skipping file {filepath} due to error: {e}")

tweet_df = pd.DataFrame(all_tweets, columns=TWEET_COLUMNS)
# Unparseable timestamps become NaT instead of aborting the conversion.
tweet_df["created_at"] = pd.to_datetime(tweet_df["created_at"], errors="coerce")

In [None]:
# Display the loaded tweets for a visual check.
tweet_df

In [None]:
def clean_text(df=None):
    """Strip URLs, hashtags, cashtags, mentions and punctuation from tweet text.

    The "text" column of ``df`` is rewritten in place; missing values are left
    untouched.

    Args:
        df: DataFrame with a string "text" column. Defaults to the
            notebook-level ``tweet_df`` so existing ``clean_text()`` calls
            keep working.

    Returns:
        None.
    """
    if df is None:
        df = tweet_df

    char_patterns = re.compile(
        # URLs
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        # hashtags
        r'|#[a-zA-Z]+'
        # cashtags -- "$" must be escaped, a bare "$" is the end-of-string
        # anchor and the alternative could never match
        r'|\$[a-zA-Z]+'
        # mentions
        r'|@[a-zA-Z]+'
        # single punctuation characters -- "-" is placed last so it is a
        # literal; written as "%-;" it formed a range that also removed
        # digits, "&", "'", "(", ")", "+" and "/"
        r'|[,.^_$*%;!?:-]'
    )

    # One vectorised column assignment instead of per-row chained indexing
    # (df["text"][i] = ...), which is not guaranteed to write through to the
    # frame and is a no-op under pandas Copy-on-Write.
    df["text"] = df["text"].str.replace(char_patterns, '', regex=True)


In [None]:
def date_extract(datetime_obj):
    """Return the calendar date of ``datetime_obj`` as a ``YYYY-MM-DD`` string.

    Args:
        datetime_obj: a ``datetime``/``pandas.Timestamp``-like object exposing
            ``.date()``, or a missing value (``None``, ``NaN``, ``NaT``).

    Returns:
        The ISO-formatted date string, or ``None`` for a missing value so that
        unparseable timestamps stay null instead of being stringified.
    """
    if pd.isna(datetime_obj):
        return None
    return datetime_obj.date().isoformat()

# Derive a per-tweet date column by running date_extract over every created_at value.
created_at_values = tweet_df['created_at']
tweet_df['date'] = created_at_values.map(date_extract)

In [None]:
# Display the tweets again, now including the derived date column.
tweet_df

In [None]:
# Run the cleaning pass: clean_text() rewrites tweet_df["text"] and returns nothing,
# so the frame is displayed explicitly afterwards.
clean_text()
tweet_df

In [None]:
# Persist both tables as parquet files in the working directory, dropping the
# positional index from the output.
for frame, target in (
    (tidy_df, 'stock_prices.parquet'),
    (tweet_df, 'stock_tweets.parquet'),
):
    frame.to_parquet(target, index=False)