In [73]:
# Imports & Paths
import pandas as pd
import numpy as np
from pathlib import Path

# Project layout. The root is taken as the parent of the current working
# directory, i.e. this assumes the notebook is run from a folder one level
# below the project root.
PROJECT_DIR = Path.cwd().parent
D2_DIR = PROJECT_DIR / 'D2'            # folder the raw CSV inputs are read from
DATA_DIR = PROJECT_DIR / 'data'        # folder the cleaned outputs are written to
FEATURES_DIR = DATA_DIR / 'features'

# Create the output folders when missing; existing folders are left as they are
for out_dir in (DATA_DIR, FEATURES_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)


### Load CSV Files

In [74]:
# Locations of the two raw CSV inputs inside the D2 folder
fg_path = D2_DIR / 'fear_greed_index.csv'
trades_path = D2_DIR / 'historical_data.csv'

# Read both files with pandas' default CSV parsing (sentiment first, then trades)
df_sentiment, df_trades = (
    pd.read_csv(csv_file) for csv_file in (fg_path, trades_path)
)

In [75]:
# Quick structural overview of the freshly loaded frames: shape + column names
for label, frame in (
    ("Sentiment shape & columns:", df_sentiment),
    ("Trades   shape & columns:", df_trades),
):
    print(label, frame.shape, frame.columns.tolist())

# Per-column null counts, then per-column dtypes, each under its own heading
for header, summary in (
    ("\nNulls in Sentiment:", df_sentiment.isna().sum()),
    ("\nNulls in Trades:", df_trades.isna().sum()),
    ("\nDtypes in Sentiment:", df_sentiment.dtypes),
    ("\nDtypes in Trades:", df_trades.dtypes),
):
    print(header)
    print(summary)

Sentiment shape & columns: (2644, 4) ['timestamp', 'value', 'classification', 'date']
Trades   shape & columns: (211224, 16) ['Account', 'Coin', 'Execution Price', 'Size Tokens', 'Size USD', 'Side', 'Timestamp IST', 'Start Position', 'Direction', 'Closed PnL', 'Transaction Hash', 'Order ID', 'Crossed', 'Fee', 'Trade ID', 'Timestamp']

Nulls in Sentiment:
timestamp         0
value             0
classification    0
date              0
dtype: int64

Nulls in Trades:
Account             0
Coin                0
Execution Price     0
Size Tokens         0
Size USD            0
Side                0
Timestamp IST       0
Start Position      0
Direction           0
Closed PnL          0
Transaction Hash    0
Order ID            0
Crossed             0
Fee                 0
Trade ID            0
Timestamp           0
dtype: int64

Dtypes in Sentiment:
timestamp          int64
value              int64
classification    object
date              object
dtype: object

Dtypes in Trades:
Account     

### Parse & clean datetime columns

In [76]:
# --- Sentiment ---
# 'timestamp' is interpreted as epoch seconds (unit='s'); errors='raise'
# makes the cell fail on any value that cannot be converted.
df_sentiment['timestamp'] = pd.to_datetime(df_sentiment['timestamp'], unit='s', errors='raise')

# 'date' is an ISO YYYY-MM-DD string, parsed with an explicit format
df_sentiment['date'] = pd.to_datetime(df_sentiment['date'], format='%Y-%m-%d', errors='raise')

# --- Trades ---
# 'Timestamp IST' is a human-readable day-month-year hour:minute string.
# NOTE(review): no timezone is attached here, so the parsed values are
# timezone-naive; the column name suggests IST wall-clock time — confirm
# before aligning trades with the sentiment dates.
df_trades['Timestamp IST'] = pd.to_datetime(df_trades['Timestamp IST'], format='%d-%m-%Y %H:%M', errors='raise')


### Rename columns to snake_case & drop the raw `Timestamp` column

In [77]:
# Sentiment: give every column a 'sentiment_' prefixed name
df_sentiment = df_sentiment.rename(
    columns=dict(
        timestamp='sentiment_ts',
        value='sentiment_value',
        classification='sentiment_cat',
        date='sentiment_date',
    )
)

# Trades: original CSV header -> snake_case name.
# 'Timestamp IST' is the one non-mechanical rename (becomes 'trade_time').
trade_rename_map = {
    'Account': 'account',
    'Coin': 'coin',
    'Execution Price': 'execution_price',
    'Size Tokens': 'size_tokens',
    'Size USD': 'size_usd',
    'Side': 'side',
    'Timestamp IST': 'trade_time',
    'Start Position': 'start_position',
    'Direction': 'direction',
    'Closed PnL': 'closed_pnl',
    'Transaction Hash': 'transaction_hash',
    'Order ID': 'order_id',
    'Crossed': 'crossed',
    'Fee': 'fee',
    'Trade ID': 'trade_id',
}

# Drop the raw 'Timestamp' column, then apply the rename map ('Timestamp' is
# not a key of the map, so the order of the two steps does not matter).
# NOTE(review): this column was previously described as "all-NaT", but the
# null report printed right after loading shows 0 nulls for it — confirm why
# it is discarded in favour of 'trade_time'.
df_trades = df_trades.drop(columns=['Timestamp']).rename(columns=trade_rename_map)


In [78]:
# Sanity check after the rename/drop step: column names, null counts, dtypes
print("\nAfter renaming & dropping:")
for label, frame in (
    ("Sentiment columns:", df_sentiment),
    ("Trades    columns:", df_trades),
):
    print(label, frame.columns.tolist())

# Null counts: sentiment first, then trades, under one shared heading
print("\nNulls in cleaned sentiment and trades:")
for frame in (df_sentiment, df_trades):
    print(frame.isna().sum())

# Dtypes, one heading per frame
for header, frame in (
    ("\nDtypes in cleaned sentiment:", df_sentiment),
    ("\nDtypes in cleaned trades:", df_trades),
):
    print(header)
    print(frame.dtypes)



After renaming & dropping:
Sentiment columns: ['sentiment_ts', 'sentiment_value', 'sentiment_cat', 'sentiment_date']
Trades    columns: ['account', 'coin', 'execution_price', 'size_tokens', 'size_usd', 'side', 'trade_time', 'start_position', 'direction', 'closed_pnl', 'transaction_hash', 'order_id', 'crossed', 'fee', 'trade_id']

Nulls in cleaned sentiment and trades:
sentiment_ts       0
sentiment_value    0
sentiment_cat      0
sentiment_date     0
dtype: int64
account             0
coin                0
execution_price     0
size_tokens         0
size_usd            0
side                0
trade_time          0
start_position      0
direction           0
closed_pnl          0
transaction_hash    0
order_id            0
crossed             0
fee                 0
trade_id            0
dtype: int64

Dtypes in cleaned sentiment:
sentiment_ts       datetime64[ns]
sentiment_value             int64
sentiment_cat              object
sentiment_date     datetime64[ns]
dtype: object

Dtypes 

### Convert categories & numeric types

In [79]:
# Label columns are stored with pandas' 'category' dtype
df_sentiment['sentiment_cat'] = df_sentiment['sentiment_cat'].astype('category')
df_trades['side']           = df_trades['side'].astype('category')
df_trades['direction']      = df_trades['direction'].astype('category')

# Force the numeric trade columns to a numeric dtype.
# errors='coerce' replaces every unparseable entry with NaN instead of
# raising, so count the NaNs this step introduces and report them: otherwise
# bad values would vanish silently before the frame is exported.
for col in ['execution_price','size_tokens','size_usd','closed_pnl','fee']:
    nulls_before = int(df_trades[col].isna().sum())
    df_trades[col] = pd.to_numeric(df_trades[col], errors='coerce')
    newly_null = int(df_trades[col].isna().sum()) - nulls_before
    if newly_null:
        # Best-effort behaviour is kept (no exception); the loss is just made visible
        print(f"WARNING: {newly_null} value(s) in '{col}' could not be parsed as numbers and were set to NaN")



### Save and export cleaned data

In [80]:
# Destination files for the two cleaned tables
sentiment_out = DATA_DIR / 'clean_sentiment.parquet'
trades_out = DATA_DIR / 'clean_trades.parquet'

# Write each frame to Parquet without its index (sentiment first, then trades)
for frame, target in ((df_sentiment, sentiment_out), (df_trades, trades_out)):
    frame.to_parquet(target, index=False)

# Confirmation message listing the files that were just written
print("\n✅ Data ingestion & cleaning complete. Clean files written to:")
for target in (sentiment_out, trades_out):
    print(" -", target)


✅ Data ingestion & cleaning complete. Clean files written to:
 - H:\Portfilios\JuniorDataScientist\TradeAnalysis\data\clean_sentiment.parquet
 - H:\Portfilios\JuniorDataScientist\TradeAnalysis\data\clean_trades.parquet
