In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/MRP/merged_price_news_data_ffilled.csv', parse_dates=['date'])

In [3]:
# Drop duplicate rows
df.drop_duplicates(inplace=True)
print("New shape after dropping duplicates:", df.shape)

New shape after dropping duplicates: (10851560, 17)


In [4]:
# Remove invalid weekday entries
df = df[df['day_of_week'] >= 0].copy()

In [5]:
# Remove rows with log_volume == 0 (invalid)
df = df[df['log_volume'] > 0]
print(f"Rows after dropping log_volume == 0: {len(df):,}")

Rows after dropping log_volume == 0: 8,415,260


In [6]:
# Apply log1p to skewed price columns
price_cols = ['open', 'high', 'low', 'adj close', 'ma_10']
df.loc[:, price_cols] = df[price_cols].apply(np.log1p)


In [7]:
# Cap extreme vol_30 values
vol_cap = df['vol_30'].quantile(0.99)
df.loc[:, 'vol_30'] = df['vol_30'].clip(upper=vol_cap)

In [8]:
# Filter out unreliable symbols
symbol_stats = df.groupby('symbol')['date'].agg(['min', 'max', 'count'])
symbol_stats['duration_days'] = (symbol_stats['max'] - symbol_stats['min']).dt.days
valid_symbols = symbol_stats[(symbol_stats['count'] >= 300) & (symbol_stats['duration_days'] >= 365)].index
df = df[df['symbol'].isin(valid_symbols)].reset_index(drop=True)
print("Remaining rows after symbol filtering:", len(df))
print("Remaining symbols:", df['symbol'].nunique())

Remaining rows after symbol filtering: 8380665
Remaining symbols: 5455


In [9]:
# Drop constant or irrelevant columns
if 'had_news' in df.columns and df['had_news'].nunique() <= 1:
    df.drop(columns=['had_news'], inplace=True)
    print("Dropped constant column: 'had_news'")

Dropped constant column: 'had_news'


In [10]:
# Drop redundant price columns
df.drop(columns=['open', 'high', 'low'], inplace=True)
print("Remaining columns:", df.columns.tolist())

Remaining columns: ['date', 'symbol', 'adj close', 'log_volume', 'ma_10', 'vol_30', 'rsi_14', 'day_of_week', 'avg_sentiment', 'avg_sentiment_confidence', 'sentiment_std_7', 'news_count', 'news_count_capped']


In [11]:
# Ensure proper datetime format and sort
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['symbol', 'date']).reset_index(drop=True)

In [12]:
# Compute 1-day forward return per symbol
df['adj_close_next'] = df.groupby('symbol')['adj close'].shift(-1)
df['return_1d'] = np.log(df['adj_close_next'] / df['adj close'])
df.dropna(subset=['return_1d'], inplace=True)
df.drop(columns=['adj_close_next'], inplace=True)

In [13]:
# Drop more irrelevant columns if present
df.drop(columns=[col for col in ['news_count', 'news_count_capped'] if col in df.columns], inplace=True)


In [14]:
# Filter out low-coverage symbols again (if needed)
valid_symbols = df['symbol'].value_counts()[lambda x: x >= 252].index
df = df[df['symbol'].isin(valid_symbols)].reset_index(drop=True)

In [15]:
# Clip extreme return_1d values
df = df[df['return_1d'].between(-1, 1)]

In [16]:
# Drop rows with suspicious log_volume (outside bounds)
print("Extreme log_volume values:\n", df['log_volume'].describe())
df = df[(df['log_volume'] >= 4) & (df['log_volume'] <= 16)]

Extreme log_volume values:
 count    8.374814e+06
mean     1.191545e+01
std      2.318968e+00
min      6.931472e-01
25%      1.043620e+01
50%      1.212324e+01
75%      1.358031e+01
max      1.660955e+01
Name: log_volume, dtype: float64


In [17]:
# Drop rows with low sentiment confidence
df = df[df['avg_sentiment_confidence'] >= 0.6]

In [18]:
# Cap vol_30 again (safe re-cap after filtering)
vol_cap = df['vol_30'].quantile(0.99)
df['vol_30'] = np.clip(df['vol_30'], None, vol_cap)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['vol_30'] = np.clip(df['vol_30'], None, vol_cap)


In [19]:
# Final filter: remove weak/noise returns
filtered_df = df[df['return_1d'].abs() > 0.002].copy()
print(f"Rows after return_1d thresholding: {len(filtered_df):,} / {len(df):,} "
      f"({(len(filtered_df)/len(df))*100:.2f}%)")

Rows after return_1d thresholding: 4,858,091 / 8,004,725 (60.69%)


In [20]:
# Create binary target column
filtered_df['target'] = (filtered_df['return_1d'] > 0).astype(int)

In [21]:

# Save cleaned dataset
filtered_df.to_csv("/content/drive/MyDrive/MRP/final_dataset.csv", index=False)
print("Saved filtered dataset as 'final_dataset.csv'")

Saved filtered dataset as 'final_dataset.csv'
