In [9]:
# Importing libraries for data handling
import pandas as pd
import numpy as np
import time
import os

# For machine learning
#from sklearn import preprocessing
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LinearRegression
#from sklearn.tree import DecisionTreeRegressor

In [10]:
# Read the stock price CSV into a DataFrame.
# NOTE: despite its name, `csv_filename` holds the loaded DataFrame, not a path.
csv_filename = pd.read_csv(filepath_or_buffer="all_stocks_5yr.csv")

# Summarise the frame: column dtypes, non-null counts and memory usage.
csv_filename.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 619040 entries, 0 to 619039
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   date    619040 non-null  object 
 1   open    619029 non-null  float64
 2   high    619032 non-null  float64
 3   low     619032 non-null  float64
 4   close   619040 non-null  float64
 5   volume  619040 non-null  int64  
 6   name    619040 non-null  object 
dtypes: float64(4), int64(1), object(2)
memory usage: 33.1+ MB


In [13]:
import pandas as pd
import time
import os

# Source CSV for the benchmark; the path is relative to the notebook's working directory.
datastocks = "all_stocks_5yr.csv"
# Unscaled (1x) frame that `benchmark` replicates for the larger scale factors.
df_original = pd.read_csv(filepath_or_buffer=datastocks)

# Benchmark function for evaluating CSV vs. Parquet
def benchmark(df, scale_factor, compression=None):
    """Evaluates CSV vs. Parquet in terms of size and speed at different scales.

    The input frame is concatenated with itself ``scale_factor`` times, written
    to and read back from both formats in the current working directory, and
    the temporary files are always deleted afterwards, even when a write or
    read step raises.

    Args:
        df: pandas DataFrame to replicate.
        scale_factor: Number of copies of ``df`` to concatenate.
        compression: Codec forwarded to ``DataFrame.to_parquet``. It only
            affects the Parquet file; the CSV is written with pandas defaults.

    Returns:
        dict with the keys ``Scale``, ``CSV_Size_MB``, ``CSV_Write_Time_s``,
        ``CSV_Read_Time_s``, ``Parquet_Size_MB``, ``Parquet_Write_Time_s``,
        ``Parquet_Read_Time_s`` and ``Compression``. Sizes are in MiB
        (bytes / 1024**2) and times are in seconds.

    Raises:
        Whatever pandas or the Parquet engine raises while writing or reading;
        the temporary files are removed before the exception propagates.
    """
    scaled_df = pd.concat([df] * scale_factor, ignore_index=True)
    csv_filename = f"data_{scale_factor}x.csv"
    parquet_filename = f"data_{scale_factor}x.parquet"

    try:
        # ---- CSV Evaluation ----
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock moves.
        start_time = time.perf_counter()
        scaled_df.to_csv(csv_filename, index=False)
        csv_write_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        pd.read_csv(csv_filename)  # result discarded: only the read time matters
        csv_read_time = time.perf_counter() - start_time

        csv_size = os.path.getsize(csv_filename) / (1024 * 1024)  # Size in MB

        # ---- Parquet Evaluation ----
        start_time = time.perf_counter()
        scaled_df.to_parquet(parquet_filename, index=False, compression=compression, engine="pyarrow")
        parquet_write_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        pd.read_parquet(parquet_filename, engine="pyarrow")  # result discarded
        parquet_read_time = time.perf_counter() - start_time

        parquet_size = os.path.getsize(parquet_filename) / (1024 * 1024)  # Size in MB
    finally:
        # Remove files to save space. Running this in `finally` keeps a failed
        # run from leaving large files behind.
        for leftover in (csv_filename, parquet_filename):
            if os.path.exists(leftover):
                os.remove(leftover)

    return {
        "Scale": scale_factor,
        "CSV_Size_MB": csv_size,
        "CSV_Write_Time_s": csv_write_time,
        "CSV_Read_Time_s": csv_read_time,
        "Parquet_Size_MB": parquet_size,
        "Parquet_Write_Time_s": parquet_write_time,
        "Parquet_Read_Time_s": parquet_read_time,
        "Compression": compression
    }

# Benchmark grid: every dataset scale (1x, 10x, 100x) crossed with every Parquet codec.
scales = [1, 10, 100]
compressions = [None, "snappy", "gzip", "brotli"]
results = [
    benchmark(df_original, scale, comp)
    for scale in scales
    for comp in compressions
]

# One row per (scale, compression) run.
results_df = pd.DataFrame(results)

# Plain-text dump of the table.
print("\n=== CSV vs. Parquet Benchmark Results ===")
print(results_df)

# Rich rendering of the same table in the notebook.
print("\n=== CSV vs. Parquet Benchmark Results ===")
display(results_df)  # Display table in Jupyter Notebook


=== CSV vs. Parquet Benchmark Results ===
    Scale  CSV_Size_MB  CSV_Write_Time_s  CSV_Read_Time_s  Parquet_Size_MB  \
0       1    28.800573          1.681946         0.285367        12.730873   
1       1    28.800573          1.268893         0.270100        10.151073   
2       1    28.800573          1.341022         0.294221         8.058299   
3       1    28.800573          1.413980         0.334922         7.755613   
4      10   288.005407         13.368246         2.418940       118.028409   
5      10   288.005407         15.014923         2.515728        95.354448   
6      10   288.005407         14.243311         2.842862        75.976962   
7      10   288.005407         13.672949         6.751509        73.180475   
8     100  2880.053747        152.582843        25.106432      1178.436979   
9     100  2880.053747        140.351567        26.087857       951.709143   
10    100  2880.053747        152.033769        26.132726       758.129578   
11    100  2880.05374

Unnamed: 0,Scale,CSV_Size_MB,CSV_Write_Time_s,CSV_Read_Time_s,Parquet_Size_MB,Parquet_Write_Time_s,Parquet_Read_Time_s,Compression
0,1,28.800573,1.681946,0.285367,12.730873,0.254456,0.095066,
1,1,28.800573,1.268893,0.2701,10.151073,0.275099,0.120534,snappy
2,1,28.800573,1.341022,0.294221,8.058299,0.882644,0.087183,gzip
3,1,28.800573,1.41398,0.334922,7.755613,0.924692,0.077551,brotli
4,10,288.005407,13.368246,2.41894,118.028409,2.271214,0.682673,
5,10,288.005407,15.014923,2.515728,95.354448,2.134055,0.777165,snappy
6,10,288.005407,14.243311,2.842862,75.976962,7.791368,1.179863,gzip
7,10,288.005407,13.672949,6.751509,73.180475,9.350809,1.480582,brotli
8,100,2880.053747,152.582843,25.106432,1178.436979,21.250852,8.634975,
9,100,2880.053747,140.351567,26.087857,951.709143,24.474314,8.30594,snappy


## First Result
<div style="text-align: center;">
    <img src="1x.jpg" alt="first result" width="1000"/>
</div>

### PART 2

In [14]:
import pandas as pd
import polars as pl
import numpy as np
import time
from IPython.display import display

# Load Dataset in both Pandas and Polars
csv_file = "all_stocks_5yr.csv"

# Both frames are sorted by ticker and then by date so that the two libraries
# process identical row order and each ticker's history is contiguous.
# NOTE(review): the indicator functions below still run over the whole frame,
# so rolling windows span ticker boundaries; group by "name" if per-ticker
# indicator values are needed.

# Load with Pandas
df_pandas = pd.read_csv(csv_file)
df_pandas['date'] = pd.to_datetime(df_pandas['date'])
# reset_index gives a clean positional index after the sort.
df_pandas = df_pandas.sort_values(['name', 'date']).reset_index(drop=True)

# Load with Polars
df_polars = pl.read_csv(csv_file)
df_polars = df_polars.with_columns(pl.col("date").str.to_date()).sort(["name", "date"])

print("Dataset successfully loaded in both Pandas and Polars.")

# Save Polars DF for persistence
df_polars.write_parquet("all_stocks_5yr_polars.parquet")

### === Technical Indicators Calculation Functions === ###

# 1. Relative Strength Index (RSI)
def calculate_rsi_pandas(df, period=14):
    """Relative Strength Index of ``df["close"]`` using simple rolling means.

    Args:
        df: pandas DataFrame with a numeric ``close`` column.
        period: Rolling window length. ``min_periods=1`` is used, so values
            are produced from the first row onwards.

    Returns:
        pandas Series of RSI values in the range 0-100, carrying the same
        index as ``df`` so it can be assigned straight back as a column.
    """
    delta = df["close"].diff()
    # Series.where keeps the input index. The previous np.where + pd.Series
    # round trip replaced it with a fresh RangeIndex, which misaligns the
    # result for any frame whose index is not 0..n-1.
    # Rows where the comparison is False (including NaN deltas) become 0.
    gain = delta.where(delta > 0, 0.0)
    loss = (-delta).where(delta < 0, 0.0)
    avg_gain = gain.rolling(window=period, min_periods=1).mean()
    avg_loss = loss.rolling(window=period, min_periods=1).mean()
    # The small epsilon avoids division by zero when there are no losses.
    rs = avg_gain / (avg_loss + 1e-10)
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_rsi_polars(df, period=14):
    """Relative Strength Index of the ``close`` column using Polars.

    Args:
        df: Polars DataFrame with a numeric ``close`` column.
        period: Rolling window length for the average gain and loss.

    Returns:
        Polars Series of RSI values with one entry per input row.
        NOTE(review): ``rolling_mean`` is called with its default minimum
        sample count, so the warm-up rows may differ from the pandas version,
        which uses ``min_periods=1`` -- confirm if exact parity is required.
    """
    delta = pl.col("close").diff()
    # pl.when takes only the predicate; the value belongs in .then().
    # The original passed the value as a second predicate and never called
    # .then(), which is not a valid when/then/otherwise chain.
    flows = df.select([
        pl.when(delta > 0).then(delta).otherwise(0.0).alias("gain"),
        pl.when(delta < 0).then(-delta).otherwise(0.0).alias("loss"),
    ])
    avg_gain = flows["gain"].rolling_mean(period)
    avg_loss = flows["loss"].rolling_mean(period)
    # The small epsilon avoids division by zero when there are no losses.
    rs = avg_gain / (avg_loss + 1e-10)
    rsi = 100 - (100 / (1 + rs))
    return rsi

# 2. Money Flow Index (MFI)
def calculate_mfi_pandas(df, period=14):
    """Money Flow Index from the ``high``, ``low``, ``close`` and ``volume`` columns.

    Args:
        df: pandas DataFrame holding the four columns above.
        period: Rolling window length for the summed money flows.

    Returns:
        pandas Series of MFI values; the first ``period - 1`` rows are NaN
        because the rolling sums need a full window.
    """
    tp = (df["high"] + df["low"] + df["close"]) / 3
    previous_tp = tp.shift(1)
    raw_flow = tp * df["volume"]

    # Flow counts as positive when the typical price rose and as negative when
    # it fell; every other row contributes 0 to the respective sum.
    inflow = raw_flow.where(tp > previous_tp, 0).rolling(period).sum()
    outflow = raw_flow.where(tp < previous_tp, 0).rolling(period).sum()

    # The epsilon guards against a zero negative-flow sum.
    return 100 - (100 / (1 + inflow / (outflow + 1e-10)))

def calculate_mfi_polars(df, period=14):
    """Money Flow Index from the ``high``, ``low``, ``close`` and ``volume`` columns (Polars).

    Args:
        df: Polars DataFrame holding the four columns above.
        period: Rolling window length for the summed money flows.

    Returns:
        Polars Series of MFI values with one entry per input row.
    """
    typical_price = (pl.col("high") + pl.col("low") + pl.col("close")) / 3
    money_flow = typical_price * pl.col("volume")
    previous_price = typical_price.shift(1)
    # Mask with when/then/otherwise instead of filtering. filter() drops rows,
    # so the positive and negative series would end up with different lengths
    # and lose alignment with the input rows. This mirrors the pandas version,
    # which zeroes non-matching rows with Series.where.
    flows = df.select([
        pl.when(typical_price > previous_price).then(money_flow).otherwise(0.0).alias("positive_flow"),
        pl.when(typical_price < previous_price).then(money_flow).otherwise(0.0).alias("negative_flow"),
    ])
    positive_flow = flows["positive_flow"]
    negative_flow = flows["negative_flow"]
    # The epsilon guards against a zero negative-flow sum.
    mf_ratio = positive_flow.rolling_sum(period) / (negative_flow.rolling_sum(period) + 1e-10)
    mfi = 100 - (100 / (1 + mf_ratio))
    return mfi

# 3. Stochastics
def calculate_stoch_pandas(df, period=14):
    """Stochastic oscillator: position of ``close`` within the rolling low/high range.

    Args:
        df: pandas DataFrame with ``low``, ``high`` and ``close`` columns.
        period: Rolling window length for the lowest low and highest high.

    Returns:
        pandas Series scaled by 100; the first ``period - 1`` rows are NaN
        because the rolling min/max need a full window.
    """
    floor = df["low"].rolling(window=period).min()
    ceiling = df["high"].rolling(window=period).max()
    numerator = 100 * (df["close"] - floor)
    # The epsilon keeps the division defined when high equals low over the window.
    denominator = ceiling - floor + 1e-10
    return numerator / denominator

def calculate_stoch_polars(df, period=14):
    """Stochastic oscillator computed with Polars rolling min/max.

    Args:
        df: Polars DataFrame with ``low``, ``high`` and ``close`` columns.
        period: Rolling window length for the lowest low and highest high.

    Returns:
        Polars Series scaled by 100, one entry per input row.
    """
    window_low = df["low"].rolling_min(period)
    window_high = df["high"].rolling_max(period)
    distance_from_low = df["close"] - window_low
    # The epsilon keeps the division defined when high equals low over the window.
    window_range = window_high - window_low + 1e-10
    return 100 * distance_from_low / window_range

# 4. Moving Average Convergence Divergence (MACD)
def calculate_macd_pandas(df, short=12, long=26, signal=9):
    """MACD line and signal line of ``df["close"]`` using exponential moving averages.

    Args:
        df: pandas DataFrame with a numeric ``close`` column.
        short: Span of the fast EMA.
        long: Span of the slow EMA.
        signal: Span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd, signal_line)`` of pandas Series.
    """
    def _ema(series, span):
        # Recursive (adjust=False) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    closes = df["close"]
    macd = _ema(closes, short) - _ema(closes, long)
    return macd, _ema(macd, signal)

def calculate_macd_polars(df, short=12, long=26, signal=9):
    """MACD line and signal line of the ``close`` column using Polars.

    Uses exponential moving averages with ``adjust=False`` so the result is the
    same indicator that ``calculate_macd_pandas`` computes. The previous
    implementation used simple rolling means, which is a different calculation
    and made the Pandas-vs-Polars timing comparison uneven.

    Args:
        df: Polars DataFrame with a numeric ``close`` column.
        short: Span of the fast EMA.
        long: Span of the slow EMA.
        signal: Span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd, signal_line)`` of Polars Series.
    """
    short_ema = df["close"].ewm_mean(span=short, adjust=False)
    long_ema = df["close"].ewm_mean(span=long, adjust=False)
    macd = short_ema - long_ema
    signal_line = macd.ewm_mean(span=signal, adjust=False)
    return macd, signal_line

### === Performance Benchmark === ###
# Timing records for the indicator runs below, one dict per benchmark call.
# This rebinds the name `results`, which the CSV vs. Parquet cell also uses.
results = []

def benchmark(func, df, label):
    """Measure execution time of a function.

    NOTE: this definition shadows the CSV vs. Parquet ``benchmark`` defined
    earlier in the notebook; after this cell runs, the name refers to this
    timing helper.

    Args:
        func: Callable invoked once as ``func(df)``; its return value is discarded.
        df: Argument passed through to ``func``.
        label: Text stored under the ``"Indicator"`` key of the result.

    Returns:
        dict ``{"Indicator": label, "Time (s)": elapsed_seconds}``.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short intervals; time.time() is coarser and can jump with clock changes.
    start_time = time.perf_counter()
    func(df)
    exec_time = time.perf_counter() - start_time
    return {"Indicator": label, "Time (s)": exec_time}

# Run Benchmarks for Pandas
# The indicator functions are passed directly; wrapping them in a lambda only
# adds an extra call to the measured time.
print("\nRunning calculations in Pandas...")
results.append(benchmark(calculate_rsi_pandas, df_pandas, "RSI - Pandas"))
results.append(benchmark(calculate_mfi_pandas, df_pandas, "MFI - Pandas"))
results.append(benchmark(calculate_stoch_pandas, df_pandas, "Stochastic - Pandas"))
results.append(benchmark(calculate_macd_pandas, df_pandas, "MACD - Pandas"))

# Run Benchmarks for Polars
print("\nRunning calculations in Polars...")
results.append(benchmark(calculate_rsi_polars, df_polars, "RSI - Polars"))
results.append(benchmark(calculate_mfi_polars, df_polars, "MFI - Polars"))
results.append(benchmark(calculate_stoch_polars, df_polars, "Stochastic - Polars"))
results.append(benchmark(calculate_macd_polars, df_polars, "MACD - Polars"))

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Create a comparison table
# Labels look like "RSI - Pandas". Split them into the indicator name (rows)
# and the library (columns) so each row compares Pandas and Polars side by
# side. Pivoting with "Indicator" as both index and columns cannot do that.
label_parts = results_df["Indicator"].str.split(" - ", n=1, expand=True)
comparison_table = (
    results_df
    .assign(Indicator=label_parts[0], Library=label_parts[1])
    .pivot(index="Indicator", columns="Library", values="Time (s)")
)

# Display Results
print("\n=== Performance Comparison: Pandas vs. Polars ===")
display(comparison_table)


ModuleNotFoundError: No module named 'polars'

In [15]:
import polars as pl
# NOTE: this rebinds `df_polars` to a fresh read of the CSV, so the "date"
# column is a plain string again here (no date parsing is applied).
df_polars = pl.read_csv("all_stocks_5yr.csv")
# A notebook cell only auto-renders its final expression, so the head() result
# was previously discarded; display() shows both tables.
display(df_polars.head())  # Display first few rows
display(df_polars.describe())  # Show statistics

ModuleNotFoundError: No module named 'polars'