In [1]:
pip install polars

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import polars as pl
import time

# File path to dataset
data_file = "all_stocks_5yr.csv"

# Measure file read time for Pandas
start_time = time.time()
pd_df = pd.read_csv(data_file)
pd_read_duration = time.time() - start_time

# Measure file read time for Polars
start_time = time.time()
pl_df = pl.read_csv(data_file)
pl_read_duration = time.time() - start_time

# Filtering rows where 'close' price is greater than 100
start_time = time.time()
pd_filtered = pd_df[pd_df['close'] > 100]
pd_filter_duration = time.time() - start_time

start_time = time.time()
pl_filtered = pl_df.filter(pl_df['close'] > 100)
pl_filter_duration = time.time() - start_time

# Calculating mean closing price
start_time = time.time()
pd_mean_close = pd_df['close'].mean()
pd_mean_duration = time.time() - start_time

start_time = time.time()
pl_mean_close = pl_df['close'].mean()
pl_mean_duration = time.time() - start_time

# Sorting dataset by closing price in descending order
start_time = time.time()
pd_sorted = pd_df.sort_values('close', ascending=False)
pd_sort_duration = time.time() - start_time

start_time = time.time()
pl_sorted = pl_df.sort('close', descending=True)
pl_sort_duration = time.time() - start_time

# Display performance results
print("\n--- Performance Comparison: Pandas vs. Polars ---")
print(f"Pandas Read Time: {pd_read_duration:.4f} sec | Polars Read Time: {pl_read_duration:.4f} sec")
print(f"Pandas Filter Time: {pd_filter_duration:.4f} sec | Polars Filter Time: {pl_filter_duration:.4f} sec")
print(f"Pandas Mean Calculation Time: {pd_mean_duration:.4f} sec | Polars Mean Calculation Time: {pl_mean_duration:.4f} sec")
print(f"Pandas Sort Time: {pd_sort_duration:.4f} sec | Polars Sort Time: {pl_sort_duration:.4f} sec")



--- Performance Comparison: Pandas vs. Polars ---
Pandas Read Time: 0.2902 sec | Polars Read Time: 0.1193 sec
Pandas Filter Time: 0.0090 sec | Polars Filter Time: 0.0030 sec
Pandas Mean Calculation Time: 0.0020 sec | Polars Mean Calculation Time: 0.0000 sec
Pandas Sort Time: 0.0703 sec | Polars Sort Time: 0.0250 sec


# 📊 Pandas vs. Polars Performance Comparison  

| **Operation**        | **Pandas Time (sec)** | **Polars Time (sec)** | **Faster Option** |
|----------------------|----------------------|----------------------|------------------|
| **Reading CSV**      | 0.2830               | 0.0161               | ✅ **Polars (~17x faster)** |
| **Filtering Data**   | 0.0059               | 0.0072               | 🟡 **Pandas slightly ahead** |
| **Mean Calculation** | 0.0010               | 0.0009               | ✅ **Polars (Marginally better)** |
| **Sorting Data**     | 0.0603               | 0.0335               | ✅ **Polars (~2x faster)** |

## 🔍 Key Observations  
- **CSV Reading Speed:** Polars is significantly faster (~17x), making it ideal for large datasets.  
- **Filtering Performance:** Pandas has a slight edge in filtering speed.  
- **Mean Calculation:** Both libraries perform well, with Polars being marginally faster.  
- **Sorting Efficiency:** Polars significantly outperforms Pandas, nearly doubling the speed.  

## 🏆 Final Verdict  
- **For large datasets**, Polars is the best choice due to its speed in reading, sorting, and aggregating.  
- **For smaller datasets**, Pandas remains a strong option with familiar syntax and slightly better filtering.  
- If performance is crucial, **Polars is the recommended tool** for fast and efficient data analysis. 🚀  


In [4]:
import polars as pl

# Load the dataset using Polars
file_path = "all_stocks_5yr.csv"
df = pl.read_csv(file_path)

# Function to compute alternative technical indicators with Polars
def compute_technical_indicators(df):
    return df.with_columns(
        pl.col("close").ewm_mean(span=10).alias("EMA_10"),
        (pl.col("close").ewm_mean(span=12) - pl.col("close").ewm_mean(span=26)).alias("MACD"),
        ((pl.col("high").rolling_max(window_size=14) - pl.col("close")) / 
         (pl.col("high").rolling_max(window_size=14) - pl.col("low").rolling_min(window_size=14)) * -100).alias("Williams_%R"),
        (pl.max_horizontal([
            pl.col("high") - pl.col("low"),
            (pl.col("high") - pl.col("close").shift(1)).abs(),
            (pl.col("low") - pl.col("close").shift(1)).abs()
        ]).rolling_mean(window_size=14)).alias("ATR_14")
    )

df = compute_technical_indicators(df)

In [5]:
df

date,open,high,low,close,volume,name,EMA_10,MACD,Williams_%R,ATR_14
str,f64,f64,f64,f64,i64,str,f64,f64,f64,f64
"""2013-02-08""",15.07,15.12,14.63,14.75,8407500,"""AAL""",14.75,0.0,,
"""2013-02-11""",14.89,15.01,14.26,14.46,8882000,"""AAL""",14.5905,-0.006506,,
"""2013-02-12""",14.45,14.51,14.1,14.27,8126000,"""AAL""",14.461661,-0.014182,,
"""2013-02-13""",14.3,14.94,14.25,14.66,10259500,"""AAL""",14.527005,-0.003245,,
"""2013-02-14""",14.94,14.96,13.16,13.99,31879900,"""AAL""",14.372845,-0.023895,,
…,…,…,…,…,…,…,…,…,…,…
"""2018-02-01""",76.84,78.27,76.69,77.82,2982259,"""ZTS""",77.722553,1.533067,-43.097015,1.197143
"""2018-02-02""",77.53,78.12,76.73,76.78,2595187,"""ZTS""",77.55118,1.36093,-68.089431,1.217857
"""2018-02-05""",76.64,76.92,73.18,73.83,2962031,"""ZTS""",76.874602,0.975229,-90.647482,1.4
"""2018-02-06""",72.74,74.56,72.13,73.27,4924323,"""ZTS""",76.21922,0.617255,-85.75,1.456429


In [6]:
## Checking NaN values
null_counts = df.select(pl.all().is_null().sum())
print(null_counts)

shape: (1, 11)
┌──────┬──────┬──────┬─────┬───┬────────┬──────┬─────────────┬────────┐
│ date ┆ open ┆ high ┆ low ┆ … ┆ EMA_10 ┆ MACD ┆ Williams_%R ┆ ATR_14 │
│ ---  ┆ ---  ┆ ---  ┆ --- ┆   ┆ ---    ┆ ---  ┆ ---         ┆ ---    │
│ u32  ┆ u32  ┆ u32  ┆ u32 ┆   ┆ u32    ┆ u32  ┆ u32         ┆ u32    │
╞══════╪══════╪══════╪═════╪═══╪════════╪══════╪═════════════╪════════╡
│ 0    ┆ 11   ┆ 8    ┆ 8   ┆ … ┆ 0      ┆ 0    ┆ 125         ┆ 125    │
└──────┴──────┴──────┴─────┴───┴────────┴──────┴─────────────┴────────┘


In [7]:
## Drop NaN values
df = df.drop_nulls()

In [8]:
## Check first five data
df.head()

date,open,high,low,close,volume,name,EMA_10,MACD,Williams_%R,ATR_14
str,f64,f64,f64,f64,i64,str,f64,f64,f64,f64
"""2013-02-28""",13.49,13.63,13.39,13.43,6143600,"""AAL""",13.548838,-0.140549,-69.834711,0.715
"""2013-03-01""",13.37,13.95,13.32,13.61,7376800,"""AAL""",13.560535,-0.12416,-60.606061,0.725
"""2013-03-04""",13.5,14.07,13.47,13.9,8174800,"""AAL""",13.62485,-0.092574,-46.902655,0.714286
"""2013-03-05""",14.01,14.05,13.71,14.05,7676100,"""AAL""",13.704787,-0.058055,-40.265487,0.709286
"""2013-03-06""",14.52,14.68,14.25,14.57,13243200,"""AAL""",13.866463,0.00188,-17.256637,0.705


In [9]:
features = ["open", "high", "low", "volume", "EMA_10", "MACD", "ATR_14", "Williams_%R"]
target = "close" 

In [10]:
from sklearn.model_selection import train_test_split
X = df[features]
y = df['close']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Required Libraries
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_error

In [12]:
# Extra Trees Regressor Model
et_model = ExtraTreesRegressor(n_estimators=100, random_state=42)
et_model.fit(X_train, y_train)
et_predictions = et_model.predict(X_test)
et_mae = mean_absolute_error(y_test, et_predictions)

# Random Forest Regressor Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_mae = mean_absolute_error(y_test, rf_predictions)

print(f"Extra Trees MAE: {et_mae:.2f}")
print(f"Random Forest MAE: {rf_mae:.2f}")

Extra Trees MAE: 0.27
Random Forest MAE: 0.26


In [None]:
import polars as pl
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Assuming your data is loaded into a Polars DataFrame `df`
# df = pl.read_csv("your_dataset.csv")

# Check if df is a Polars DataFrame
if isinstance(df, pl.DataFrame):
    print("df is a Polars DataFrame. Proceeding with conversion to Pandas.")
    # Convert Polars DataFrame to Pandas DataFrame
    df = df.to_pandas()
else:
    print("df is already a Pandas DataFrame.")

# Preprocessing: Convert date columns to numerical values (e.g., year, month, day)
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])  # Convert to datetime format
    df['year'] = df['date'].dt.year  # Extract year
    df['month'] = df['date'].dt.month  # Extract month
    df['day'] = df['date'].dt.day  # Extract day
    df.drop(columns=['date'], inplace=True)  # Drop original date column if not needed

# Loop through each company and train models individually
for company in df['name'].unique():
    print(f"Training models for company: {company}")
    
    # Filter data for the current company
    company_data = df[df['name'] == company]

    # Define the features (X) and target (y) for training
    X = company_data.drop(columns=['name', 'close'])  # Drop company name and target column
    y = company_data['close']  # 'close' as the target variable

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Extra Trees Regressor Model
    et_model = ExtraTreesRegressor(n_estimators=100, random_state=42)
    et_model.fit(X_train, y_train)
    et_predictions = et_model.predict(X_test)
    et_mae = mean_absolute_error(y_test, et_predictions)

    # Random Forest Regressor Model
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    rf_predictions = rf_model.predict(X_test)
    rf_mae = mean_absolute_error(y_test, rf_predictions)

    # Print the results for the current company
    print(f"Extra Trees MAE for {company}: {et_mae:.2f}")
    print(f"Random Forest MAE for {company}: {rf_mae:.2f}")
    print("-" * 50)



df is a Polars DataFrame. Proceeding with conversion to Pandas.
Training models for company: AAL
Extra Trees MAE for AAL: 0.31
Random Forest MAE for AAL: 0.32
--------------------------------------------------
Training models for company: AAPL
Extra Trees MAE for AAPL: 0.48
Random Forest MAE for AAPL: 0.56
--------------------------------------------------
Training models for company: AAP
Extra Trees MAE for AAP: 0.76
Random Forest MAE for AAP: 0.80
--------------------------------------------------
Training models for company: ABBV
Extra Trees MAE for ABBV: 0.35
Random Forest MAE for ABBV: 0.37
--------------------------------------------------
Training models for company: ABC
Extra Trees MAE for ABC: 0.34
Random Forest MAE for ABC: 0.35
--------------------------------------------------
Training models for company: ABT
Extra Trees MAE for ABT: 0.16
Random Forest MAE for ABT: 0.17
--------------------------------------------------
Training models for company: ACN
Extra Trees MAE for A