In [1]:
# Importing necessary libraries
import time
import pandas as pd
import polars as pl
import numpy as np

In [2]:
# Number of rows for the data
nums_rows = 120000

# Creating a dictionary with random data (dummies data)
data = {
    'A': np.random.randint(0, 100,nums_rows),
    'B': np.random.randint(0, 100,nums_rows),
    'C': np.random.rand(nums_rows)
}

In [3]:
# Creating DataFrame using Pandas
data_pandas = pd.DataFrame(data)

# Creating DataFrame using Polars
data_polars = pl.DataFrame(data)

In [4]:
# Function to time the execution
def time_operation(func, *args):
    start_time = time.time()
    result = func(*args)
    end_time = time.time()
    return result, end_time - start_time

In [5]:
# Function to read data using Pandas
def read_data_pandas():
    return pd.DataFrame(data)

# Function to read data using Polars
def read_data_polars():
    return pl.DataFrame(data)

In [6]:
# Function to perform aggregation using Pandas
def aggregation_pandas(data):
    return data.groupby('A').agg({'B': 'mean', 'C': 'sum'})

# Function to perform aggregation using Polars
def aggregation_polars(data):
    return data.group_by('A').agg(pl.col('B').mean(), pl.col('C').sum())

In [7]:
# Function to filter data using Pandas
def filtering_pandas(data):
    return data[data['A'] > 50]

# Function to filter data using Polars
def filtering_polars(data):
    return data.filter(pl.col('A') > 50)

In [8]:
# Function to join data using Pandas
def join_pandas(data1, data2):
    return data1.merge(data2, on='A')

# Function to join data using Polars
def join_polars(data1, data2):
    return data1.join(data2, on='A')

In [9]:
# Creating copies of the data for joining
data_pandas2 = data_pandas.copy()
data_polars2 = data_polars.clone()

# Dictionary to store the results of each operation
results = {}

# Timing the data reading operations
_, read_pandas_time = time_operation(read_data_pandas)
_, read_polars_time = time_operation(read_data_polars)
results['Read Data'] = {'Pandas': read_pandas_time, 'Polars': read_polars_time}

# Timing the aggregation operations
_, agg_pandas_time = time_operation(aggregation_pandas, data_pandas)
_, agg_polars_time = time_operation(aggregation_polars, data_polars)
results['Aggregation'] = {'Pandas': agg_pandas_time, 'Polars': agg_polars_time}

# Timing the filtering operations
_, filter_pandas_time = time_operation(filtering_pandas, data_pandas)
_, filter_polars_time = time_operation(filtering_polars, data_polars)
results['Filtering'] = {'Pandas': filter_pandas_time, 'Polars': filter_polars_time}

# Timing the joining operations
_, join_pandas_time = time_operation(join_pandas, data_pandas, data_pandas2)
_, join_polars_time = time_operation(join_polars, data_polars, data_polars2)
results['Joining'] = {'Pandas': join_pandas_time, 'Polars': join_polars_time}

In [10]:
# Converting the results dictionary to a DataFrame
results_data = pd.DataFrame(results)
results_data

Unnamed: 0,Read Data,Aggregation,Filtering,Joining
Pandas,0.001002,0.003001,0.000999,4.09397
Polars,0.0,0.001996,0.001003,0.946711


In [11]:
# Example 2 (Pandas): Creating a new DataFrame and adding a new column based on a condition
new_data = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, 8]
})

# Adding column 'C' with conditional values
new_data['C'] = new_data.apply(lambda row: row['A'] + row['B'] if row['A'] % 2 == 0 else row['A'] - row['B'], axis=1)

print(new_data)

   A  B   C
0  1  5  -4
1  2  6   8
2  3  7  -4
3  4  8  12


In [12]:
# Example 2 (Polars): Creating a new DataFrame and adding a new column based on a condition
new_data = pl.DataFrame({
    'A': [1, 2, 3, 4],
    'B': [5, 6, 7, 8]
})

# Adding column 'C' with conditional values
new_data = new_data.with_columns(
    pl.when(pl.col('A') % 2 == 0)
    .then(pl.col('A') + pl.col('B'))
    .otherwise(pl.col('A') - pl.col('B'))
    .alias('C')
)

print(new_data)

shape: (4, 3)
┌─────┬─────┬─────┐
│ A   ┆ B   ┆ C   │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1   ┆ 5   ┆ -4  │
│ 2   ┆ 6   ┆ 8   │
│ 3   ┆ 7   ┆ -4  │
│ 4   ┆ 8   ┆ 12  │
└─────┴─────┴─────┘


# My personal opinion:
Based on the provided results, here is my personal opinion on Polars vs. Pandas:
## Performance Comparison

1. **Read Data:**
   - **Pandas:** 0.001002 seconds
   - **Polars:** 0.000000 seconds
   
   Polars is significantly faster than Pandas when it comes to reading data.

2. **Aggregation:**
   - **Pandas:** 0.003001 seconds
   - **Polars:** 0.001996 seconds
   
   Polars shows a slight edge in aggregation operations.

3. **Filtering:**
   - **Pandas:** 0.000999 seconds
   - **Polars:** 0.001003 seconds
   
   Pandas performs slightly better in filtering operations.

4. **Joining:**
   - **Pandas:** 4.093970 seconds
   - **Polars:** 0.946711 seconds
   
   Polars is much faster than Pandas for join operations, with a significant difference in execution time.

## General Observations

- **Speed:** Polars consistently demonstrates faster performance in reading, aggregating, and joining data. The differences in speed are most pronounced in the joining operation, where Polars is substantially quicker.
- **Syntax:** Polars offers a more concise syntax for certain operations, which can be more intuitive once familiar. However, Pandas is widely used and has a vast amount of documentation and community support.
- **Scalability:** Polars is designed to handle large datasets efficiently, making it a strong candidate for big data applications. Its performance benefits become more noticeable as the size of the dataset increases.

## Conclusion

While Pandas is a mature and widely-used library with extensive support and resources, Polars is emerging as a powerful alternative for performance-critical applications. If your work involves processing large datasets or you require the fastest possible execution times for specific operations, Polars is worth considering. However, for general-purpose data manipulation, especially when community support and existing familiarity are important, Pandas remains a solid choice.