<a href="https://colab.research.google.com/github/2stndard/polars/blob/main/3%EC%9E%A5_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import time

In [2]:
## Generate sample data
# Build a large synthetic dataset (50 million rows) used to benchmark
# pandas against Polars in the cells below.
N_ROWS = 50_000_000  # single source of truth: every column must have this length
SEED = 42            # fixed seed so Restart & Run All regenerates identical data
rng = np.random.default_rng(SEED)
data = {
    'Age': rng.integers(18, 60, N_ROWS),            # integers in [18, 60)
    'Salary': rng.integers(30000, 90000, N_ROWS),   # integers in [30000, 90000)
    'Gender': rng.choice(['Male', 'Female'], N_ROWS)
}

In [3]:
# Wrap the generated columns in a pandas DataFrame and print its
# (rows, columns) shape as a quick sanity check.
df_pandas_data = pd.DataFrame(data=data)
print(df_pandas_data.shape)

(50000000, 3)


In [4]:
# Print the column dtypes and the frame's total memory footprint.
# memory_usage='deep' asks pandas to introspect object columns (Gender here)
# instead of estimating their size from the pointer array alone.
df_pandas_data.info(memory_usage = 'deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000000 entries, 0 to 49999999
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   Age     int64 
 1   Salary  int64 
 2   Gender  object
dtypes: int64(2), object(1)
memory usage: 3.6 GB


In [5]:
# Benchmark pandas: keep rows with Age > 30, then average Salary per Gender.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-run.
start_time = time.perf_counter()
filtered_pandas = df_pandas_data[df_pandas_data['Age'] > 30]
grouped_pandas = filtered_pandas.groupby('Gender')['Salary'].mean()
end_time = time.perf_counter()
print("Pandas Execution Time:", round(end_time - start_time, 3), "seconds")

Pandas Execution Time: 7.259 seconds


In [6]:
# Build the Polars counterpart from the very same column dictionary so both
# libraries are benchmarked on identical data, then print its shape.
df_polars_data = pl.DataFrame(data=data)
print(df_polars_data.shape)

(50000000, 3)


In [7]:
# Estimated in-memory size of the Polars frame, expressed in gigabytes.
df_polars_data.estimated_size(unit='gb')

0.9778955671936274

In [8]:
# Benchmark Polars on the same task: keep rows with Age > 30, then average
# Salary per Gender.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-run.
start_time = time.perf_counter()
# Filter with an expression rather than a pre-computed boolean Series, which
# matches the expression-based filters used elsewhere in this notebook.
filtered_polars = df_polars_data.filter(pl.col('Age') > 30)
grouped_polars = filtered_polars.group_by('Gender').agg(
    pl.col('Salary').mean().alias('Average Salary')
)
end_time = time.perf_counter()

print("Polars Execution Time:", round(end_time - start_time, 3), "seconds")

Polars Execution Time: 3.472 seconds


In [9]:
import polars as pl

In [11]:
# Load the Iris measurements into a Polars DataFrame.
# NOTE(review): /content is presumably the Colab working directory, so the
# CSV has to be uploaded there before this cell runs; confirm.
iris_csv_path = '/content/Iris.csv'
df = pl.read_csv(iris_csv_path)

In [12]:
# Preview the first five rows of the Iris frame.
df.head(n=5)

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
i64,f64,f64,f64,f64,str
1,5.1,3.5,1.4,0.2,"""Iris-setosa"""
2,4.9,3.0,1.4,0.2,"""Iris-setosa"""
3,4.7,3.2,1.3,0.2,"""Iris-setosa"""
4,4.6,3.1,1.5,0.2,"""Iris-setosa"""
5,5.0,3.6,1.4,0.2,"""Iris-setosa"""


In [13]:
# Summary statistics for every column: count, null_count, mean, std, min,
# quartiles and max. Statistics that do not apply to the string column
# (Species) are left empty in the result.
df.describe()

statistic,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
str,f64,f64,f64,f64,f64,str
"""count""",150.0,150.0,150.0,150.0,150.0,"""150"""
"""null_count""",0.0,0.0,0.0,0.0,0.0,"""0"""
"""mean""",75.5,5.843333,3.054,3.758667,1.198667,
"""std""",43.445368,0.828066,0.433594,1.76442,0.763161,
"""min""",1.0,4.3,2.0,1.0,0.1,"""Iris-setosa"""
"""25%""",38.0,5.1,2.8,1.6,0.3,
"""50%""",76.0,5.8,3.0,4.4,1.3,
"""75%""",113.0,6.4,3.3,5.1,1.8,
"""max""",150.0,7.9,4.4,6.9,2.5,"""Iris-virginica"""


In [14]:
# Project the frame down to the single Species column (still a DataFrame).
df.select('Species')

Species
str
"""Iris-setosa"""
"""Iris-setosa"""
"""Iris-setosa"""
"""Iris-setosa"""
"""Iris-setosa"""
…
"""Iris-virginica"""
"""Iris-virginica"""
"""Iris-virginica"""
"""Iris-virginica"""


In [15]:
# Count how many rows belong to each species. value_counts() yields one
# struct per distinct value, holding the value and its count.
species_counts = df.select(pl.col('Species').value_counts())
print(species_counts)

shape: (3, 1)
┌────────────────────────┐
│ Species                │
│ ---                    │
│ struct[2]              │
╞════════════════════════╡
│ {"Iris-virginica",50}  │
│ {"Iris-versicolor",50} │
│ {"Iris-setosa",50}     │
└────────────────────────┘


In [16]:
# Round PetalLengthCm to zero decimals. The expression keeps the original
# column name, so the existing column is replaced in the returned frame
# (df itself is not modified).
df.with_columns(pl.col('PetalLengthCm').round(decimals=0))

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
i64,f64,f64,f64,f64,str
1,5.1,3.5,1.0,0.2,"""Iris-setosa"""
2,4.9,3.0,1.0,0.2,"""Iris-setosa"""
3,4.7,3.2,1.0,0.2,"""Iris-setosa"""
4,4.6,3.1,2.0,0.2,"""Iris-setosa"""
5,5.0,3.6,1.0,0.2,"""Iris-setosa"""
…,…,…,…,…,…
146,6.7,3.0,5.0,2.3,"""Iris-virginica"""
147,6.3,2.5,5.0,1.9,"""Iris-virginica"""
148,6.5,3.0,5.0,2.0,"""Iris-virginica"""
149,6.2,3.4,5.0,2.3,"""Iris-virginica"""


In [17]:
# Same rounding as before, but aliased to a new name so the rounded values
# are appended as an extra column and PetalLengthCm is kept as-is.
rounded_petal_length = (
    pl.col('PetalLengthCm')
    .round(0)
    .alias('PetalLengthCm.rounded')
)
df.with_columns(rounded_petal_length)

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,PetalLengthCm.rounded
i64,f64,f64,f64,f64,str,f64
1,5.1,3.5,1.4,0.2,"""Iris-setosa""",1.0
2,4.9,3.0,1.4,0.2,"""Iris-setosa""",1.0
3,4.7,3.2,1.3,0.2,"""Iris-setosa""",1.0
4,4.6,3.1,1.5,0.2,"""Iris-setosa""",2.0
5,5.0,3.6,1.4,0.2,"""Iris-setosa""",1.0
…,…,…,…,…,…,…
146,6.7,3.0,5.2,2.3,"""Iris-virginica""",5.0
147,6.3,2.5,5.0,1.9,"""Iris-virginica""",5.0
148,6.5,3.0,5.2,2.0,"""Iris-virginica""",5.0
149,6.2,3.4,5.4,2.3,"""Iris-virginica""",5.0


In [18]:
# Append a constant string column: every row gets the literal value 'iris'.
df.with_columns(flowername=pl.lit('iris'))

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,flowername
i64,f64,f64,f64,f64,str,str
1,5.1,3.5,1.4,0.2,"""Iris-setosa""","""iris"""
2,4.9,3.0,1.4,0.2,"""Iris-setosa""","""iris"""
3,4.7,3.2,1.3,0.2,"""Iris-setosa""","""iris"""
4,4.6,3.1,1.5,0.2,"""Iris-setosa""","""iris"""
5,5.0,3.6,1.4,0.2,"""Iris-setosa""","""iris"""
…,…,…,…,…,…,…
146,6.7,3.0,5.2,2.3,"""Iris-virginica""","""iris"""
147,6.3,2.5,5.0,1.9,"""Iris-virginica""","""iris"""
148,6.5,3.0,5.2,2.0,"""Iris-virginica""","""iris"""
149,6.2,3.4,5.4,2.3,"""Iris-virginica""","""iris"""


In [19]:
# Label each row by petal length: "<=2" when PetalLengthCm is at most 2,
# otherwise '>2'. pl.lit() is required so the labels are treated as string
# values rather than as column names.
petal_length_label = (
    pl.when(pl.col('PetalLengthCm') <= 2)
    .then(pl.lit("<=2"))
    .otherwise(pl.lit('>2'))
)
df.with_columns(condition=petal_length_label)

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species,condition
i64,f64,f64,f64,f64,str,str
1,5.1,3.5,1.4,0.2,"""Iris-setosa""","""<=2"""
2,4.9,3.0,1.4,0.2,"""Iris-setosa""","""<=2"""
3,4.7,3.2,1.3,0.2,"""Iris-setosa""","""<=2"""
4,4.6,3.1,1.5,0.2,"""Iris-setosa""","""<=2"""
5,5.0,3.6,1.4,0.2,"""Iris-setosa""","""<=2"""
…,…,…,…,…,…,…
146,6.7,3.0,5.2,2.3,"""Iris-virginica""",""">2"""
147,6.3,2.5,5.0,1.9,"""Iris-virginica""",""">2"""
148,6.5,3.0,5.2,2.0,"""Iris-virginica""",""">2"""
149,6.2,3.4,5.4,2.3,"""Iris-virginica""",""">2"""


In [20]:
# Take the three rows at positions 1, 2 and 3 (zero-based) with all columns.
df.slice(1, 3)

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
i64,f64,f64,f64,f64,str
2,4.9,3.0,1.4,0.2,"""Iris-setosa"""
3,4.7,3.2,1.3,0.2,"""Iris-setosa"""
4,4.6,3.1,1.5,0.2,"""Iris-setosa"""


In [21]:
# Keep rows whose petal is longer than 6 cm AND narrower than 2 cm.
# Predicates passed as separate arguments are combined with logical AND.
df.filter(
    pl.col('PetalLengthCm') > 6,
    pl.col('PetalWidthCm') < 2,
)

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
i64,f64,f64,f64,f64,str
108,7.3,2.9,6.3,1.8,"""Iris-virginica"""
131,7.4,2.8,6.1,1.9,"""Iris-virginica"""


In [22]:
# Average petal length per species. The group order in the result is not
# sorted (see the output below).
species_groups = df.group_by('Species')
species_groups.agg(pl.mean('PetalLengthCm'))

Species,PetalLengthCm
str,f64
"""Iris-setosa""",1.464
"""Iris-virginica""",5.552
"""Iris-versicolor""",4.26


In [23]:
# Average every Float64 column per species. Selecting by dtype skips the
# integer Id column and the string Species column.
float_column_means = pl.col(pl.Float64).mean()
df.group_by('Species').agg(float_column_means)

Species,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
str,f64,f64,f64,f64
"""Iris-virginica""",6.588,2.974,5.552,2.026
"""Iris-versicolor""",5.936,2.77,4.26,1.326
"""Iris-setosa""",5.006,3.418,1.464,0.244


In [24]:
# Order rows by Species descending, breaking ties by PetalLengthCm ascending.
# The descending flags line up positionally with the sort keys.
df.sort(
    by=["Species", "PetalLengthCm"],
    descending=[True, False],
)

Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
i64,f64,f64,f64,f64,str
107,4.9,2.5,4.5,1.7,"""Iris-virginica"""
139,6.0,3.0,4.8,1.8,"""Iris-virginica"""
127,6.2,2.8,4.8,1.8,"""Iris-virginica"""
128,6.1,3.0,4.9,1.8,"""Iris-virginica"""
122,5.6,2.8,4.9,2.0,"""Iris-virginica"""
…,…,…,…,…,…
21,5.4,3.4,1.7,0.2,"""Iris-setosa"""
24,5.1,3.3,1.7,0.5,"""Iris-setosa"""
6,5.4,3.9,1.7,0.4,"""Iris-setosa"""
25,4.8,3.4,1.9,0.2,"""Iris-setosa"""


In [25]:
# Small long-format table (one row per city) used to demonstrate
# pivot / unpivot in the following cells.
city_columns = {
    'country': ['France'] * 3 + ['Italy'] * 3,
    'city': ["Paris", "Lille", "Nice", "Roma", "Milan", "Napoli"],
    'location': ["North", "North", "South", "South", "North", "South"],
    'population': [2.1, 0.2, 0.4, 2.8, 1.4, 3.0],
}
df_pivot = pl.DataFrame(city_columns)

In [26]:
# Reshape to wide format: one row per country, one column per city, cells
# filled with that city's population (null where the city is in the other
# country).
df_pivot.pivot("city", index="country", values="population")

country,Paris,Lille,Nice,Roma,Milan,Napoli
str,f64,f64,f64,f64,f64,f64
"""France""",2.1,0.2,0.4,,,
"""Italy""",,,,2.8,1.4,3.0


In [27]:
# Reshape to one row per country and one column per location, averaging
# the population of the cities that share a (country, location) pair.
# NOTE: this rebinds df_pivot to the pivoted result, which no longer has the
# 'city' / 'location' / 'population' columns. Re-running this cell (or the
# previous pivot cell) therefore requires rebuilding df_pivot first.
df_pivot = df_pivot.pivot(
    "location",
    index="country",
    values="population",
    aggregate_function='mean',
)
df_pivot

country,North,South
str,f64,f64
"""France""",1.15,0.4
"""Italy""",1.4,2.9


In [28]:
# Convert the wide North/South table back to long format: one row per
# (country, location) pair, with the column name under 'variable' and the
# cell content under 'value'.
df_pivot.unpivot(['North', 'South'], index='country')

country,variable,value
str,str,f64
"""France""","""North""",1.15
"""Italy""","""North""",1.4
"""France""","""South""",0.4
"""Italy""","""South""",2.9
