# Selecting and manipulating data with Polars

In [1]:
import polars as pl

In [2]:
import sklearn
import sklearn.datasets

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows:

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and:

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

In [3]:
# Fetch the Ames housing data ("house_prices" on OpenML), as in the snippet above.
ames_data = sklearn.datasets.fetch_openml(name="house_prices", as_frame=True)

In [4]:
# List the fields available on the fetched Ames dataset object.
ames_data.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [5]:
# Fetch the California housing dataset; this is the data used for the rest of the notebook.
cal_data = sklearn.datasets.fetch_california_housing()

In [6]:
# List the fields available on the California dataset object ("data", "target", "feature_names", ...).
cal_data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [7]:
# Turn the feature matrix into a polars DataFrame, using the dataset's
# feature names as the column names.
df = pl.from_numpy(
    data=cal_data["data"],
    schema=cal_data["feature_names"],
)

In [8]:
# Append the target values as an extra column, named after the dataset's
# first (and here only displayed) target name.
df = df.with_columns(
    pl.Series(name=cal_data["target_names"][0], values=cal_data["target"]),
)

In [9]:
# Display the frame: feature columns plus the appended target column.
df

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
f64,f64,f64,f64,f64,f64,f64,f64,f64
8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
…,…,…,…,…,…,…,…,…
1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,0.847


## Data selection

### Columns

- Selectors: https://docs.pola.rs/api/python/stable/reference/selectors.html

In [10]:
# NOTE(review): called without any columns or expressions — looks like a
# placeholder for the column-selection examples; TODO fill in.
df.select()

In [11]:
# NOTE(review): called without any expressions, so the frame is shown
# unchanged — looks like a placeholder; TODO add column expressions.
df.with_columns()

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
f64,f64,f64,f64,f64,f64,f64,f64,f64
8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
…,…,…,…,…,…,…,…,…
1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,0.847


### Rows

In [12]:
# Slice by position: start at offset 2000 (no length given, so the rest of the frame is kept).
df.slice(2000)

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
f64,f64,f64,f64,f64,f64,f64,f64,f64
2.5045,15.0,4.103933,1.16573,924.0,2.595506,36.74,-119.78,0.903
1.2375,35.0,4.459916,1.067511,2050.0,4.324895,36.75,-119.78,0.5
1.2813,31.0,3.627907,0.979328,1515.0,3.914729,36.75,-119.78,0.564
1.0513,35.0,2.951557,1.024221,1228.0,4.249135,36.74,-119.79,0.396
2.1094,52.0,2.059524,1.035714,401.0,4.77381,36.74,-119.79,0.75
…,…,…,…,…,…,…,…,…
1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,0.847


In [13]:
# NOTE(review): called without a predicate, so the frame is shown
# unfiltered — looks like a placeholder; TODO add a filter expression.
df.filter()

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
f64,f64,f64,f64,f64,f64,f64,f64,f64
8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
…,…,…,…,…,…,…,…,…
1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
1.7,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
1.8672,18.0,5.329513,1.17192,741.0,2.123209,39.43,-121.32,0.847


### Combination

## Manipulating data

## Addendum

### Data- or LazyFrame? Lazy operations?

### Using square brackets

It is recommended to use expressions to select and slice data.

However, you can also use square brackets to select rows and columns, as the following cells show:

In [14]:
# A single column name in square brackets selects that one column.
df["MedInc"]

MedInc
f64
8.3252
8.3014
7.2574
5.6431
3.8462
…
1.5603
2.5568
1.7
1.8672


In [15]:
# A list of column names in square brackets selects those columns.
df[["MedInc", "Population"]]

MedInc,Population
f64,f64
8.3252,322.0
8.3014,2401.0
7.2574,496.0
5.6431,558.0
3.8462,565.0
…,…
1.5603,845.0
2.5568,356.0
1.7,1007.0
1.8672,741.0


In [16]:
# Row slice and column list combined: the first two rows of the two named columns.
df[0:2, ["MedInc", "Population"]]

MedInc,Population
f64,f64
8.3252,322.0
8.3014,2401.0


In [17]:
# Square brackets do not accept expressions: indexing with one raises a
# TypeError. The error is the point of this cell, so it is caught and printed
# rather than left to abort a "Restart & Run All".
# For expression-based row filtering use df.filter(pl.col("Population") > 350).
try:
    df[pl.col("Population") > 350]
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: cannot select columns using key of type 'Expr': <Expr ['[(col("Population")) > (dyn in…'] at 0x706C9B088C50>


- https://docs.pola.rs/api/python/stable/reference/dataframe/api/polars.DataFrame.__getitem__.html#polars.DataFrame.__getitem__
- https://docs.pola.rs/user-guide/migration/pandas/#selecting-data
- https://typethepipe.com/vizs-and-tips/python-polars-selectors-select-multiple-columns/