In [1]:
import random
import pandas as pd 
import numpy as np
from scipy.sparse import csr_matrix

## Basic operations

In [144]:
# Build a 5x4 frame of standard-normal samples.
# A generator seeded inside this cell keeps the notebook reproducible:
# re-running the cell (or the whole notebook) always yields the same values.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(rng.standard_normal((5, 4)))
df

Unnamed: 0,0,1,2,3
0,-0.320398,-0.397417,0.410847,0.856481
1,-0.146387,2.339511,1.307196,1.130137
2,-1.516584,-0.9063,0.020025,-0.429329
3,-1.443883,-1.496697,-1.521724,0.137609
4,0.856799,-1.170585,0.336923,-1.840485


In [157]:
# Overwrite the first two columns with NaN, then display the frame.
df.iloc[:, :2] = np.nan
df

Unnamed: 0,0,1,2,3
0,,,0.410847,0.856481
1,,,1.307196,1.130137
2,,,0.020025,-0.429329
3,,,-1.521724,0.137609
4,,,0.336923,-1.840485


In [151]:
# Convert the frame to sparse float columns.
# Any scalar could serve as the sparse fill value; NaN is used here.
sdf = df.astype(
    pd.SparseDtype(dtype="float", fill_value=np.nan)
)

In [152]:
# Preview the first rows of the sparse frame.
sdf.head()

Unnamed: 0,0,1,2,3
0,,,0.410847,0.856481
1,,,1.307196,1.130137
2,,,0.020025,-0.429329
3,,,-1.521724,0.137609
4,,,0.336923,-1.840485


In [153]:
# List the dtype of every column of the sparse frame.
sdf.dtypes

0    Sparse[float64, nan]
1    Sparse[float64, nan]
2    Sparse[float64, nan]
3    Sparse[float64, nan]
dtype: object

In [154]:
# Report the sparse density together with the frame's summed memory usage.
(
    sdf.sparse.density,
    sdf.memory_usage().sum(),
)

(0.5, 248)

In [155]:
# Basic arithmetic still works on the frame after the sparse conversion.
sdf + 5

Unnamed: 0,0,1,2,3
0,,,5.410847,5.856481
1,,,6.307196,6.130137
2,,,5.020025,4.570671
3,,,3.478276,5.137609
4,,,5.336923,3.159515


In [156]:
# NumPy functions such as np.square can be applied to the sparse frame directly.
np.square(sdf)

Unnamed: 0,0,1,2,3
0,,,0.168795,0.73356
1,,,1.70876,1.277211
2,,,0.000401,0.184323
3,,,2.315643,0.018936
4,,,0.113517,3.387385


## Using SciPy

In [137]:
from scipy import sparse

In [158]:
# Build a SciPy identity matrix with one row/column per label and wrap it
# in a DataFrame through the pandas sparse accessor.
columns = list("ABCDE")
mat = sparse.eye(len(columns))
df = pd.DataFrame.sparse.from_spmatrix(mat, columns=columns)

In [159]:
# Display the frame built from the SciPy matrix.
df

Unnamed: 0,A,B,C,D,E
0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0


In [161]:
# Every column reports a Sparse[...] dtype, which confirms the frame is stored sparsely.
df.dtypes

A    Sparse[float64, 0]
B    Sparse[float64, 0]
C    Sparse[float64, 0]
D    Sparse[float64, 0]
E    Sparse[float64, 0]
dtype: object

In [142]:
# Convert the sparse frame back to an ordinary dense DataFrame.
df_dense = df.sparse.to_dense()

In [143]:
# Check the column dtypes of the densified frame.
df_dense.dtypes

A    float64
B    float64
C    float64
D    float64
E    float64
dtype: object