# Python Lab 04a: Introduction to Scikit-Learn (PCA, Kmeans, etc.) and to Pandas

## Francesco Della Santa, Computational Linear Algebra for Large Scale Problems, Politecnico di Torino

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


## Initialize PCA objects

In [2]:
pca_general = PCA()
pca_perc = PCA(n_components=0.5, svd_solver='full')
pca_ncomp = PCA(n_components=7)

### And let's use them...

In [3]:
N, n = 1000, 100
S = np.random.rand(N, n)

# Start using the pca object
pca_ncomp.fit(S)
pca_perc.fit(S)

# Representation of S in the m-dim space of PCs
Qm_ncomp = pca_ncomp.transform(S)
Qm_perc = pca_perc.transform(S)

# Approximation of S obtained using m PCs
Stilde_ncomp = pca_ncomp.inverse_transform(Qm_ncomp)
Stilde_perc = pca_perc.inverse_transform(Qm_perc)

print('*********************** DATASET S ***********************')
display(S)
print('*********************************************************')
print('')
print(f'*********************** DATASET PROJECTED (ncomp: {pca_ncomp.n_components_} PCs) ***********************')
display(Qm_ncomp)
print('*********************************************************************************************************')
print('')
print(f'*********************** DATASET PROJECTED (perc: {pca_perc.n_components_} PCs) ************************')
display(Qm_perc)
print('********************************************************************************************************')
print('')
print('*********************** RECOVERED DATASET S (ncomp) ***********************')
display(Stilde_ncomp)
print('***************************************************************************')
print('')
print('*********************** RECOVERED DATASET S (perc) ***********************')
display(Stilde_perc)
print('**************************************************************************')

*********************** DATASET S ***********************


array([[0.44803678, 0.72129292, 0.66499731, ..., 0.46181752, 0.13549445,
        0.43259962],
       [0.80942453, 0.15145725, 0.67695802, ..., 0.44528302, 0.95060429,
        0.94707229],
       [0.7593011 , 0.66041691, 0.11657051, ..., 0.29408057, 0.83784082,
        0.62837131],
       ...,
       [0.86597371, 0.06158437, 0.06456138, ..., 0.47412807, 0.70829862,
        0.50148986],
       [0.25004315, 0.92267323, 0.27806626, ..., 0.21961048, 0.33456625,
        0.9079498 ],
       [0.90208471, 0.22675268, 0.23821032, ..., 0.88490443, 0.4038326 ,
        0.40756518]])

*********************************************************

*********************** DATASET PROJECTED (ncomp: 7 PCs) ***********************


array([[ 0.19143596, -0.06525626, -0.1341026 , ...,  0.17337555,
        -0.27285551,  0.15477667],
       [-0.89670253, -0.39761108,  0.10472288, ...,  0.08429767,
        -0.14144705, -0.032583  ],
       [-0.50915913,  0.45040079, -0.05233917, ..., -0.18997798,
        -0.19461427,  0.5299346 ],
       ...,
       [ 0.03802245, -0.69229358,  0.2551665 , ..., -0.81654088,
        -0.34482747,  0.43008589],
       [ 0.21953548,  0.11211792, -0.22606317, ..., -0.10188934,
        -0.60674507, -0.330214  ],
       [ 0.30653462,  0.62014272, -0.3053524 , ..., -0.44310304,
         0.1808224 , -0.00807624]])

*********************************************************************************************************

*********************** DATASET PROJECTED (perc: 38 PCs) ************************


array([[ 0.2124041 , -0.07089708,  0.18831163, ...,  0.00736104,
        -0.16867982,  0.14170654],
       [-0.87496169, -0.66561667,  0.08469299, ..., -0.2103309 ,
        -0.59244947,  0.12052345],
       [-0.48074577,  0.48016478, -0.23436394, ..., -0.28051888,
         0.02211495,  0.06763604],
       ...,
       [ 0.07486451, -0.35900522,  0.10593167, ...,  0.22917736,
        -0.08453159,  0.55695876],
       [ 0.2728795 ,  0.15601535,  0.15327854, ...,  0.06080648,
         0.62261675, -0.74818963],
       [ 0.19894246,  0.61982119, -0.04889108, ...,  0.01009683,
        -0.24454166,  0.00787916]])

********************************************************************************************************

*********************** RECOVERED DATASET S (ncomp) ***********************


array([[0.43393518, 0.52832273, 0.55271249, ..., 0.49695416, 0.446727  ,
        0.46579479],
       [0.49722457, 0.50528734, 0.58144121, ..., 0.47024724, 0.61977767,
        0.64651157],
       [0.56885406, 0.25403653, 0.21742566, ..., 0.54794299, 0.55168731,
        0.58267385],
       ...,
       [0.57988961, 0.58711113, 0.35474111, ..., 0.50139409, 0.63879801,
        0.45489191],
       [0.36251675, 0.47364278, 0.48540028, ..., 0.44151149, 0.41299587,
        0.42789186],
       [0.53379333, 0.48366347, 0.3649358 , ..., 0.4775457 , 0.48223893,
        0.43423771]])

***************************************************************************

*********************** RECOVERED DATASET S (perc) ***********************


array([[ 0.31395964,  0.51743789,  0.43662324, ...,  0.4079689 ,
         0.24358497,  0.35489997],
       [ 0.75746613,  0.44305457,  0.68781404, ...,  0.58094821,
         0.68806871,  0.72283448],
       [ 0.81974344,  0.4730806 , -0.05561242, ...,  0.46286053,
         0.69183634,  0.79715472],
       ...,
       [ 0.70800851,  0.2387695 ,  0.36310615, ...,  0.38846387,
         0.87590291,  0.33587253],
       [ 0.37805113,  0.77673055,  0.43790157, ...,  0.18289552,
         0.3709523 ,  0.7987082 ],
       [ 0.77188733,  0.43894983,  0.38415046, ...,  0.88843458,
         0.69159773,  0.50345679]])

**************************************************************************


In [4]:
display(pca_ncomp.explained_variance_ratio_)

array([0.01713306, 0.01614954, 0.01612618, 0.01588559, 0.01543022,
       0.01538071, 0.01500091])

## Initialize the Standard Scaler

In [5]:
scaler_recent = StandardScaler(with_std=False)
scaler_znorm = StandardScaler()
# Start using the scaler objects
scaler_recent.fit(S)
scaler_znorm.fit(S)
# recentered S
Sbar = scaler_recent.transform(S)
# standardized S
Shat = scaler_znorm.transform(S)

print(f'*********************** DATASET RECENTERED ***********************')
print('SAMPLE MEAN OF RECENTERED DATA:')
display(Sbar.mean(axis=1)[:10])
print('SAMPLE ST.DEV. OF RECENTERED DATA:')
display(Sbar.std(axis=1)[:10])
print('')
display(Sbar)
print('*******************************************************************')
print('')
print(f'*********************** DATASET STANDARDIZED ***********************')
print('SAMPLE MEAN OF STANDARDIZED DATA:')
display(Shat.mean(axis=1)[:10])
print('SAMPLE ST.DEV. OF STANDARDIZED DATA:')
display(Shat.std(axis=1)[:10])
print('')
display(Shat)
print('**********************************************************************')

*********************** DATASET RECENTERED ***********************
SAMPLE MEAN OF RECENTERED DATA:


array([-0.02200652,  0.04949518, -0.0212853 ,  0.002105  ,  0.01455074,
        0.02088604, -0.02397331, -0.0138903 ,  0.00831277,  0.06872925])

SAMPLE ST.DEV. OF RECENTERED DATA:


array([0.27189358, 0.26909503, 0.27878131, 0.27383112, 0.29432719,
       0.27984366, 0.28482845, 0.28035614, 0.29312737, 0.29014242])




array([[-0.05744344,  0.21795929,  0.16741691, ..., -0.04396553,
        -0.3771537 , -0.06329526],
       [ 0.30394431, -0.35187638,  0.17937762, ..., -0.06050002,
         0.43795614,  0.4511774 ],
       [ 0.25382087,  0.15708328, -0.38100989, ..., -0.21170247,
         0.32519267,  0.13247642],
       ...,
       [ 0.36049349, -0.44174926, -0.43301902, ..., -0.03165497,
         0.19565047,  0.00559497],
       [-0.25543707,  0.4193396 , -0.21951414, ..., -0.28617256,
        -0.1780819 ,  0.41205492],
       [ 0.39660449, -0.27658096, -0.25937008, ...,  0.37912139,
        -0.10881555, -0.0883297 ]])

*******************************************************************

*********************** DATASET STANDARDIZED ***********************
SAMPLE MEAN OF STANDARDIZED DATA:


array([-0.07840277,  0.17239716, -0.07238442,  0.00755748,  0.0516825 ,
        0.07420248, -0.08255673, -0.0474839 ,  0.02792655,  0.23686958])

SAMPLE ST.DEV. OF STANDARDIZED DATA:


array([0.94386755, 0.93496549, 0.96728693, 0.94929248, 1.01901619,
       0.9682958 , 0.98797168, 0.97429839, 1.01538217, 1.00355829])




array([[-0.1995472 ,  0.75549522,  0.56231888, ..., -0.15209891,
        -1.30050832, -0.21958831],
       [ 1.05584266, -1.21968157,  0.60249244, ..., -0.20930007,
         1.51016841,  1.56525595],
       [ 0.88172372,  0.54448547, -1.27973369, ..., -0.73238552,
         1.12133535,  0.4595964 ],
       ...,
       [ 1.25228338, -1.53120089, -1.45442162, ..., -0.10951049,
         0.67464556,  0.01941047],
       [-0.88733807,  1.45352403, -0.73730275, ..., -0.99001508,
        -0.61406529,  1.42952951],
       [ 1.37772588, -0.95869092, -0.87117062, ...,  1.31157192,
        -0.37521979, -0.30643952]])

**********************************************************************


### Let's apply the PCA to standardized data

In [6]:
pca = PCA(n_components=7)

# Start with PCA
pca.fit(Shat)
Qm = pca.transform(Shat)

# Recovering of Shat_tilde
Shat_tilde = pca.inverse_transform(Qm)

# Recovering of S_tilde
S_tilde = scaler_znorm.inverse_transform(Shat_tilde)

print('*********************** RECOVERED DATASET Shat ***********************')
display(Shat_tilde)
print('**********************************************************************')
print('*********************** RECOVERED DATASET S ***********************')
display(Shat)
print('*******************************************************************')

*********************** RECOVERED DATASET Shat ***********************


array([[-0.13512288,  0.26692973,  0.14111788, ...,  0.00113901,
        -0.16389678, -0.0757107 ],
       [-0.13944136,  0.19615002,  0.21532564, ...,  0.0625367 ,
         0.28771108,  0.46587122],
       [ 0.44592719, -0.68953136, -0.64549856, ..., -0.01711101,
         0.26630799,  0.17132158],
       ...,
       [ 0.44465478,  0.09529308, -0.39987713, ..., -0.15947556,
         0.37004682, -0.25917965],
       [-0.41855022,  0.12683171, -0.25818889, ...,  0.16498091,
        -0.31373537, -0.33490777],
       [ 0.66562344, -0.03995053, -0.5538478 , ..., -0.05856073,
         0.03689237,  0.07931234]])

**********************************************************************
*********************** RECOVERED DATASET S ***********************


array([[-0.1995472 ,  0.75549522,  0.56231888, ..., -0.15209891,
        -1.30050832, -0.21958831],
       [ 1.05584266, -1.21968157,  0.60249244, ..., -0.20930007,
         1.51016841,  1.56525595],
       [ 0.88172372,  0.54448547, -1.27973369, ..., -0.73238552,
         1.12133535,  0.4595964 ],
       ...,
       [ 1.25228338, -1.53120089, -1.45442162, ..., -0.10951049,
         0.67464556,  0.01941047],
       [-0.88733807,  1.45352403, -0.73730275, ..., -0.99001508,
        -0.61406529,  1.42952951],
       [ 1.37772588, -0.95869092, -0.87117062, ...,  1.31157192,
        -0.37521979, -0.30643952]])

*******************************************************************


## Initialize $k$-Means

In [7]:
kmeans_default = KMeans()
kmeans_3c = KMeans(n_clusters=3, init='random', algorithm='full')

W0 = np.random.rand(3, 10)
kmeans_3cW0 = KMeans(n_clusters=3, init=W0, algorithm='full')

### And let's use one of them...

In [8]:
Snew = np.random.rand(N, n)

km = kmeans_3c  # change the KMeans object here if you want to try another one

# Start using the km object, fitting it on the data S
km.fit(S)

# Prediction of cluster belonging w.r.t. S
S_labels = km.labels_

# Prediction of cluster belonging w.r.t. Snew
Snew_labels = km.predict(Snew)

print(f'*********************** S labels ***********************')
display(S_labels[:10])
print('*********************************************************')
print('')
print(f'*********************** Snew labels ***********************')
display(Snew_labels[:10])
print('************************************************************')

*********************** S labels ***********************




array([2, 1, 1, 2, 0, 0, 1, 2, 1, 0], dtype=int32)

*********************************************************

*********************** Snew labels ***********************


array([2, 2, 0, 1, 1, 1, 1, 0, 0, 0], dtype=int32)

************************************************************


## Initialize Serieses

### Using Arrays

In [9]:
x = np.random.rand(10)
x

array([0.83850895, 0.94567261, 0.5155159 , 0.61776889, 0.13027081,
       0.30949194, 0.00495843, 0.90583258, 0.08530988, 0.01324445])

In [10]:
s1 = pd.Series(x, index=[f'index{i}' for i in range(1,11)], name='my_series1')
s2 = pd.Series(x, name='my_series2')

In [11]:
s1 

index1     0.838509
index2     0.945673
index3     0.515516
index4     0.617769
index5     0.130271
index6     0.309492
index7     0.004958
index8     0.905833
index9     0.085310
index10    0.013244
Name: my_series1, dtype: float64

In [12]:
s1['index1'] = 0

In [13]:
x

array([0.        , 0.94567261, 0.5155159 , 0.61776889, 0.13027081,
       0.30949194, 0.00495843, 0.90583258, 0.08530988, 0.01324445])

In [14]:
s2

0    0.000000
1    0.945673
2    0.515516
3    0.617769
4    0.130271
5    0.309492
6    0.004958
7    0.905833
8    0.085310
9    0.013244
Name: my_series2, dtype: float64

### Using Dictionaries

In [15]:
d = {'Age':30, 'Height':185, 'Weight':90}
d

{'Age': 30, 'Height': 185, 'Weight': 90}

In [16]:
s1d = pd.Series(d)

In [17]:
s1d

Age        30
Height    185
Weight     90
dtype: int64

## Initialize DataFrames

### Using Dictionaries

In [18]:
D = {'Float_random':np.random.rand(10), 'Integer_random':np.random.permutation(10)}
D

{'Float_random': array([0.18614063, 0.61816902, 0.58568968, 0.20702444, 0.79041852,
        0.31322631, 0.30699054, 0.55788872, 0.925253  , 0.7093775 ]),
 'Integer_random': array([0, 8, 6, 2, 3, 7, 4, 5, 1, 9])}

In [19]:
df1d = pd.DataFrame(D)
df1d

Unnamed: 0,Float_random,Integer_random
0,0.186141,0
1,0.618169,8
2,0.58569,6
3,0.207024,2
4,0.790419,3
5,0.313226,7
6,0.306991,4
7,0.557889,5
8,0.925253,1
9,0.709378,9


In [20]:
df1d.dtypes

Float_random      float64
Integer_random      int64
dtype: object

### Using Arrays

In [21]:
X = np.random.rand(10,5)
X

array([[0.09063792, 0.1654076 , 0.5399402 , 0.92403697, 0.00416718],
       [0.08541754, 0.18464366, 0.15334953, 0.12415208, 0.56713473],
       [0.70955137, 0.88405253, 0.31704615, 0.22289722, 0.34469882],
       [0.64374726, 0.43870604, 0.06722401, 0.43518456, 0.70336004],
       [0.56449627, 0.21225208, 0.88023891, 0.56725608, 0.73709372],
       [0.91148982, 0.98742224, 0.76454642, 0.99278533, 0.20031286],
       [0.69647757, 0.98461031, 0.38397471, 0.66478144, 0.06909106],
       [0.87777943, 0.66311598, 0.87251321, 0.26179558, 0.3809201 ],
       [0.74244651, 0.33526865, 0.49599132, 0.43277401, 0.88098925],
       [0.94979178, 0.8010462 , 0.00884765, 0.6564167 , 0.68387257]])

In [22]:
df1 = pd.DataFrame(X, index=range(1, X.shape[0] + 1), columns=[f'column_{i}' for i in range(1, X.shape[1] + 1)])
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5
1,0.090638,0.165408,0.53994,0.924037,0.004167
2,0.085418,0.184644,0.15335,0.124152,0.567135
3,0.709551,0.884053,0.317046,0.222897,0.344699
4,0.643747,0.438706,0.067224,0.435185,0.70336
5,0.564496,0.212252,0.880239,0.567256,0.737094
6,0.91149,0.987422,0.764546,0.992785,0.200313
7,0.696478,0.98461,0.383975,0.664781,0.069091
8,0.877779,0.663116,0.872513,0.261796,0.38092
9,0.742447,0.335269,0.495991,0.432774,0.880989
10,0.949792,0.801046,0.008848,0.656417,0.683873


## Extract/Add Column

In [23]:
df1['column_2']

1     0.165408
2     0.184644
3     0.884053
4     0.438706
5     0.212252
6     0.987422
7     0.984610
8     0.663116
9     0.335269
10    0.801046
Name: column_2, dtype: float64

In [24]:
df1['column_6'] = np.random.rand(10)
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.090638,0.165408,0.53994,0.924037,0.004167,0.863345
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


## DataFrame Attributes

We use the DataFrame df1 defined above.

In [25]:
df1.at[6, 'column_2']

0.9874222384807315

In [26]:
df1.iat[5, 1]

0.9874222384807315

In [27]:
df1.index

RangeIndex(start=1, stop=11, step=1)

In [28]:
df1.columns

Index(['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6'], dtype='object')

In [29]:
df1.axes

[RangeIndex(start=1, stop=11, step=1),
 Index(['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6'], dtype='object')]

In [30]:
df1.loc[[1,7,10], :]

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.090638,0.165408,0.53994,0.924037,0.004167,0.863345
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


In [31]:
df1.loc[[1,7,10], ['column_1', 'column_3']]

Unnamed: 0,column_1,column_3
1,0.090638,0.53994
7,0.696478,0.383975
10,0.949792,0.008848


In [32]:
df1.iloc[[0,6,9],[0,2]]

Unnamed: 0,column_1,column_3
1,0.090638,0.53994
7,0.696478,0.383975
10,0.949792,0.008848


In [33]:
df1.loc[(df1.index > 3) & (df1.index <= 7), df1.columns != 'column_3']

Unnamed: 0,column_1,column_2,column_4,column_5,column_6
4,0.643747,0.438706,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.992785,0.200313,0.494
7,0.696478,0.98461,0.664781,0.069091,0.077466


In [34]:
df1.shape

(10, 6)

In [35]:
df1.ndim

2

In [36]:
df1.size

60

In [37]:
df1.values

array([[0.09063792, 0.1654076 , 0.5399402 , 0.92403697, 0.00416718,
        0.86334501],
       [0.08541754, 0.18464366, 0.15334953, 0.12415208, 0.56713473,
        0.33070768],
       [0.70955137, 0.88405253, 0.31704615, 0.22289722, 0.34469882,
        0.99767338],
       [0.64374726, 0.43870604, 0.06722401, 0.43518456, 0.70336004,
        0.56975569],
       [0.56449627, 0.21225208, 0.88023891, 0.56725608, 0.73709372,
        0.8298213 ],
       [0.91148982, 0.98742224, 0.76454642, 0.99278533, 0.20031286,
        0.49400019],
       [0.69647757, 0.98461031, 0.38397471, 0.66478144, 0.06909106,
        0.07746632],
       [0.87777943, 0.66311598, 0.87251321, 0.26179558, 0.3809201 ,
        0.36085787],
       [0.74244651, 0.33526865, 0.49599132, 0.43277401, 0.88098925,
        0.02199001],
       [0.94979178, 0.8010462 , 0.00884765, 0.6564167 , 0.68387257,
        0.95808875]])

## DataFrame Methods

We use the DataFrame df1 defined above.

### Exploration Methods

In [38]:
df1.head(3)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.090638,0.165408,0.53994,0.924037,0.004167,0.863345
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673


In [39]:
df1.tail(2)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


In [40]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 1 to 10
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   column_1  10 non-null     float64
 1   column_2  10 non-null     float64
 2   column_3  10 non-null     float64
 3   column_4  10 non-null     float64
 4   column_5  10 non-null     float64
 5   column_6  10 non-null     float64
dtypes: float64(6)
memory usage: 612.0 bytes


In [41]:
df1.nunique()

column_1    10
column_2    10
column_3    10
column_4    10
column_5    10
column_6    10
dtype: int64

In [42]:
df1.nunique(axis=1)

1     6
2     6
3     6
4     6
5     6
6     6
7     6
8     6
9     6
10    6
dtype: int64

In [43]:
df1.isna()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False
10,False,False,False,False,False,False


In [44]:
df1.count()

column_1    10
column_2    10
column_3    10
column_4    10
column_5    10
column_6    10
dtype: int64

In [45]:
df1.value_counts()

column_1  column_2  column_3  column_4  column_5  column_6
0.085418  0.184644  0.153350  0.124152  0.567135  0.330708    1
0.090638  0.165408  0.539940  0.924037  0.004167  0.863345    1
0.564496  0.212252  0.880239  0.567256  0.737094  0.829821    1
0.643747  0.438706  0.067224  0.435185  0.703360  0.569756    1
0.696478  0.984610  0.383975  0.664781  0.069091  0.077466    1
0.709551  0.884053  0.317046  0.222897  0.344699  0.997673    1
0.742447  0.335269  0.495991  0.432774  0.880989  0.021990    1
0.877779  0.663116  0.872513  0.261796  0.380920  0.360858    1
0.911490  0.987422  0.764546  0.992785  0.200313  0.494000    1
0.949792  0.801046  0.008848  0.656417  0.683873  0.958089    1
Name: count, dtype: int64

In [46]:
df1.value_counts(normalize=True)

column_1  column_2  column_3  column_4  column_5  column_6
0.085418  0.184644  0.153350  0.124152  0.567135  0.330708    0.1
0.090638  0.165408  0.539940  0.924037  0.004167  0.863345    0.1
0.564496  0.212252  0.880239  0.567256  0.737094  0.829821    0.1
0.643747  0.438706  0.067224  0.435185  0.703360  0.569756    0.1
0.696478  0.984610  0.383975  0.664781  0.069091  0.077466    0.1
0.709551  0.884053  0.317046  0.222897  0.344699  0.997673    0.1
0.742447  0.335269  0.495991  0.432774  0.880989  0.021990    0.1
0.877779  0.663116  0.872513  0.261796  0.380920  0.360858    0.1
0.911490  0.987422  0.764546  0.992785  0.200313  0.494000    0.1
0.949792  0.801046  0.008848  0.656417  0.683873  0.958089    0.1
Name: proportion, dtype: float64

### Statistical Analysis (Basic) and Operations

In [47]:
df1.describe()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.627184,0.565653,0.448367,0.528208,0.457164,0.550371
std,0.308792,0.33651,0.320401,0.289509,0.302275,0.354793
min,0.085418,0.165408,0.008848,0.124152,0.004167,0.02199
25%,0.584309,0.243006,0.194274,0.30454,0.236409,0.338245
50%,0.703014,0.550911,0.439983,0.50122,0.474027,0.531878
75%,0.843946,0.863301,0.708395,0.66269,0.698488,0.854964
max,0.949792,0.987422,0.880239,0.992785,0.880989,0.997673


In [48]:
df1.describe(percentiles=[0.13, 0.87, 0.99])

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.627184,0.565653,0.448367,0.528208,0.457164,0.550371
std,0.308792,0.33651,0.320401,0.289509,0.302275,0.354793
min,0.085418,0.165408,0.008848,0.124152,0.004167,0.02199
13%,0.171194,0.189337,0.081865,0.22951,0.091399,0.120517
50%,0.703014,0.550911,0.439983,0.50122,0.474027,0.531878
87%,0.905759,0.967515,0.854159,0.879964,0.731359,0.941982
99%,0.946345,0.987169,0.879544,0.986598,0.868039,0.994111
max,0.949792,0.987422,0.880239,0.992785,0.880989,0.997673


In [49]:
df1.mean()

column_1    0.627184
column_2    0.565653
column_3    0.448367
column_4    0.528208
column_5    0.457164
column_6    0.550371
dtype: float64

In [50]:
df1.mean(axis=1)

1     0.431256
2     0.240901
3     0.579320
4     0.476330
5     0.631860
6     0.725093
7     0.479400
8     0.569497
9     0.484910
10    0.676344
dtype: float64

In [51]:
df1.corr()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
column_1,1.0,0.738017,0.136911,0.102578,0.194877,-0.055708
column_2,0.738017,1.0,-0.038145,0.200367,-0.381451,-0.021441
column_3,0.136911,-0.038145,1.0,0.254023,-0.229587,-0.121294
column_4,0.102578,0.200367,0.254023,1.0,-0.463146,0.159659
column_5,0.194877,-0.381451,-0.229587,-0.463146,1.0,-0.054184
column_6,-0.055708,-0.021441,-0.121294,0.159659,-0.054184,1.0


In [52]:
df1.cov()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
column_1,0.095352,0.076688,0.013546,0.00917,0.01819,-0.006103
column_2,0.076688,0.113239,-0.004113,0.01952,-0.038801,-0.00256
column_3,0.013546,-0.004113,0.102657,0.023563,-0.022235,-0.013788
column_4,0.00917,0.01952,0.023563,0.083816,-0.040531,0.016399
column_5,0.01819,-0.038801,-0.022235,-0.040531,0.09137,-0.005811
column_6,-0.006103,-0.00256,-0.013788,0.016399,-0.005811,0.125878


In [53]:
df1.sample(3, random_state=10)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494


### Attributes, Operation Methods, and Transformation Methods

In [54]:
df1_copy = df1.copy()
df1_fakecopy = df1

In [55]:
df1_fakecopy.at[1, 'column_1'] = 10

In [56]:
df1_copy.at[1, 'column_1'] = np.nan

In [57]:
df1_copy

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,,0.165408,0.53994,0.924037,0.004167,0.863345
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


In [58]:
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.165408,0.53994,0.924037,0.004167,0.863345
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


**N.B.:** modification applied to df1_fakecopy modified df1, too!

In [59]:
pd.concat([df1, df1_copy])

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.165408,0.53994,0.924037,0.004167,0.863345
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


In [60]:
df1.drop([1, 3], axis=0)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


In [61]:
df1_copy.dropna()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


In [62]:
df1_copy.fillna(1000)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,1000.0,0.165408,0.53994,0.924037,0.004167,0.863345
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


In [63]:
df1_copy.rename(index={1:'nuovo_index'}, columns={'column_1':'COLONNA_1'})

Unnamed: 0,COLONNA_1,column_2,column_3,column_4,column_5,column_6
nuovo_index,,0.165408,0.53994,0.924037,0.004167,0.863345
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


In [64]:
df1_copy.reset_index()

Unnamed: 0,index,column_1,column_2,column_3,column_4,column_5,column_6
0,1,,0.165408,0.53994,0.924037,0.004167,0.863345
1,2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
2,3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
3,4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
4,5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
5,6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
6,7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
7,8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
8,9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
9,10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


In [65]:
df1_copy.sort_values('column_1')

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089
1,,0.165408,0.53994,0.924037,0.004167,0.863345


(Notare il NaN messo in fondo; per maggiori informazioni, guardare la documentazione ufficiale)

### Exportation Methods

In [66]:
df1.to_csv('df1.csv', columns=['column_1', 'column_5', 'column_2'], index_label='ID')

In [67]:
df1.to_pickle('df1.pkl')

## Loading a DataFrame

In [68]:
pd.read_csv('df1.csv')

Unnamed: 0,ID,column_1,column_5,column_2
0,1,10.0,0.004167,0.165408
1,2,0.085418,0.567135,0.184644
2,3,0.709551,0.344699,0.884053
3,4,0.643747,0.70336,0.438706
4,5,0.564496,0.737094,0.212252
5,6,0.91149,0.200313,0.987422
6,7,0.696478,0.069091,0.98461
7,8,0.877779,0.38092,0.663116
8,9,0.742447,0.880989,0.335269
9,10,0.949792,0.683873,0.801046


In [69]:
pd.read_csv('df1.csv', usecols=['ID', 'column_1', 'column_2'], index_col='ID')

Unnamed: 0_level_0,column_1,column_2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10.0,0.165408
2,0.085418,0.184644
3,0.709551,0.884053
4,0.643747,0.438706
5,0.564496,0.212252
6,0.91149,0.987422
7,0.696478,0.98461
8,0.877779,0.663116
9,0.742447,0.335269
10,0.949792,0.801046


In [70]:
pd.read_pickle('df1.pkl')

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.165408,0.53994,0.924037,0.004167,0.863345
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089


## Concatenation of DataFrames

In [71]:
pd.concat([df1, df1.reset_index()], axis=1)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6,index,column_1.1,column_2.1,column_3.1,column_4.1,column_5.1,column_6.1
1,10.0,0.165408,0.53994,0.924037,0.004167,0.863345,2.0,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708
2,0.085418,0.184644,0.15335,0.124152,0.567135,0.330708,3.0,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673
3,0.709551,0.884053,0.317046,0.222897,0.344699,0.997673,4.0,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756
4,0.643747,0.438706,0.067224,0.435185,0.70336,0.569756,5.0,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821
5,0.564496,0.212252,0.880239,0.567256,0.737094,0.829821,6.0,0.91149,0.987422,0.764546,0.992785,0.200313,0.494
6,0.91149,0.987422,0.764546,0.992785,0.200313,0.494,7.0,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466
7,0.696478,0.98461,0.383975,0.664781,0.069091,0.077466,8.0,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858
8,0.877779,0.663116,0.872513,0.261796,0.38092,0.360858,9.0,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199
9,0.742447,0.335269,0.495991,0.432774,0.880989,0.02199,10.0,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089
10,0.949792,0.801046,0.008848,0.656417,0.683873,0.958089,,,,,,,
