## Statistics using Pandas

In [9]:
import pandas as pd

cities = ['Mumbai', 'chennai', 'Pune', 'Ahmedabad', 'Kolkata', 'Kanpur', 'Delhi']
city_df = pd.DataFrame(cities)

In [11]:
city_df.columns = ['City_Name']

In [15]:
city_df

Unnamed: 0,City_Name
0,Mumbai
1,chennai
2,Pune
3,Ahmedabad
4,Kolkata
5,Kanpur
6,Delhi


In [17]:
condition_met = city_df.City_Name == 'Mumbai'

In [19]:
type(condition_met)

pandas.core.series.Series

In [21]:
city_df[condition_met]

Unnamed: 0,City_Name
0,Mumbai


In [23]:
condition_met

0     True
1    False
2    False
3    False
4    False
5    False
6    False
Name: City_Name, dtype: bool

In [25]:
city_df[city_df.City_Name == 'Pune']

Unnamed: 0,City_Name
2,Pune


## Aggregation and Grouping

In [28]:
import numpy as np

In [32]:
random_state = np.random.RandomState(42)
random_series = pd.Series(random_state.rand(10))

In [34]:
random_series

0    0.374540
1    0.950714
2    0.731994
3    0.598658
4    0.156019
5    0.155995
6    0.058084
7    0.866176
8    0.601115
9    0.708073
dtype: float64

In [36]:
random_series.mean()

0.5201367359526748

In [38]:
random_series.std()

0.3158656227180549

In [40]:
random_series.sum()

5.201367359526748

In [46]:
df = pd.DataFrame({'A': random_state.rand(5),
                'B': random_state.rand(5)})  
df

Unnamed: 0,A,B
0,0.607545,0.808397
1,0.170524,0.304614
2,0.065052,0.097672
3,0.948886,0.684233
4,0.965632,0.440152


### Column-wise

In [48]:
df.sum()

A    2.757638
B    2.335069
dtype: float64

In [50]:
df.mean()

A    0.551528
B    0.467014
dtype: float64

### Row-wise

In [57]:
df.sum(axis=0)

A    2.757638
B    2.335069
dtype: float64

In [54]:
df.mean(axis=1)

0    0.707971
1    0.237569
2    0.081362
3    0.816559
4    0.702892
dtype: float64

## Groupby

Three stages
* Split - we split dataframe into multiple smaller dataframe based on the values of keys.
* Apply - We apply desired aggregation/transformation on each dataframe.
* Combine - We combine results from apply state into a dataframe.

In [63]:
df = pd.DataFrame({'key' : ['A', 'B', 'C']*2, # list("ABCABC")
                   'data': range(6)})

In [65]:
df

Unnamed: 0,key,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [71]:
df.groupby("key")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x106606bd0>

In [73]:
df.groupby("key").sum()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,3
B,5
C,7


In [75]:
df.groupby("key").mean()

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
A,1.5
B,2.5
C,3.5


In [77]:
import pandas as pd
pd.__version__

'2.2.2'

In [79]:
from sklearn.datasets import load_diabetes

diabetes = load_diabetes(as_frame=True)
print(type(diabetes['data']))

<class 'pandas.core.frame.DataFrame'>


In [81]:
df = diabetes['data']

In [132]:
print(diabetes.DESCR)

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

:Number of Instances: 442

:Number of Attributes: First 10 columns are numeric predictive values

:Target: Column 11 is a quantitative measure of disease progression one year after baseline

:Attribute Information:
    - age     age in years
    - sex
    - bmi     body mass index
    - bp      average blood pressure
    - s1      tc, total serum cholesterol
    - s2      ldl, low-density lipoproteins
    - s3      hdl, high-density lipoproteins
    - s4      tch, total cholesterol / HDL
    - s5      ltg, possibly log of serum triglycerides level
    - s6      glu, blood sugar level

Note: Each of these 10 feature variables have bee

## Creating pandas DataFrame from Series

In [98]:
cities = pd.Series(['Mumbai', 'Bangalore', 'Chennai', 'Delhi'])
population = pd.Series([17000000, 13000000, 6000000])

In [100]:
city_info_df = pd.DataFrame({'city': cities, 'Population': population})

In [102]:
type(city_info_df)

pandas.core.frame.DataFrame

In [104]:
city_info_df

Unnamed: 0,city,Population
0,Mumbai,17000000.0
1,Bangalore,13000000.0
2,Chennai,6000000.0
3,Delhi,


# Exploring Data in DataFrame

Quickly Examine few entries in the dataframe first five and last five

In [124]:
df.shape

(442, 10)

In [122]:
df.head(n=10)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.041176,-0.096346
6,-0.045472,0.05068,-0.047163,-0.015999,-0.040096,-0.0248,0.000779,-0.039493,-0.062917,-0.038357
7,0.063504,0.05068,-0.001895,0.066629,0.09062,0.108914,0.022869,0.017703,-0.035816,0.003064
8,0.041708,0.05068,0.061696,-0.040099,-0.013953,0.006202,-0.028674,-0.002592,-0.01496,0.011349
9,-0.0709,-0.044642,0.039062,-0.033213,-0.012577,-0.034508,-0.024993,-0.002592,0.067737,-0.013504


In [118]:
df.tail(n=3)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
439,0.041708,0.05068,-0.015906,0.017293,-0.037344,-0.01384,-0.024993,-0.01108,-0.046883,0.015491
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.02656,0.044529,-0.02593
441,-0.045472,-0.044642,-0.07303,-0.081413,0.08374,0.027809,0.173816,-0.039493,-0.004222,0.003064


In [114]:
? df.head

[0;31mSignature:[0m  [0mdf[0m[0;34m.[0m[0mhead[0m[0;34m([0m[0mn[0m[0;34m:[0m [0;34m'int'[0m [0;34m=[0m [0;36m5[0m[0;34m)[0m [0;34m->[0m [0;34m'Self'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the first `n` rows.

This function returns the first `n` rows for the object based
on position. It is useful for quickly testing if your object
has the right type of data in it.

For negative values of `n`, this function returns all rows except
the last `|n|` rows, equivalent to ``df[:n]``.

If n is larger than the number of rows, this function returns all rows.

Parameters
----------
n : int, default 5
    Number of rows to select.

Returns
-------
same type as caller
    The first `n` rows of the caller object.

See Also
--------
DataFrame.tail: Returns the last `n` rows.

Examples
--------
>>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
...                    'monkey', 'parrot', 'shark', 'whale', 'zebra']})
>>> df
      animal
0  

In [134]:
df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

In [128]:
list(df.columns)

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [136]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB


In [140]:
df.describe()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672
25%,-0.03729927,-0.04464164,-0.03422907,-0.03665608,-0.03424784,-0.0303584,-0.03511716,-0.03949338,-0.03324559,-0.03317903
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698
75%,0.03807591,0.05068012,0.03124802,0.03564379,0.02835801,0.02984439,0.0293115,0.03430886,0.03243232,0.02791705
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118


In [142]:
df.describe(percentiles=[0.2, 0.6, 0.8])

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,-2.511817e-19,1.23079e-17,-2.245564e-16,-4.79757e-17,-1.3814990000000001e-17,3.9184340000000004e-17,-5.777179e-18,-9.04254e-18,9.293722000000001e-17,1.130318e-17
std,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905,0.04761905
min,-0.1072256,-0.04464164,-0.0902753,-0.1123988,-0.1267807,-0.1156131,-0.1023071,-0.0763945,-0.1260971,-0.1377672
20%,-0.04547248,-0.04464164,-0.04048038,-0.04009893,-0.03871969,-0.03695017,-0.03971921,-0.03949338,-0.04117617,-0.03835666
50%,0.00538306,-0.04464164,-0.007283766,-0.005670422,-0.004320866,-0.003819065,-0.006584468,-0.002592262,-0.001947171,-0.001077698
60%,0.01628068,0.05068012,0.005218854,0.008100982,0.00806271,0.008706873,0.008142084,-0.002592262,0.01255119,0.007206516
80%,0.04170844,0.05068012,0.04229559,0.04941519,0.03943444,0.03952068,0.03759519,0.03430886,0.03885335,0.03620126
max,0.1107267,0.05068012,0.1705552,0.1320436,0.1539137,0.198788,0.1811791,0.1852344,0.1335973,0.1356118


In [146]:
df.describe().T  # Transpose

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,442.0,-2.511817e-19,0.047619,-0.107226,-0.037299,0.005383,0.038076,0.110727
sex,442.0,1.23079e-17,0.047619,-0.044642,-0.044642,-0.044642,0.05068,0.05068
bmi,442.0,-2.245564e-16,0.047619,-0.090275,-0.034229,-0.007284,0.031248,0.170555
bp,442.0,-4.79757e-17,0.047619,-0.112399,-0.036656,-0.00567,0.035644,0.132044
s1,442.0,-1.3814990000000001e-17,0.047619,-0.126781,-0.034248,-0.004321,0.028358,0.153914
s2,442.0,3.9184340000000004e-17,0.047619,-0.115613,-0.030358,-0.003819,0.029844,0.198788
s3,442.0,-5.777179e-18,0.047619,-0.102307,-0.035117,-0.006584,0.029312,0.181179
s4,442.0,-9.04254e-18,0.047619,-0.076395,-0.039493,-0.002592,0.034309,0.185234
s5,442.0,9.293722000000001e-17,0.047619,-0.126097,-0.033246,-0.001947,0.032432,0.133597
s6,442.0,1.130318e-17,0.047619,-0.137767,-0.033179,-0.001078,0.027917,0.135612


## Selection

In [149]:
df.columns

Index(['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6'], dtype='object')

In [151]:
df['age']

0      0.038076
1     -0.001882
2      0.085299
3     -0.089063
4      0.005383
         ...   
437    0.041708
438   -0.005515
439    0.041708
440   -0.045472
441   -0.045472
Name: age, Length: 442, dtype: float64

In [153]:
type(df['age'])

pandas.core.series.Series

In [155]:
df['age'][0]

0.038075906433423026

In [157]:
df['age'][:5]

0    0.038076
1   -0.001882
2    0.085299
3   -0.089063
4    0.005383
Name: age, dtype: float64

In [159]:
df['age'][-5:]

437    0.041708
438   -0.005515
439    0.041708
440   -0.045472
441   -0.045472
Name: age, dtype: float64

In [161]:
df['age'][100:200]

100    0.016281
101    0.016281
102   -0.092695
103    0.059871
104   -0.027310
         ...   
195    0.027178
196   -0.023677
197    0.048974
198   -0.052738
199    0.041708
Name: age, Length: 100, dtype: float64

In [163]:
df[['age', 'sex']]

Unnamed: 0,age,sex
0,0.038076,0.050680
1,-0.001882,-0.044642
2,0.085299,0.050680
3,-0.089063,-0.044642
4,0.005383,-0.044642
...,...,...
437,0.041708,0.050680
438,-0.005515,0.050680
439,0.041708,0.050680
440,-0.045472,-0.044642


In [165]:
df[['age', 'sex']][:5]

Unnamed: 0,age,sex
0,0.038076,0.05068
1,-0.001882,-0.044642
2,0.085299,0.05068
3,-0.089063,-0.044642
4,0.005383,-0.044642


In [167]:
df[['age', 'sex']][-5:]

Unnamed: 0,age,sex
437,0.041708,0.05068
438,-0.005515,0.05068
439,0.041708,0.05068
440,-0.045472,-0.044642
441,-0.045472,-0.044642


In [171]:
df[['age', 'sex']][439:441]

Unnamed: 0,age,sex
439,0.041708,0.05068
440,-0.045472,-0.044642


* .loc
* .iloc

In [174]:
df.iloc[0]

age    0.038076
sex    0.050680
bmi    0.061696
bp     0.021872
s1    -0.044223
s2    -0.034821
s3    -0.043401
s4    -0.002592
s5     0.019907
s6    -0.017646
Name: 0, dtype: float64

In [180]:
df.loc[0]

age    0.038076
sex    0.050680
bmi    0.061696
bp     0.021872
s1    -0.044223
s2    -0.034821
s3    -0.043401
s4    -0.002592
s5     0.019907
s6    -0.017646
Name: 0, dtype: float64

In [178]:
df.head(n=1)

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646


In [182]:
df.loc[4, 'age']

0.005383060374248237

In [190]:
df.loc[4, ['age', 'sex']]

age    0.005383
sex   -0.044642
Name: 4, dtype: float64

In [186]:
df.iloc[4, 0]

0.005383060374248237

In [192]:
df.iloc[4, [0, 1]]

age    0.005383
sex   -0.044642
Name: 4, dtype: float64

## Conditional slicing

In [203]:
df[df.age > 5.383060e-03]

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
0,0.038076,0.050680,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646
2,0.085299,0.050680,0.044451,-0.005670,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.025930
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641
7,0.063504,0.050680,-0.001895,0.066629,0.090620,0.108914,0.022869,0.017703,-0.035816,0.003064
8,0.041708,0.050680,0.061696,-0.040099,-0.013953,0.006202,-0.028674,-0.002592,-0.014960,0.011349
...,...,...,...,...,...,...,...,...,...,...
431,0.070769,0.050680,-0.030996,0.021872,-0.037344,-0.047034,0.033914,-0.039493,-0.014960,-0.001078
432,0.009016,-0.044642,0.055229,-0.005670,0.057597,0.044719,-0.002903,0.023239,0.055686,0.106617
434,0.016281,-0.044642,0.001339,0.008101,0.005311,0.010899,0.030232,-0.039493,-0.045424,0.032059
437,0.041708,0.050680,0.019662,0.059744,-0.005697,-0.002566,-0.028674,-0.002592,0.031193,0.007207


In [197]:
age_df_temp = df.loc[df.age < 5.383060e-03]

In [199]:
age_df_temp

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362
5,-0.092695,-0.044642,-0.040696,-0.019442,-0.068991,-0.079288,0.041277,-0.076395,-0.041176,-0.096346
6,-0.045472,0.050680,-0.047163,-0.015999,-0.040096,-0.024800,0.000779,-0.039493,-0.062917,-0.038357
9,-0.070900,-0.044642,0.039062,-0.033213,-0.012577,-0.034508,-0.024993,-0.002592,0.067737,-0.013504
...,...,...,...,...,...,...,...,...,...,...
435,-0.012780,-0.044642,-0.023451,-0.040099,-0.016704,0.004636,-0.017629,-0.002592,-0.038460,-0.038357
436,-0.056370,-0.044642,-0.074108,-0.050427,-0.024960,-0.047034,0.092820,-0.076395,-0.061176,-0.046641
438,-0.005515,0.050680,-0.015906,-0.067642,0.049341,0.079165,-0.028674,0.034309,-0.018114,0.044485
440,-0.045472,-0.044642,0.039062,0.001215,0.016318,0.015283,-0.028674,0.026560,0.044529,-0.025930


In [206]:
list("abcdefghi")

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i']

In [226]:
range(100)

range(0, 100)

In [210]:
import numpy as np

In [234]:
another_df = pd.DataFrame(
    np.random.rand(100, 4),
    index=range(10, 110),
    columns=list("ABCD")
)

In [236]:
another_df.head()

Unnamed: 0,A,B,C,D
10,0.75172,0.560219,0.69577,0.603216
11,0.848941,0.2499,0.476168,0.341106
12,0.602272,0.168617,0.758899,0.829194
13,0.378185,0.445112,0.846345,0.15715
14,0.775085,0.130415,0.092947,0.531626


In [224]:
df = pd.DataFrame(np.random.rand(9, 4), index=list("abcdefghi"), columns=list("ABCD"))

In [212]:
df.shape

(9, 4)

In [218]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [220]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, a to i
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       9 non-null      float64
 1   B       9 non-null      float64
 2   C       9 non-null      float64
 3   D       9 non-null      float64
dtypes: float64(4)
memory usage: 360.0+ bytes


In [222]:
df.head()

Unnamed: 0,A,B,C,D
a,0.046941,0.02535,0.301156,0.71749
b,0.61301,0.16467,0.083582,0.556034
c,0.030343,0.596002,0.849651,0.650812
d,0.931873,0.270511,0.988468,0.776774
e,0.742937,0.775569,0.293419,0.471908


In [238]:
df.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'], dtype='object')

In [246]:
df.loc['a']

A    0.517852
B    0.396550
C    0.626745
D    0.259100
Name: a, dtype: float64

In [248]:
df.loc['a', 'A':'D']

A    0.517852
B    0.396550
C    0.626745
D    0.259100
Name: a, dtype: float64

In [250]:
df.loc['a':'d', :]

Unnamed: 0,A,B,C,D
a,0.517852,0.39655,0.626745,0.2591
b,0.065031,0.875702,0.095496,0.370638
c,0.482769,0.3426,0.708207,0.498087
d,0.977853,0.913341,0.249029,0.615411


In [254]:
df.iloc[0:4, :]

Unnamed: 0,A,B,C,D
a,0.517852,0.39655,0.626745,0.2591
b,0.065031,0.875702,0.095496,0.370638
c,0.482769,0.3426,0.708207,0.498087
d,0.977853,0.913341,0.249029,0.615411


In [266]:
df.iloc[:4, 0:3]

Unnamed: 0,A,B,C
a,0.517852,0.39655,0.626745
b,0.065031,0.875702,0.095496
c,0.482769,0.3426,0.708207
d,0.977853,0.913341,0.249029


In [287]:
df.iloc[0:4, 1:3]

Unnamed: 0,B,C
a,0.39655,0.626745
b,0.875702,0.095496
c,0.3426,0.708207
d,0.913341,0.249029


In [297]:
selector = lambda df: df['A'] > 0

In [293]:
selector

<function __main__.<lambda>(df)>

In [295]:
df.loc[selector]

Unnamed: 0,A,B,C,D
a,0.517852,0.39655,0.626745,0.2591
b,0.065031,0.875702,0.095496,0.370638
c,0.482769,0.3426,0.708207,0.498087
d,0.977853,0.913341,0.249029,0.615411
e,0.162006,0.232901,0.033149,0.805948
f,0.188795,0.597944,0.200797,0.156956
g,0.591136,0.227077,0.118011,0.207771
h,0.260124,0.697319,0.427716,0.581015
i,0.511771,0.586435,0.3332,0.212575


In [303]:
selector = lambda df: df['A'] > 0.5

In [301]:
df.loc[selector]

Unnamed: 0,A,B,C,D
a,0.517852,0.39655,0.626745,0.2591
d,0.977853,0.913341,0.249029,0.615411
g,0.591136,0.227077,0.118011,0.207771
i,0.511771,0.586435,0.3332,0.212575


In [333]:
selector1 = lambda df: (df['A'] > 0.5) & (df['B'] < 0.25)

In [335]:
df.loc[selector1]

Unnamed: 0,A,B,C,D
g,0.591136,0.227077,0.118011,0.207771


In [341]:
condition_for_selection = (df['A'] > 0.5)&(df['B'] < 0.25)

In [343]:
condition_for_selection

a    False
b    False
c    False
d    False
e    False
f    False
g     True
h    False
i    False
dtype: bool

In [345]:
df[condition_for_selection]

Unnamed: 0,A,B,C,D
g,0.591136,0.227077,0.118011,0.207771


#### OR operator : | NOT operator : ~

## Adding a column in the DataFrame 

In [354]:
df['E'] = df['A']*100

In [358]:
df['F'] = df['A'] + df['C']

In [360]:
df

Unnamed: 0,A,B,C,D,E,F
a,0.517852,0.39655,0.626745,0.2591,51.785189,1.144596
b,0.065031,0.875702,0.095496,0.370638,6.503082,0.160527
c,0.482769,0.3426,0.708207,0.498087,48.276896,1.190976
d,0.977853,0.913341,0.249029,0.615411,97.785294,1.226882
e,0.162006,0.232901,0.033149,0.805948,16.200577,0.195155
f,0.188795,0.597944,0.200797,0.156956,18.879474,0.389591
g,0.591136,0.227077,0.118011,0.207771,59.113624,0.709147
h,0.260124,0.697319,0.427716,0.581015,26.012425,0.68784
i,0.511771,0.586435,0.3332,0.212575,51.177087,0.844971


In [362]:
criteria = df['A'] < 0.2

In [364]:
criteria

a    False
b     True
c    False
d    False
e     True
f     True
g    False
h    False
i    False
Name: A, dtype: bool

In [388]:
df.loc[criteria, 'A'] = 0

In [390]:
cities = ['Mumbai', 'Delhi', 'Chennai', 
          'kolkata','Bengalure', 'Hyderabad', 
          'Pune','Ahmedabad', 'Indore'] 

In [406]:
df['City'] = cities

In [411]:
df

Unnamed: 0,A,B,C,D,E,F,City
a,0.517852,0.39655,0.626745,0.2591,51.785189,1.144596,Mumbai
b,0.0,0.875702,0.095496,0.370638,6.503082,0.160527,Delhi
c,0.482769,0.3426,0.708207,0.498087,48.276896,1.190976,Chennai
d,0.977853,0.913341,0.249029,0.615411,97.785294,1.226882,kolkata
e,0.0,0.232901,0.033149,0.805948,16.200577,0.195155,Bengalure
f,0.0,0.597944,0.200797,0.156956,18.879474,0.389591,Hyderabad
g,0.591136,0.227077,0.118011,0.207771,59.113624,0.709147,Pune
h,0.260124,0.697319,0.427716,0.581015,26.012425,0.68784,Ahmedabad
i,0.511771,0.586435,0.3332,0.212575,51.177087,0.844971,Indore


In [415]:
df_copy = df

In [439]:
# Membership checking
criteria = df_copy['City'].isin(['Pune', 'Bengalure', 'Hyderabad'])

In [425]:
criteria

a    False
b    False
c    False
d    False
e     True
f     True
g     True
h    False
i    False
Name: City, dtype: bool

In [435]:
df_copy.loc[df.City == 'Bengalure', 'City'] = 'Bengaluru'

In [437]:
df_copy

Unnamed: 0,A,B,C,D,E,F,City
a,0.517852,0.39655,0.626745,0.2591,51.785189,1.144596,Mumbai
b,0.0,0.875702,0.095496,0.370638,6.503082,0.160527,Delhi
c,0.482769,0.3426,0.708207,0.498087,48.276896,1.190976,Chennai
d,0.977853,0.913341,0.249029,0.615411,97.785294,1.226882,kolkata
e,0.0,0.232901,0.033149,0.805948,16.200577,0.195155,Bengaluru
f,0.0,0.597944,0.200797,0.156956,18.879474,0.389591,Hyderabad
g,0.591136,0.227077,0.118011,0.207771,59.113624,0.709147,Pune
h,0.260124,0.697319,0.427716,0.581015,26.012425,0.68784,Ahmedabad
i,0.511771,0.586435,0.3332,0.212575,51.177087,0.844971,Indore


In [441]:
?df_copy.drop

[0;31mSignature:[0m
[0mdf_copy[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mlabels[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m:[0m [0;34m'Axis'[0m [0;34m=[0m [0;36m0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcolumns[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlevel[0m[0;34m:[0m [0;34m'Level | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minplace[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0merrors[0m[0;34m:[0m [0;34m'IgnoreRaise'[0m [0;34m=[0m [0;34m'raise'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0

In [443]:
df_copy.drop(['E'], axis=1)

Unnamed: 0,A,B,C,D,F,City
a,0.517852,0.39655,0.626745,0.2591,1.144596,Mumbai
b,0.0,0.875702,0.095496,0.370638,0.160527,Delhi
c,0.482769,0.3426,0.708207,0.498087,1.190976,Chennai
d,0.977853,0.913341,0.249029,0.615411,1.226882,kolkata
e,0.0,0.232901,0.033149,0.805948,0.195155,Bengaluru
f,0.0,0.597944,0.200797,0.156956,0.389591,Hyderabad
g,0.591136,0.227077,0.118011,0.207771,0.709147,Pune
h,0.260124,0.697319,0.427716,0.581015,0.68784,Ahmedabad
i,0.511771,0.586435,0.3332,0.212575,0.844971,Indore


In [445]:
df_copy.drop(columns = ['D'])

Unnamed: 0,A,B,C,E,F,City
a,0.517852,0.39655,0.626745,51.785189,1.144596,Mumbai
b,0.0,0.875702,0.095496,6.503082,0.160527,Delhi
c,0.482769,0.3426,0.708207,48.276896,1.190976,Chennai
d,0.977853,0.913341,0.249029,97.785294,1.226882,kolkata
e,0.0,0.232901,0.033149,16.200577,0.195155,Bengaluru
f,0.0,0.597944,0.200797,18.879474,0.389591,Hyderabad
g,0.591136,0.227077,0.118011,59.113624,0.709147,Pune
h,0.260124,0.697319,0.427716,26.012425,0.68784,Ahmedabad
i,0.511771,0.586435,0.3332,51.177087,0.844971,Indore


In [453]:
df_copy.loc[:,'City'] = 'Chennai'

In [455]:
df_copy

Unnamed: 0,A,B,C,D,E,F,City
a,0.517852,0.39655,0.626745,0.2591,51.785189,1.144596,Chennai
b,0.0,0.875702,0.095496,0.370638,6.503082,0.160527,Chennai
c,0.482769,0.3426,0.708207,0.498087,48.276896,1.190976,Chennai
d,0.977853,0.913341,0.249029,0.615411,97.785294,1.226882,Chennai
e,0.0,0.232901,0.033149,0.805948,16.200577,0.195155,Chennai
f,0.0,0.597944,0.200797,0.156956,18.879474,0.389591,Chennai
g,0.591136,0.227077,0.118011,0.207771,59.113624,0.709147,Chennai
h,0.260124,0.697319,0.427716,0.581015,26.012425,0.68784,Chennai
i,0.511771,0.586435,0.3332,0.212575,51.177087,0.844971,Chennai
City,Chennai,Chennai,Chennai,Chennai,Chennai,Chennai,Chennai


In [475]:
df_copy.drop(['City'], axis=0)

Unnamed: 0,A,B,C,D,E,F,City
a,0.517852,0.39655,0.626745,0.2591,51.785189,1.144596,Chennai
b,0.0,0.875702,0.095496,0.370638,6.503082,0.160527,Chennai
c,0.482769,0.3426,0.708207,0.498087,48.276896,1.190976,Chennai
d,0.977853,0.913341,0.249029,0.615411,97.785294,1.226882,Chennai
e,0.0,0.232901,0.033149,0.805948,16.200577,0.195155,Chennai
f,0.0,0.597944,0.200797,0.156956,18.879474,0.389591,Chennai
g,0.591136,0.227077,0.118011,0.207771,59.113624,0.709147,Chennai
h,0.260124,0.697319,0.427716,0.581015,26.012425,0.68784,Chennai
i,0.511771,0.586435,0.3332,0.212575,51.177087,0.844971,Chennai


In [477]:
?df_copy.sample

[0;31mSignature:[0m
[0mdf_copy[0m[0;34m.[0m[0msample[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn[0m[0;34m:[0m [0;34m'int | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfrac[0m[0;34m:[0m [0;34m'float | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mreplace[0m[0;34m:[0m [0;34m'bool_t'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweights[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m:[0m [0;34m'RandomState | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maxis[0m[0;34m:[0m [0;34m'Axis | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_index[0m[0;34m:[0m [0;34m'bool_t'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'Self'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return a ra

In [463]:
df_copy.sample(3)

Unnamed: 0,A,B,C,D,E,F,City
c,0.482769,0.3426,0.708207,0.498087,48.276896,1.190976,Chennai
i,0.511771,0.586435,0.3332,0.212575,51.177087,0.844971,Chennai
f,0.0,0.597944,0.200797,0.156956,18.879474,0.389591,Chennai


In [465]:
df_copy.sample(3)

Unnamed: 0,A,B,C,D,E,F,City
e,0.0,0.232901,0.033149,0.805948,16.200577,0.195155,Chennai
g,0.591136,0.227077,0.118011,0.207771,59.113624,0.709147,Chennai
i,0.511771,0.586435,0.3332,0.212575,51.177087,0.844971,Chennai


In [467]:
df_copy.sample(3, random_state=42)

Unnamed: 0,A,B,C,D,E,F,City
i,0.511771,0.586435,0.3332,0.212575,51.177087,0.844971,Chennai
b,0.0,0.875702,0.095496,0.370638,6.503082,0.160527,Chennai
f,0.0,0.597944,0.200797,0.156956,18.879474,0.389591,Chennai


In [471]:
df_copy.sample(3, random_state=42)

Unnamed: 0,A,B,C,D,E,F,City
i,0.511771,0.586435,0.3332,0.212575,51.177087,0.844971,Chennai
b,0.0,0.875702,0.095496,0.370638,6.503082,0.160527,Chennai
f,0.0,0.597944,0.200797,0.156956,18.879474,0.389591,Chennai


In [487]:
df_copy.sample(3, replace=True)  # By Default False

Unnamed: 0,A,B,C,D,E,F,City
c,0.482769,0.3426,0.708207,0.498087,48.276896,1.190976,Chennai
City,Chennai,Chennai,Chennai,Chennai,Chennai,Chennai,Chennai
c,0.482769,0.3426,0.708207,0.498087,48.276896,1.190976,Chennai
