In [1]:
import pandas as pd
import numpy as np


In [2]:
# Small demo frame: a categorical `key` column plus a numeric `data1` column.
data_dummies = pd.DataFrame(
    {
        'key': list('baaca'),
        'data1': range(5),
    }
)
data_dummies

Unnamed: 0,key,data1
0,b,0
1,a,1
2,a,2
3,c,3
4,a,4


In [5]:
# One indicator column per distinct value of `key`.
# dtype is pinned to uint8 (the pre-2.0 default) because pandas >= 2.0
# returns bool columns by default, which would show True/False instead of 0/1.
pd.get_dummies(data_dummies.key, dtype=np.uint8)

Unnamed: 0,a,b,c
0,0,1,0
1,1,0,0
2,1,0,0
3,0,0,1
4,1,0,0


In [6]:
# Same indicator matrix, with every column name prefixed by "key_".
# dtype is pinned to uint8 (the pre-2.0 default) because pandas >= 2.0
# returns bool columns by default, which would show True/False instead of 0/1.
pd.get_dummies(data_dummies.key, prefix='key', dtype=np.uint8)

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,1,0,0
2,1,0,0
3,0,0,1
4,1,0,0


In [7]:
# Keep the numeric column and attach the indicator columns built from `key`.
# dtype is pinned to uint8 (the pre-2.0 default) because pandas >= 2.0
# returns bool indicator columns by default instead of 0/1 integers.
data_dummies_df = data_dummies[['data1']].join(
    pd.get_dummies(data_dummies['key'], prefix='key', dtype=np.uint8)
)

In [8]:
# Display the joined frame: `data1` followed by the key_* indicator columns.
data_dummies_df

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,1,0,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0


In [12]:
# Seed NumPy's global random generator so the random draws are reproducible.
np.random.seed(seed=42)


In [13]:
# Ten uniform random samples drawn from NumPy's global generator.
v = np.random.rand(10)
v

array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,
       0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258])

In [14]:
# Bin edges used to discretise the random samples.
bins = [0.1, 0.3, 0.5, 0.7, 1]
bins

[0.1, 0.3, 0.5, 0.7, 1]

In [17]:
# Bin the samples, then build one indicator column per bin.
# A sample that falls outside every bin gets an all-zero row.
# dtype is pinned to uint8 (the pre-2.0 default) because pandas >= 2.0
# returns bool columns by default, which would show True/False instead of 0/1.
pd.get_dummies(pd.cut(v, bins), dtype=np.uint8)

Unnamed: 0,"(0.1, 0.3]","(0.3, 0.5]","(0.5, 0.7]","(0.7, 1.0]"
0,0,1,0,0
1,0,0,0,1
2,0,0,0,1
3,0,0,1,0
4,1,0,0,0
5,1,0,0,0
6,0,0,0,0
7,0,0,0,1
8,0,0,1,0
9,0,0,0,1


## Data Transformation

In [18]:
# Example Series in which -9 plays the role of a sentinel value.
# The single float literal (2.) makes the whole Series float64.
df = pd.Series([1, -9, 2., -9, -1, 3])
df

0    1.0
1   -9.0
2    2.0
3   -9.0
4   -1.0
5    3.0
dtype: float64

In [19]:
# Turn every -9 into NaN; the result is displayed, `df` is not reassigned.
df.replace(to_replace=-9, value=np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4   -1.0
5    3.0
dtype: float64

In [20]:
# Replace several values in one call: 1 becomes 5 and -1 becomes 6.
df.replace({1: 5, -1: 6})

0    5.0
1   -9.0
2    2.0
3   -9.0
4    6.0
5    3.0
dtype: float64

#### Lambda functions and the `map` method


In [21]:
# 3x4 frame holding 0..11, with fruit names as row labels.
data_tor = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Apple', 'Banana', 'Grapes'],
    columns=['one', 'two', 'three', 'four'],
)
data_tor

Unnamed: 0,one,two,three,four
Apple,0,1,2,3
Banana,4,5,6,7
Grapes,8,9,10,11


In [22]:
def upper(x):
    """Return the first five characters of ``x`` converted to upper case.

    Strings shorter than five characters are upper-cased in full.
    """
    return x[:5].upper()
# Preview the row labels after applying `upper` to each one (nothing is assigned here).
data_tor.index.map(upper)


Index(['APPLE', 'BANAN', 'GRAPE'], dtype='object')

In [23]:
# Overwrite the row labels of `data_tor` with their transformed versions.
data_tor.index = data_tor.index.map(upper)

In [24]:
# Display the frame to confirm the new row labels.
data_tor

Unnamed: 0,one,two,three,four
APPLE,0,1,2,3
BANAN,4,5,6,7
GRAPE,8,9,10,11


In [25]:
# Lower-case the row labels and title-case the column labels.
data_tor.rename(
    index=str.lower,
    columns=str.title,
)

Unnamed: 0,One,Two,Three,Four
apple,0,1,2,3
banan,4,5,6,7
grape,8,9,10,11


In [26]:
# Assign each sample in `v` to the right-closed interval delimited by `bins`;
# a sample outside every interval becomes NaN.
binned = pd.cut(v, bins)
binned

[(0.3, 0.5], (0.7, 1.0], (0.7, 1.0], (0.5, 0.7], (0.1, 0.3], (0.1, 0.3], NaN, (0.7, 1.0], (0.5, 0.7], (0.7, 1.0]]
Categories (4, interval[float64]): [(0.1, 0.3] < (0.3, 0.5] < (0.5, 0.7] < (0.7, 1.0]]

In [27]:
# Integer position of the bin each sample landed in; -1 marks a sample with no bin.
binned.codes

array([ 1,  3,  3,  2,  0,  0, -1,  3,  2,  3], dtype=int8)

In [28]:
# The intervals that make up the categories of `binned`.
binned.categories

IntervalIndex([(0.1, 0.3], (0.3, 0.5], (0.5, 0.7], (0.7, 1.0]],
              closed='right',
              dtype='interval[float64]')

In [29]:
# Number of samples that fell into each bin.
binned.value_counts()

(0.1, 0.3]    2
(0.3, 0.5]    1
(0.5, 0.7]    2
(0.7, 1.0]    4
dtype: int64

In [30]:
# Same binning as before, but each bin is shown under a custom label
# instead of its interval.
rebinnd = pd.cut(
    v,
    bins,
    labels=['One', 'two', 'three', 'four'],
)
rebinnd

[two, four, four, three, One, One, NaN, four, three, four]
Categories (4, object): [One < two < three < four]

In [32]:
# A second batch of ten uniform random samples.
rd = np.random.rand(10)
rd

array([0.02058449, 0.96990985, 0.83244264, 0.21233911, 0.18182497,
       0.18340451, 0.30424224, 0.52475643, 0.43194502, 0.29122914])

In [36]:
# Passing an integer asks pd.cut to pick 4 bins itself;
# precision=1 limits the digits shown in the bin labels.
pd.cut(rd, bins=4, precision=1).value_counts()

(0.02, 0.3]    4
(0.3, 0.5]     3
(0.5, 0.7]     1
(0.7, 1.0]     2
dtype: int64

In [38]:
# Quantile-based binning: 4 bins whose edges are sample quantiles of `rd`.
pd.qcut(rd, q=4, precision=1).value_counts()

(-0.08, 0.2]    3
(0.2, 0.3]      2
(0.3, 0.5]      2
(0.5, 1.0]      3
dtype: int64

### Detecting outliers


In [39]:
# Four columns of ten consecutive integers each; every column starts
# five higher than the one before it (A: 0.., B: 5.., C: 10.., D: 15..).
df = pd.DataFrame(
    {
        name: pd.Series(np.arange(start, start + 10))
        for name, start in zip('ABCD', range(0, 20, 5))
    }
)
df

Unnamed: 0,A,B,C,D
0,0,5,10,15
1,1,6,11,16
2,2,7,12,17
3,3,8,13,18
4,4,9,14,19
5,5,10,15,20
6,6,11,16,21
7,7,12,17,22
8,8,13,18,23
9,9,14,19,24


In [40]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,A,B,C,D
count,10.0,10.0,10.0,10.0
mean,4.5,9.5,14.5,19.5
std,3.02765,3.02765,3.02765,3.02765
min,0.0,5.0,10.0,15.0
25%,2.25,7.25,12.25,17.25
50%,4.5,9.5,14.5,19.5
75%,6.75,11.75,16.75,21.75
max,9.0,14.0,19.0,24.0


In [42]:
# Pull out column A as a Series.
col = df['A']
col

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
Name: A, dtype: int32

In [43]:
# Keep only the entries whose absolute value exceeds 4.
col[np.abs(col) > 4]

5    5
6    6
7    7
8    8
9    9
Name: A, dtype: int32

In [44]:
# Select every row that has at least one value whose absolute value exceeds 20.
# `axis` is passed by keyword: the positional form `.any(1)` was deprecated in
# pandas 1.x and raises a TypeError on pandas >= 2.0.
df[(np.abs(df) > 20).any(axis=1)]

Unnamed: 0,A,B,C,D
6,6,11,16,21
7,7,12,17,22
8,8,13,18,23
9,9,14,19,24


In [45]:
# Names in mixed capitalisation, each paired with a score.
data_tr = pd.DataFrame(
    {
        'Names': ['Raju', 'vali', 'Salu', 'Balu', 'Vali', 'mali'],
        'Score': [4, 3, 2, 6, 5, 1],
    }
)
data_tr

Unnamed: 0,Names,Score
0,Raju,4
1,vali,3
2,Salu,2
3,Balu,6
4,Vali,5
5,mali,1


In [48]:
# Lookup table from lower-case name to colour.
match_data_tr = {
    'raju': 'yellow',
    'vali': 'green',
    'salu': 'pink',
    'balu': 'red',
    'mali': 'magenta',
}
match_data_tr

{'raju': 'yellow',
 'vali': 'green',
 'salu': 'pink',
 'balu': 'red',
 'mali': 'magenta'}

In [50]:
# Lower-case the names so they match the keys of the lookup table.
lower_str = data_tr['Names'].str.lower()
lower_str

0    raju
1    vali
2    salu
3    balu
4    vali
5    mali
Name: Names, dtype: object

In [51]:
# Look up each lower-cased name in the table and store the result
# as a new `Color` column on `data_tr`.
data_tr['Color'] = lower_str.map(match_data_tr)
data_tr

Unnamed: 0,Names,Score,Color
0,Raju,4,yellow
1,vali,3,green
2,Salu,2,pink
3,Balu,6,red
4,Vali,5,green
5,mali,1,magenta


#### String Manipulation


In [52]:
# Sample sentence used by the string-method examples.
python_string = 'python_is a programming , language!'
python_string

'python_is a programming , language!'

In [53]:
# Break the sentence apart at every underscore.
python_string.split(sep='_')

['python', 'is a programming , language!']

In [55]:
# Break the sentence apart at every comma and keep the pieces.
cs = python_string.split(',')

In [56]:
# Glue the pieces back together with '.' as the separator.
'.'.join(cs)

'python_is a programming . language!'