[Reference](https://ai.plainenglish.io/data-transformation-in-pandas-29b2b3c61b34)

In [1]:
import pandas as pd

# 1. Finding the duplicate values

In [2]:
data = pd.DataFrame({"a":["one","two"]*3,
                     "b":[1,1,2,3,2,3]})
data

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3
4,one,2
5,two,3


In [3]:
data.duplicated()

0    False
1    False
2    False
3    False
4     True
5     True
dtype: bool

In [5]:
data.drop_duplicates()

Unnamed: 0,a,b
0,one,1
1,two,1
2,one,2
3,two,3


In [6]:
data["c"]=range(6)
data

Unnamed: 0,a,b,c
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,2,4
5,two,3,5


In [7]:
data.duplicated(["a","b"],keep="last")

0    False
1    False
2     True
3     True
4    False
5    False
dtype: bool

# 2. Mapping

In [8]:
df = pd.DataFrame({"names":["Tim","tom","Sam",
                            "kate","Kim"],
                   "scores":[60,50,70,80,40]})
df

Unnamed: 0,names,scores
0,Tim,60
1,tom,50
2,Sam,70
3,kate,80
4,Kim,40


In [9]:
classes={"Tim":"A","Tom":"A","Sam":"B",
         "Kate":"B","Kim":"B"}
n=df["names"].str.capitalize()

In [10]:
df["branches"]=n.map(classes)
df

Unnamed: 0,names,scores,branches
0,Tim,60,A
1,tom,50,A
2,Sam,70,B
3,kate,80,B
4,Kim,40,B


# 3. Replacing

In [11]:
s=pd.Series([80,70,90,60])
s

0    80
1    70
2    90
3    60
dtype: int64

In [12]:
import numpy as np
s.replace(70,np.nan)

0    80.0
1     NaN
2    90.0
3    60.0
dtype: float64

In [13]:
s.replace([70,60],[np.nan,0])

0    80.0
1     NaN
2    90.0
3     0.0
dtype: float64

In [14]:
s.replace({90:100,60:0})

0     80
1     70
2    100
3      0
dtype: int64

# 4. Renaming

In [15]:
df=pd.DataFrame(
    np.arange(12).reshape(3,4),
    index=[0,1,2],
    columns=["tim",'tom','kim','sam']
)

In [16]:
df

Unnamed: 0,tim,tom,kim,sam
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [17]:
s=pd.Series(["one","two","three"])
df.index=df.index.map(s)

In [18]:
df

Unnamed: 0,tim,tom,kim,sam
one,0,1,2,3
two,4,5,6,7
three,8,9,10,11


In [19]:
df.rename(index=str.title,columns=str.upper)

Unnamed: 0,TIM,TOM,KIM,SAM
One,0,1,2,3
Two,4,5,6,7
Three,8,9,10,11


In [20]:
df.rename(index={"one":'ten'},
          columns={"sam":'kate'},
          inplace=True)
df

Unnamed: 0,tim,tom,kim,kate
ten,0,1,2,3
two,4,5,6,7
three,8,9,10,11


# 5. Cutting

In [21]:
sc=[30,80,40,90,60,45,95,75,55,100,65,85]
x=[20,40,60,80,100]
y=pd.cut(sc,x)
y

[(20, 40], (60, 80], (20, 40], (80, 100], (40, 60], ..., (60, 80], (40, 60], (80, 100], (60, 80], (80, 100]]
Length: 12
Categories (4, interval[int64, right]): [(20, 40] < (40, 60] < (60, 80] < (80, 100]]

In [22]:
y.categories

IntervalIndex([(20, 40], (40, 60], (60, 80], (80, 100]], dtype='interval[int64, right]')

In [23]:
pd.value_counts(y)

(80, 100]    4
(40, 60]     3
(60, 80]     3
(20, 40]     2
dtype: int64

In [24]:
y=pd.cut(sc,x,right=False)
y

[[20, 40), [80, 100), [40, 60), [80, 100), [60, 80), ..., [60.0, 80.0), [40.0, 60.0), NaN, [60.0, 80.0), [80.0, 100.0)]
Length: 12
Categories (4, interval[int64, left]): [[20, 40) < [40, 60) < [60, 80) < [80, 100)]

In [26]:
nm=["low",'medium','high','very high']
pd.cut(sc,x,labels=nm)

['low', 'high', 'low', 'very high', 'medium', ..., 'high', 'medium', 'very high', 'high', 'very high']
Length: 12
Categories (4, object): ['low' < 'medium' < 'high' < 'very high']

In [27]:
pd.cut(sc,10)

[(29.93, 37.0], (79.0, 86.0], (37.0, 44.0], (86.0, 93.0], (58.0, 65.0], ..., (72.0, 79.0], (51.0, 58.0], (93.0, 100.0], (58.0, 65.0], (79.0, 86.0]]
Length: 12
Categories (10, interval[float64, right]): [(29.93, 37.0] < (37.0, 44.0] < (44.0, 51.0] <
                                            (51.0, 58.0] ... (72.0, 79.0] < (79.0, 86.0] <
                                            (86.0, 93.0] < (93.0, 100.0]]

In [28]:
data=np.random.randn(100)
c=pd.qcut(data,4)
c

[(0.598, 2.495], (-0.174, 0.598], (-0.907, -0.174], (0.598, 2.495], (-0.907, -0.174], ..., (-0.907, -0.174], (-0.907, -0.174], (-0.174, 0.598], (0.598, 2.495], (0.598, 2.495]]
Length: 100
Categories (4, interval[float64, right]): [(-1.9889999999999999, -0.907] < (-0.907, -0.174] < (-0.174, 0.598] <
                                           (0.598, 2.495]]

In [29]:
pd.value_counts(c)

(-1.9889999999999999, -0.907]    25
(-0.907, -0.174]                 25
(-0.174, 0.598]                  25
(0.598, 2.495]                   25
dtype: int64

# 6. Finding the specific values in a dataset


In [30]:
data=pd.DataFrame(np.random.randn(1000,4))
data.head()

Unnamed: 0,0,1,2,3
0,0.039501,-0.722882,0.365598,0.705373
1,0.277343,0.808021,-0.194761,0.040014
2,1.19222,-1.972406,-1.153142,0.262597
3,0.94315,-1.073868,-0.965331,1.584877
4,0.11038,0.102637,-0.167459,-0.078545


In [31]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.031664,-0.011269,0.020176,-0.02886
std,0.929313,0.986473,0.962158,0.961403
min,-2.897299,-3.247163,-2.916476,-3.282283
25%,-0.667055,-0.686047,-0.607756,-0.669878
50%,-0.044144,0.032012,0.004408,-0.016951
75%,0.602788,0.649154,0.621236,0.616861
max,2.775391,3.425898,3.161205,2.724488


In [32]:
col=data[1]
col

0     -0.722882
1      0.808021
2     -1.972406
3     -1.073868
4      0.102637
         ...   
995   -0.285158
996    0.493734
997    0.538819
998   -0.752645
999    0.390163
Name: 1, Length: 1000, dtype: float64

In [33]:
col[np.abs(col)>3]

171    3.093358
199   -3.025666
765    3.425898
842   -3.247163
Name: 1, dtype: float64

In [35]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
171,0.237253,3.093358,0.95896,2.011707
199,0.377464,-3.025666,0.657728,0.991916
392,-1.020106,-0.032019,3.007505,-2.220461
450,0.908077,2.057724,-0.211037,-3.282283
765,1.111011,3.425898,1.995738,-1.50355
842,-0.179564,-3.247163,-0.155432,-0.421469
937,-0.328761,0.790775,3.161205,-0.517723


In [36]:
np.sign(data.head())

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,1.0,1.0,-1.0,1.0
2,1.0,-1.0,-1.0,1.0
3,1.0,-1.0,-1.0,1.0
4,1.0,1.0,-1.0,-1.0


# 7. Selecting


In [37]:
data=pd.DataFrame(
    np.arange(12).reshape(4,3)
)
data

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [38]:
rw=np.random.permutation(4)
rw

array([3, 0, 2, 1])

In [39]:
data.take(rw)

Unnamed: 0,0,1,2
3,9,10,11
0,0,1,2
2,6,7,8
1,3,4,5


In [40]:
data.sample()

Unnamed: 0,0,1,2
1,3,4,5


In [41]:
data.sample(n=2)

Unnamed: 0,0,1,2
0,0,1,2
2,6,7,8


# 8. Converts categorical data into dummy variables


In [44]:
data=pd.DataFrame(
    {"letter":["c","b","a","b","b","a"],
     "number":range(6)}
)
data

Unnamed: 0,letter,number
0,c,0
1,b,1
2,a,2
3,b,3
4,b,4
5,a,5


In [45]:
pd.get_dummies(data["letter"])

Unnamed: 0,a,b,c
0,0,0,1
1,0,1,0
2,1,0,0
3,0,1,0
4,0,1,0
5,1,0,0


In [46]:
data=np.random.randn(10)
data

array([-1.68660783,  1.08676593,  0.47844931, -0.67372545, -0.15717798,
        0.25751165, -1.60165811, -1.60721938, -0.25907932, -1.60645   ])

In [47]:
pd.get_dummies(pd.cut(data,4))

Unnamed: 0,"(-1.689, -0.993]","(-0.993, -0.3]","(-0.3, 0.393]","(0.393, 1.087]"
0,1,0,0,0
1,0,0,0,1
2,0,0,0,1
3,0,1,0,0
4,0,0,1,0
5,0,0,1,0
6,1,0,0,0
7,1,0,0,0
8,0,0,1,0
9,1,0,0,0
