# Selection and Transformation of Relevant Characteristics

Examples of dropping duplicate rows, encoding categorical variables, and applying a logarithmic transformation.

In [5]:
# import libraries

import pandas as pd 
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [29]:
# Build the demo frame from three equal-length columns.
# Row 9 ('cat', 1, 'a') repeats row 0, so the frame starts with one duplicate.

data = dict(
    D=['cat', 'dog', 'cat', 'bird', 'dog', 'cat', 'bird', 'cat', 'dog', 'cat'],
    E=[1, 1, 2, 2, 3, 3, 4, 4, 5, 1],
    F=['a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
)
df = pd.DataFrame(data)

# Header and frame go out in one call; sep="\n" puts the frame on its own line.
print("DataFrame original:", df, sep="\n")

DataFrame original:
      D  E  F
0   cat  1  a
1   dog  1  b
2   cat  2  a
3  bird  2  b
4   dog  3  a
5   cat  3  b
6  bird  4  a
7   cat  4  b
8   dog  5  a
9   cat  1  a


In [30]:
# add a new duplicate row

# Select row 0 with a list indexer so it stays a one-row DataFrame carrying the
# original column dtypes. Going through a Series (df.iloc[0].to_frame().T)
# yields an all-object row, and concatenating it would upcast the integer
# column E to object.
df = pd.concat([df, df.iloc[[0]]], ignore_index=True)


       D  E  F
0    cat  1  a
1    dog  1  b
2    cat  2  a
3   bird  2  b
4    dog  3  a
5    cat  3  b
6   bird  4  a
7    cat  4  b
8    dog  5  a
9    cat  1  a
10   cat  1  a


In [31]:
# Show the frame now that row 0 has been appended again as row 10.
print("Original DataFrame with a duplicated row:", df, sep="\n")

Original DataFrame with a duplicated row:
       D  E  F
0    cat  1  a
1    dog  1  b
2    cat  2  a
3   bird  2  b
4    dog  3  a
5    cat  3  b
6   bird  4  a
7    cat  4  b
8    dog  5  a
9    cat  1  a
10   cat  1  a


In [32]:
# Encode the categorical column D as integer labels with LabelEncoder.
# In the output below the mapping is bird -> 0, cat -> 1, dog -> 2.

encoder = LabelEncoder()
encoder.fit(df['D'])                   # learn the distinct labels in D
df['D'] = encoder.transform(df['D'])   # overwrite D with its integer codes

print("\nDataFrame after coding the categorical variables", df, sep="\n")


DataFrame after coding the categorical variables
    D  E  F
0   1  1  a
1   2  1  b
2   1  2  a
3   0  2  b
4   2  3  a
5   1  3  b
6   0  4  a
7   1  4  b
8   2  5  a
9   1  1  a
10  1  1  a


In [33]:
# Remove repeated rows in place, keeping the first occurrence of each.
# Rows 9 and 10 repeat row 0 and are dropped; the surviving rows keep their
# original index labels.

df.drop_duplicates(keep="first", inplace=True)

print("\nDataFrame after deleting the duplicates", df, sep="\n")


DataFrame after deleting the duplicates
   D  E  F
0  1  1  a
1  2  1  b
2  1  2  a
3  0  2  b
4  2  3  a
5  1  3  b
6  0  4  a
7  1  4  b
8  2  5  a


In [35]:
# logarithmic transformation data for high variability variables
#
# np.log is a vectorized ufunc, so it is applied to the whole column directly
# instead of being routed through Series.apply. The cast to float gives the
# column a numeric dtype before the log is taken.
#
# NOTE: this cell overwrites E, so it is not idempotent. After one run row 0
# holds log(1) = 0, and running the cell again would take the log of that zero.
# Run it once per freshly built `df`.
df["E"] = np.log(df["E"].astype(float))

print("\nDataFrame after logarithmic transformation")
print(df)


DataFrame after logarithmic transformation
   D         E  F
0  1  0.000000  a
1  2  0.000000  b
2  1  0.693147  a
3  0  0.693147  b
4  2  1.098612  a
5  1  1.098612  b
6  0  1.386294  a
7  1  1.386294  b
8  2  1.609438  a
