In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Five sample rocks and their rock types, written as (rock, type) pairs and
# then split into the two index-aligned lists used by the following cells.
# NOTE(review): 'granit' and 'ingeous' look like misspellings of 'granite' and
# 'igneous'; they are kept verbatim because the saved outputs contain them.
rocks, r_types = (
    list(column)
    for column in zip(
        ('limestone', 'sediment'),
        ('granit', 'ingeous'),
        ('marble', 'metamorphic'),
        ('sandstone', 'sediment'),
        ('obsidian', 'ingeous'),
    )
)

In [3]:
# Draw `data_len` random indices into `rocks`; each index selects one rock
# (and, through the parallel `r_types` list, its type) for a row of the frame.
SEED = 42  # arbitrary fixed value so that Restart & Run All reproduces the same sample
random.seed(SEED)  # seeds the global RNG, which the later `fracture` draw also uses
# NOTE: the outputs saved in this notebook were produced before seeding, so the
# concrete values change once on the first re-run and stay stable afterwards.
data_len = 200
r = [random.randrange(len(rocks)) for _ in range(data_len)]

In [4]:
# Assemble the sample frame in one go: the rock name and rock type looked up
# through the random indices in `r`, plus a random integer fracture grade.
data = pd.DataFrame({
    'rocks': [rocks[idx] for idx in r],
    'types': [r_types[idx] for idx in r],
    'fracture': [random.randrange(0, 10) for _ in r],  # 10 grades here now!
})

In [5]:
# Preview the generated frame (the notebook display truncates the middle rows).
data

Unnamed: 0,rocks,types,fracture
0,limestone,sediment,3
1,sandstone,sediment,4
2,obsidian,ingeous,5
3,obsidian,ingeous,9
4,marble,metamorphic,0
...,...,...,...
195,limestone,sediment,3
196,obsidian,ingeous,7
197,sandstone,sediment,8
198,marble,metamorphic,1


# we have the data now

In [6]:
# Same frame again, unchanged since the previous cell, shown right before labeling.
data

Unnamed: 0,rocks,types,fracture
0,limestone,sediment,3
1,sandstone,sediment,4
2,obsidian,ingeous,5
3,obsidian,ingeous,9
4,marble,metamorphic,0
...,...,...,...
195,limestone,sediment,3
196,obsidian,ingeous,7
197,sandstone,sediment,8
198,marble,metamorphic,1


## labeling

In [7]:
# Map the integer fracture grade (0-9, drawn with randrange(0, 10)) to five
# ordered quality classes, two grades per class.
# Explicit edges are used instead of `bins=5`: with an integer bin count
# pd.cut derives the edges from the observed min/max, so the grade -> class
# mapping would shift whenever a sample happens to miss grade 0 or grade 9.
# Intervals are right-closed: (-1, 1] -> 0,1; (1, 3] -> 2,3; (3, 5] -> 4,5;
# (5, 7] -> 6,7; (7, 9] -> 8,9 -- the same mapping `bins=5` gives on full-range data.
data['fracture_class'] = pd.cut(
    data['fracture'],
    bins=[-1, 1, 3, 5, 7, 9],
    labels=['Poor', 'Below_average', 'Average', 'Above_Average', 'Excellent'],
)

In [8]:
# Sanity check: which class(es) do the two lowest grades, 0 and 1, end up in?
data.loc[data['fracture'].isin([0, 1]), 'fracture_class'].unique()

[Poor]
Categories (1, object): [Poor]

In [12]:
# Frame with the new `fracture_class` column next to the raw `fracture` grade.
data

Unnamed: 0,rocks,types,fracture,fracture_class
0,limestone,sediment,3,Below_average
1,sandstone,sediment,4,Average
2,obsidian,ingeous,5,Average
3,obsidian,ingeous,9,Excellent
4,marble,metamorphic,0,Poor
...,...,...,...,...
195,limestone,sediment,3,Below_average
196,obsidian,ingeous,7,Above_Average
197,sandstone,sediment,8,Excellent
198,marble,metamorphic,1,Poor


In [13]:
# pd.cut with an integer bin count: three equal-width bins over the data range.
# The lowest edge is pushed slightly below the minimum (0.994 instead of 1)
# so that the minimum value itself falls inside the first right-closed bin.
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)

[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]

In [14]:
# Binning with explicit, non-contiguous intervals: values that fall into a gap
# between intervals, or exactly on an open left edge (such as 0), become NaN.
bins = pd.IntervalIndex.from_tuples([(left, left + 1) for left in (0, 2, 4)])
pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins=bins)

[NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
Categories (3, interval[int64]): [(0, 1] < (2, 3] < (4, 5]]

# encoding

In [15]:
# Store the integer code of each category (Poor -> 0 ... Excellent -> 4).
# Note: `.cat.codes` yields plain integers, not a categorical column.
data['fracture_class_cat'] = data['fracture_class'].cat.codes

In [16]:
# Compare `fracture_class` with its integer codes in `fracture_class_cat`.
data

Unnamed: 0,rocks,types,fracture,fracture_class,fracture_class_cat
0,limestone,sediment,3,Below_average,1
1,sandstone,sediment,4,Average,2
2,obsidian,ingeous,5,Average,2
3,obsidian,ingeous,9,Excellent,4
4,marble,metamorphic,0,Poor,0
...,...,...,...,...,...
195,limestone,sediment,3,Below_average,1
196,obsidian,ingeous,7,Above_Average,3
197,sandstone,sediment,8,Excellent,4
198,marble,metamorphic,1,Poor,0


In [17]:
data.dtypes  # data['fracture_class_cat'] holds int8 codes, so it is not really a category

rocks                   object
types                   object
fracture                 int64
fracture_class        category
fracture_class_cat        int8
dtype: object

In [18]:
# Cast the integer codes to the pandas `category` dtype.
data['f_c_c_real_categ'] = data['fracture_class_cat'].astype("category")

In [19]:
# The new column shows the same code values as `fracture_class_cat`.
data

Unnamed: 0,rocks,types,fracture,fracture_class,fracture_class_cat,f_c_c_real_categ
0,limestone,sediment,3,Below_average,1,1
1,sandstone,sediment,4,Average,2,2
2,obsidian,ingeous,5,Average,2,2
3,obsidian,ingeous,9,Excellent,4,4
4,marble,metamorphic,0,Poor,0,0
...,...,...,...,...,...,...
195,limestone,sediment,3,Below_average,1,1
196,obsidian,ingeous,7,Above_Average,3,3
197,sandstone,sediment,8,Excellent,4,4
198,marble,metamorphic,1,Poor,0,0


In [20]:
data.dtypes # yes! data['f_c_c_real_categ'] has the category dtype now

rocks                   object
types                   object
fracture                 int64
fracture_class        category
fracture_class_cat        int8
f_c_c_real_categ      category
dtype: object

## label encoding

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
# Create a LabelEncoder instance; it is fitted on the `fracture` column in a later cell.
labelencoder = LabelEncoder()

In [23]:
# Distinct fracture grades, listed in order of first appearance.
data['fracture'].unique()

array([3, 4, 5, 9, 0, 1, 7, 6, 8, 2])

In [24]:
# Fit the encoder on the grades and store the resulting integer labels.
# In the saved output the labels equal the original grades (already 0-9).
data['fract_labels'] = labelencoder.fit_transform(data['fracture']) # author's note: NaNs would be encoded too (none occur in this integer column)

In [25]:
data # created integer labels, not a category dtype

Unnamed: 0,rocks,types,fracture,fracture_class,fracture_class_cat,f_c_c_real_categ,fract_labels
0,limestone,sediment,3,Below_average,1,1,3
1,sandstone,sediment,4,Average,2,2,4
2,obsidian,ingeous,5,Average,2,2,5
3,obsidian,ingeous,9,Excellent,4,4,9
4,marble,metamorphic,0,Poor,0,0,0
...,...,...,...,...,...,...,...
195,limestone,sediment,3,Below_average,1,1,3
196,obsidian,ingeous,7,Above_Average,3,3,7
197,sandstone,sediment,8,Excellent,4,4,8
198,marble,metamorphic,1,Poor,0,0,1


## One-Hot Encoder

In [26]:
from sklearn.preprocessing import OneHotEncoder

In [27]:
# Create a OneHotEncoder instance with default settings.
enc = OneHotEncoder()

In [28]:
# Distinct label values; each one gets its own one-hot column below.
data['fract_labels'].unique()

array([3, 4, 5, 9, 0, 1, 7, 6, 8, 2])

In [29]:
# One-hot encode `fract_labels`. The double brackets pass a one-column
# DataFrame to the encoder; `.toarray()` turns the fit_transform result into
# a dense array before it is wrapped in a DataFrame, whose columns get the
# default integer names 0-9.
enc_fl = pd.DataFrame(enc.fit_transform(data[['fract_labels']]).toarray())
enc_fl

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
198,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Append the one-hot columns to the main frame; `join` aligns rows on the index.
# NOTE(review): not idempotent -- a second run would try to add columns 0-9 again,
# which presumably fails on the overlapping names; rebuild `data` before re-running.
data = data.join(enc_fl)

In [31]:
# Frame with the ten one-hot columns (named 0-9) appended.
data

Unnamed: 0,rocks,types,fracture,fracture_class,fracture_class_cat,f_c_c_real_categ,fract_labels,0,1,2,3,4,5,6,7,8,9
0,limestone,sediment,3,Below_average,1,1,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,sandstone,sediment,4,Average,2,2,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,obsidian,ingeous,5,Average,2,2,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,obsidian,ingeous,9,Excellent,4,4,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,marble,metamorphic,0,Poor,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,limestone,sediment,3,Below_average,1,1,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
196,obsidian,ingeous,7,Above_Average,3,3,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
197,sandstone,sediment,8,Excellent,4,4,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
198,marble,metamorphic,1,Poor,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# The one-hot columns are float64, while `fract_labels` is int64.
data.dtypes

rocks                   object
types                   object
fracture                 int64
fracture_class        category
fracture_class_cat        int8
f_c_c_real_categ      category
fract_labels             int64
0                      float64
1                      float64
2                      float64
3                      float64
4                      float64
5                      float64
6                      float64
7                      float64
8                      float64
9                      float64
dtype: object

# Categorical OneHotEncoding

In [35]:
from sklearn.preprocessing import LabelBinarizer

In [36]:
# Fit a LabelBinarizer on the distinct class labels.
lb = LabelBinarizer()
lb.fit(data['fracture_class'].unique())

LabelBinarizer()

In [37]:
# Binarize the whole column: one row per sample, one column per class.
lb_enc = lb.transform(data['fracture_class'])

In [38]:
lb.inverse_transform(lb_enc) # maps the encoded rows back to a column of labels!

array(['Below_average', 'Average', 'Average', 'Excellent', 'Poor', 'Poor',
       'Average', 'Above_Average', 'Average', 'Average', 'Below_average',
       'Poor', 'Above_Average', 'Average', 'Excellent', 'Above_Average',
       'Excellent', 'Poor', 'Poor', 'Average', 'Above_Average',
       'Above_Average', 'Poor', 'Below_average', 'Above_Average',
       'Above_Average', 'Poor', 'Above_Average', 'Above_Average',
       'Average', 'Average', 'Average', 'Poor', 'Average', 'Poor',
       'Above_Average', 'Above_Average', 'Average', 'Excellent',
       'Below_average', 'Excellent', 'Poor', 'Above_Average',
       'Below_average', 'Average', 'Average', 'Below_average', 'Average',
       'Average', 'Excellent', 'Average', 'Above_Average', 'Excellent',
       'Poor', 'Above_Average', 'Below_average', 'Above_Average',
       'Below_average', 'Average', 'Below_average', 'Poor', 'Excellent',
       'Poor', 'Above_Average', 'Below_average', 'Above_Average',
       'Below_average', 'Poor', 'Abov

### And here the trouble begins!

In [44]:
# Wrap the encoded array in a DataFrame (columns get default integer names).
lb_encdf = pd.DataFrame(lb_enc)

In [45]:
# Distinct classes in order of first appearance (not in category order).
data['fracture_class'].unique()

[Below_average, Average, Excellent, Poor, Above_Average]
Categories (5, object): [Poor < Below_average < Average < Above_Average < Excellent]

In [40]:
# Encoded columns, still unnamed (0-4).
# NOTE(review): this cell's execution count (40) is lower than that of the cell
# defining `lb_encdf` (44); re-run top to bottom to confirm the output.
lb_encdf

Unnamed: 0,0,1,2,3,4
0,0,0,1,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,0,0,0,1
...,...,...,...,...,...
195,0,0,1,0,0
196,1,0,0,0,0
197,0,0,0,1,0
198,0,0,0,0,1


In [46]:
columns = data['fracture_class'].unique() # let's name the columns! (names in order of first appearance)

In [47]:
# Inspect the candidate column names.
columns

[Below_average, Average, Excellent, Poor, Above_Average]
Categories (5, object): [Poor < Below_average < Average < Above_Average < Excellent]

In [48]:
# Naive attempt: assign the names in order of first appearance.
# The following cells show that this labels the columns incorrectly.
lb_encdf.columns = columns

In [51]:
lb_encdf[:6] # compare with the fracture_class values shown in the next cell!

Unnamed: 0,Below_average,Average,Excellent,Poor,Above_Average
0,0,0,1,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,0,0,0,1
5,0,0,0,0,1


In [52]:
data.head(6) # it does not match! e.g. row 0 is Below_average here, but flagged as Excellent above

Unnamed: 0,rocks,types,fracture,fracture_class,fracture_class_cat,f_c_c_real_categ,fract_labels,0,1,2,3,4,5,6,7,8,9
0,limestone,sediment,3,Below_average,1,1,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,sandstone,sediment,4,Average,2,2,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,obsidian,ingeous,5,Average,2,2,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,obsidian,ingeous,9,Excellent,4,4,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,marble,metamorphic,0,Poor,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,granit,ingeous,1,Poor,0,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# The mismatch arises because LabelBinarizer implicitly orders its output
# columns by the sorted label names, not by order of first appearance.
np.sort(columns)

array(['Above_Average', 'Average', 'Below_average', 'Excellent', 'Poor'],
      dtype=object)

In [54]:
# Name the one-hot columns from the fitted binarizer itself: column i of
# `lb.transform(...)` corresponds to `lb.classes_[i]`. Reading the order from
# `classes_` is the right way to label the encoded values, because it does not
# rely on a separate sort that merely happens to agree with the encoder.
lb_encdf.columns = lb.classes_

In [56]:
# First rows of the encoded frame with the corrected column names.
lb_encdf[:3]

Unnamed: 0,Above_Average,Average,Below_average,Excellent,Poor
0,0,0,1,0,0
1,0,1,0,0,0
2,0,1,0,0,0


In [57]:
data.head(3) # now the flagged column names match fracture_class!

Unnamed: 0,rocks,types,fracture,fracture_class,fracture_class_cat,f_c_c_real_categ,fract_labels,0,1,2,3,4,5,6,7,8,9
0,limestone,sediment,3,Below_average,1,1,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,sandstone,sediment,4,Average,2,2,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,obsidian,ingeous,5,Average,2,2,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Beware of implicit transformations!!!