In [1]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from matplotlib import pyplot as plt
from collections import Counter
import pandas as pd
import numpy as np
import seaborn as sns
import random
import math
import re

# Plot styling: ggplot theme, 6x4 default figure size, black axes edges.
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': [6, 4],
    'axes.edgecolor': 'black',
})

## Feature Engineering notebook

In [2]:
# Read the CSV from a path relative to the current working directory,
# then display the resulting frame as the cell output.
df = pd.read_csv('datasets/cleaned.csv')
df

Unnamed: 0,number,code,price,num_digits
0,1313,U,109000,4
1,3222,H,72000,4
2,30033,P,24000,5
3,7765,L,17500,4
4,8876,U,17500,4
...,...,...,...,...
2963,26277,U,2400,5
2964,28299,U,2400,5
2965,63383,U,2400,5
2966,93966,U,2400,5


In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2968 entries, 0 to 2967
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   number      2968 non-null   int64 
 1   code        2968 non-null   object
 2   price       2968 non-null   int64 
 3   num_digits  2968 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 92.9+ KB


In [4]:
# Store each number as text so its digits can be inspected character by character.
df['number'] = df['number'].astype(str)

# Two indicators of how patterned a number is:
#   unique_digits - how many distinct digits it contains
#   max_group     - how many times its most frequent digit occurs
df['unique_digits'] = df['number'].map(lambda digits: len(set(digits)))
df['max_group'] = df['number'].map(
    lambda digits: Counter(digits).most_common(1)[0][1]
)

df.head()

Unnamed: 0,number,code,price,num_digits,unique_digits,max_group
0,1313,U,109000,4,2,2
1,3222,H,72000,4,2,3
2,30033,P,24000,5,2,3
3,7765,L,17500,4,3,2
4,8876,U,17500,4,3,2


$H(X) = -\Large\sum_{i=1}^{n} P_{r}(x_{i}) \log_{b} P_{r}(x_{i})$

Shannon entropy is a measure of the amount of information in a string. I suppose it correlates negatively with price.

In [5]:
def entropy(string):
    '''Calculates the Shannon entropy of a string, in bits.

    The probability of each distinct character is its relative frequency in
    ``string``; the result is ``-sum(p * log2(p))`` over those probabilities.

    Parameters
    ----------
    string : str
        Text whose character distribution is measured.

    Returns
    -------
    float
        Non-negative entropy in bits. ``0.0`` for an empty string and for a
        string made of a single repeated character.
    '''
    length = len(string)

    # One counting pass instead of calling string.count() per distinct character.
    probs = [count / length for count in Counter(string).values()]

    # Every term p * log2(p) is <= 0, so the total is <= 0.
    total = sum(p * math.log2(p) for p in probs)

    # Subtracting from 0.0 (rather than negating) yields +0.0 instead of -0.0
    # when the total is zero, e.g. for "1111" or the empty string.
    return 0.0 - total

# Add the Shannon entropy of each number as a feature column, then preview.
df['shan_entrop'] = df['number'].map(entropy)

df.head()

Unnamed: 0,number,code,price,num_digits,unique_digits,max_group,shan_entrop
0,1313,U,109000,4,2,2,1.0
1,3222,H,72000,4,2,3,0.811278
2,30033,P,24000,5,2,3,0.970951
3,7765,L,17500,4,3,2,1.5
4,8876,U,17500,4,3,2,1.5
