In [1]:
import pandas as pd

In [2]:
# Read the home-price table from the relative path 'homeprices.csv'.
df = pd.read_csv(filepath_or_buffer='homeprices.csv')
# Preview the first five rows (the default row count of head()).
df.head(n=5)

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


### Label Encoding

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Independent copy: the label-encoding cells overwrite df1['town'], while later
# cells still read the original text column from df.
df1 = df.copy(deep=True)

In [5]:
# Learn the town -> integer mapping, then preview the codes without storing them.
# In the output below, monroe township -> 0, robinsville -> 1, west windsor -> 2.
le = LabelEncoder().fit(df1['town'])
le.transform(df1['town'])

array([0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1])

In [6]:
# Replace the text column with its integer codes, in place on the working copy.
df1.town = le.fit_transform(df1.town)
df1

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


### Dummy Variables

In [7]:
# One indicator column per distinct town.
# dtype=int pins the 0/1 representation: pandas >= 2.0 defaults to boolean
# dummies (True/False), which would not match the table shown below and is
# less convenient to feed to a numeric model.
dummies = pd.get_dummies(df.town, dtype=int)
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [8]:
# Place the indicator columns to the right of the original frame (rows are
# aligned on the shared index). The result is only displayed, not stored.
pd.concat(objs=[df, dummies], axis=1)

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


### OrdinalEncoder

In [9]:
# Small hand-made sample with two ordered categorical columns.
arr = dict(
    review=["Good", "Good", "Poor", "Average", "Good", "Poor"],
    degree=["PhD", "Masters", "Bachelors", "PhD", "Masters", "Masters"],
)

# Dict keys become the column names; each list becomes one column.
df2 = pd.DataFrame(data=arr)
df2

Unnamed: 0,review,degree
0,Good,PhD
1,Good,Masters
2,Poor,Bachelors
3,Average,PhD
4,Good,Masters
5,Poor,Masters


In [10]:
from sklearn.preprocessing import OrdinalEncoder
# One category list per column of df2, in column order. A value's position in
# its list is the code it receives, so the lists spell out the ranking.
enc = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],        # review: 0, 1, 2
        ['Bachelors', 'Masters', 'PhD'],    # degree: 0, 1, 2
    ]
)
# Preview the encoded array; df2 itself is not modified here.
enc.fit(df2).transform(df2)

array([[2., 2.],
       [2., 1.],
       [0., 0.],
       [1., 2.],
       [2., 1.],
       [0., 1.]])

In [11]:
# Overwrite both text columns with their ordinal codes (floats, per the output below).
# NOTE(review): df2 is modified in place, so re-running this cell without first
# rebuilding df2 feeds numbers to an encoder configured with string categories —
# expect that to fail; confirm before relying on re-runs.
df2[['review','degree']] = enc.fit_transform(df2)
df2

Unnamed: 0,review,degree
0,2.0,2.0
1,2.0,1.0
2,0.0,0.0
3,1.0,2.0
4,2.0,1.0
5,0.0,1.0


### OneHotEncoder

In [12]:
# Fetch the sample table from a public gist (needs network access).
df3 = pd.read_csv(
    filepath_or_buffer='https://gist.githubusercontent.com/bigsnarfdude/515849391ad37fe593997fe0db98afaa/raw'
)
# Preview the first five rows.
df3.head(n=5)

Unnamed: 0,outlook,temperature,humidity,windy,play
0,overcast,hot,high,False,yes
1,overcast,cool,normal,True,yes
2,overcast,mild,high,True,yes
3,overcast,hot,normal,False,yes
4,rainy,mild,high,False,yes


In [13]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode every column of df3 and return a dense NumPy array.
# `sparse_output` is the current name of this option: the old `sparse` keyword
# was deprecated in scikit-learn 1.2 and removed in 1.4, where passing it
# raises a TypeError.
enc = OneHotEncoder(sparse_output=False)
enc.fit_transform(df3)

array([[1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1.],
       [1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.],
       [0., 1., 0., 0., 0., 1., 0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0.],
       [0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0.],
       [0., 0., 1., 1., 0., 0., 0., 1., 1., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1., 0., 1., 0., 1., 0., 1.]])

In [14]:
# Wrap the encoded array in a DataFrame for tabular display. The array carries
# no labels, so the columns are numbered by position.
pd.DataFrame(data=enc.fit_transform(X=df3))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
6,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
7,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
8,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
9,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


### ColumnTransformer

In [15]:
# Sample employee records for the ColumnTransformer demo: one numeric column
# with a gap (salary), two ordered categoricals (review, degree), one unordered
# categorical (city) and an identifier (name).
#
# The gap must be a real float NaN. The string 'NaN' would turn the whole
# column into object dtype holding a piece of text, so pandas would not report
# it as missing and a numeric (mean) imputer could not process the column.
arr4 = {
    "name":   ["A", "B", "C", "D", "E", "F"],
    "salary": [30000, 80000, 40000, float("nan"), 60000, 40500],
    "review": ["Good", "Good", "Poor", "Average", "Good", "Poor"],
    "degree": ["PhD", "Masters", "Bachelors", "PhD", "Masters", "Masters"],
    "city":   ["Alberta", "Quebec", "Alberta", "Regina", "Regina", "Quebec"],
}

df4 = pd.DataFrame(arr4)
df4

Unnamed: 0,name,salary,review,degree,city
0,A,30000.0,Good,PhD,Alberta
1,B,80000.0,Good,Masters,Quebec
2,C,40000.0,Poor,Bachelors,Alberta
3,D,,Average,PhD,Regina
4,E,60000.0,Good,Masters,Regina
5,F,40500.0,Poor,Masters,Quebec


In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# Route each group of columns through its own transformer:
#   tnf1 - fill the missing salary (SimpleImputer with its default settings)
#   tnf2 - ordinal-encode review and degree using the explicit rankings
#   tnf3 - one-hot encode city into a dense block
# Columns not listed (here: name) are appended unchanged via remainder='passthrough'.
# `sparse_output=False` replaces the `sparse=False` keyword, which scikit-learn
# deprecated in 1.2 and removed in 1.4.
ct = ColumnTransformer(
    transformers=[
        ('tnf1', SimpleImputer(), ['salary']),
        ('tnf2',
         OrdinalEncoder(categories=[['Poor', 'Average', 'Good'],
                                    ['Bachelors', 'Masters', 'PhD']]),
         ['review', 'degree']),
        ('tnf3', OneHotEncoder(sparse_output=False), ['city']),
    ],
    remainder='passthrough',
)

In [17]:
# Fit every transformer and apply it. Per the array below, the output columns are:
# imputed salary, review code, degree code, three one-hot city columns, then the
# passed-through name.
ct.fit_transform(df4)

array([[30000.0, 2.0, 2.0, 1.0, 0.0, 0.0, 'A'],
       [80000.0, 2.0, 1.0, 0.0, 1.0, 0.0, 'B'],
       [40000.0, 0.0, 0.0, 1.0, 0.0, 0.0, 'C'],
       [50100.0, 1.0, 2.0, 0.0, 0.0, 1.0, 'D'],
       [60000.0, 2.0, 1.0, 0.0, 0.0, 1.0, 'E'],
       [40500.0, 0.0, 1.0, 0.0, 1.0, 0.0, 'F']], dtype=object)

### Train Test Split

In [18]:
# Features: the town name and the area. Target: the sale price.
X = df.loc[:, ['town', 'area']]
y = df.price

In [19]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. No random_state is given, so the rows
# selected can differ each time this cell runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.3)

In [20]:
# Show the feature rows held out for testing by the split above.
X_test

Unnamed: 0,town,area
7,west windsor,3300
9,robinsville,2600
3,monroe township,3600
4,monroe township,4000


In [21]:
# Show the target prices of the held-out rows; the index labels match X_test.
y_test

7    650000
9    575000
3    680000
4    725000
Name: price, dtype: int64

In [22]:
# Same 30% hold-out, now with random_state=10 so the shuffle is seeded and the
# split can be reproduced. This rebinds all four split variables.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.3, random_state=10)
X_train

Unnamed: 0,town,area
8,west windsor,3600
2,monroe township,3200
12,robinsville,3600
5,west windsor,2600
10,robinsville,2900
1,monroe township,3000
0,monroe township,2600
4,monroe township,4000
9,robinsville,2600


In [23]:
# Show the held-out feature rows produced by the seeded (random_state=10) split.
X_test

Unnamed: 0,town,area
3,monroe township,3600
7,west windsor,3300
11,robinsville,3100
6,west windsor,2800
