## This notebook summarizes notes on machine learning models:
## how to deal with categorical columns in a dataset

### =====================================================================

### By : Mohammed Agoor

Git : https://github.com/AGOOR97

#### ============================= Coded By AGOOR ===================================

In [1]:
import pandas as pd

# importing the total Library and import its methods below to know what we use
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the housing data; the path is relative to the current working directory.
house = pd.read_csv('house_dataset.csv')

### Here I will deal with categorical values using different methods

#### 1- Dropping them
#### 2- Label Encoding
#### 3- One-Hot-Encoding 


### Firstly : Dropping them - Categorical Columns

In [3]:
# Separate the target column `Price` from the feature columns.
y = house['Price']
X = house.drop(columns=['Price'])

# Two possible workflows for handling categorical columns:
#   1. split the data first, then treat the categorical columns of the training
#      and validation features separately
#   2. treat the categorical columns of the whole feature table, then split it
#      into training and validation sets
#
# This notebook follows the second workflow.

# First step: drop the columns that contain missing values.

In [4]:
# Helper: fit a random forest and return its validation MAE for a given number of trees
def getMAE(leaf_nodes,train_x_sent,val_x_sent,train_y_sent,val_y_sent):
    '''Fit a RandomForestRegressor and return its mean absolute error on validation data.

    Parameters
    ----------
    leaf_nodes : int
        Number of trees in the forest. Despite its name, this value is passed as
        ``n_estimators``; it does not limit the number of leaves per tree.
    train_x_sent, train_y_sent
        Training features and training target used to fit the model.
    val_x_sent, val_y_sent
        Validation features and the true validation target used for scoring.

    Returns
    -------
    The mean absolute error between the true validation target and the
    model's predictions for ``val_x_sent``.
    '''
    # Use the estimator imported at the top of the notebook instead of reaching
    # through the `sk.ensemble` attribute; random_state=0 keeps runs reproducible.
    model = RandomForestRegressor(n_estimators = leaf_nodes,random_state = 0)
    model.fit(train_x_sent,train_y_sent)
    pred = model.predict(val_x_sent)
    # scikit-learn's documented argument order is (y_true, y_pred).
    mae_cal = mean_absolute_error(val_y_sent,pred)
    return mae_cal

In [5]:
# Names of the feature columns that contain at least one missing value.
has_missing = X.isnull().any()
col_miss_values = X.columns[has_missing.values].tolist()
col_miss_values

['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']

In [6]:
# Discard every feature column that has missing values (no imputation here).
X_full_features = X.drop(columns=col_miss_values)

In [7]:
# Approach 1 also discards every categorical (object-dtype) column.
is_object_col = X_full_features.dtypes == 'object'
categ_cols = X_full_features.columns[is_object_col.values].tolist()

In [8]:
# Display the object-dtype columns found in X_full_features.
categ_cols

['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'Regionname']

In [9]:
# With the categorical columns removed, X_full_ready is ready to be split.
X_full_ready = X_full_features.drop(columns=categ_cols)

In [10]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    X_full_ready,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
train_x, val_x, train_y, val_y = split_parts


In [11]:
# Score approach 1 (categorical columns dropped); 100 is passed as getMAE's first argument.
mae_approach1 = getMAE(
    leaf_nodes=100,
    train_x_sent=train_x,
    val_x_sent=val_x,
    train_y_sent=train_y,
    val_y_sent=val_y,
)
print('Categorical Solution ... (Aproach 1) - Dropping them \n' , mae_approach1)

Categorical Solution ... (Aproach 1) - Dropping them 
 175703.48185157913


### =====================================================================

### Secondly : Label Encoding them - Categorical Columns

In [12]:
# Rebuild the target and the feature table from the loaded frame for this section.
y = house['Price']
X = house.drop(columns=['Price'])

In [13]:
# Helper: fit a random forest and return its validation MAE for a given number of trees
def getMAE(leaf_nodes,train_x_sent,val_x_sent,train_y_sent,val_y_sent):
    '''Fit a RandomForestRegressor and return its mean absolute error on validation data.

    Parameters
    ----------
    leaf_nodes : int
        Number of trees in the forest. Despite its name, this value is passed as
        ``n_estimators``; it does not limit the number of leaves per tree.
    train_x_sent, train_y_sent
        Training features and training target used to fit the model.
    val_x_sent, val_y_sent
        Validation features and the true validation target used for scoring.

    Returns
    -------
    The mean absolute error between the true validation target and the
    model's predictions for ``val_x_sent``.
    '''
    # Use the estimator imported at the top of the notebook instead of reaching
    # through the `sk.ensemble` attribute; random_state=0 keeps runs reproducible.
    model = RandomForestRegressor(n_estimators = leaf_nodes,random_state = 0)
    model.fit(train_x_sent,train_y_sent)
    pred = model.predict(val_x_sent)
    # scikit-learn's documented argument order is (y_true, y_pred).
    mae_cal = mean_absolute_error(val_y_sent,pred)
    return mae_cal

In [14]:
# Names of the feature columns that contain at least one missing value.
has_missing = X.isnull().any()
col_miss_values = X.columns[has_missing.values].tolist()
col_miss_values

['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']

In [15]:
# Discard every feature column that has missing values (no imputation here).
X_full_features = X.drop(columns=col_miss_values)

In [16]:
# Pick the columns used in this section:
#   * numeric columns (int64 / float64), and
#   * categorical columns (object dtype) with fewer than 10 distinct values.
# Higher-cardinality categorical columns are left out to keep the dataset small.
num_cols = []
cols_categ_encoding = []
for cname in X_full_features.columns:
    column = X_full_features[cname]
    if column.dtype in ['int64','float64']:
        num_cols.append(cname)
    if column.dtype == 'object' and column.nunique() < 10:
        cols_categ_encoding.append(cname)

# Numeric columns first, then the selected categorical ones.
new_cols = num_cols + cols_categ_encoding

# Copy so that later edits do not touch X_full_features.
X_new = X_full_features[new_cols].copy()


In [17]:
# The low-cardinality categorical columns that will be label encoded
cols_categ_encoding

['Type', 'Method', 'Regionname']

In [18]:
# Boolean mask over X_new's columns: True where the column has object dtype.
categ = X_new.dtypes == 'object'
# Keep only the labels flagged True; these are the columns to label encode.
list_object_for_encoding = categ.index[categ.values]

In [19]:
# Display the column labels selected for label encoding.
list_object_for_encoding

Index(['Type', 'Method', 'Regionname'], dtype='object')

In [20]:
# Work on an independent copy so that X_new keeps the original string categories.
X_encoding = X_new.copy(deep=True)

In [21]:

# LabelEncoder replaces each distinct category with an integer code. The codes
# follow the sorted order of the category labels, not any meaningful ranking.
#
# A separate encoder is fitted for every column and kept in `label_encoders`, so
# each column's mapping stays available afterwards (e.g. through `classes_` or
# `inverse_transform`). Refitting one shared encoder would keep only the mapping
# of the last column.
my_encoder = LabelEncoder()
label_encoders = {}

for col in list_object_for_encoding:
    my_encoder = LabelEncoder()
    # Read from X_new (the untouched strings) and write the codes into X_encoding.
    X_encoding[col] = my_encoder.fit_transform(X_new[col])
    label_encoders[col] = my_encoder

In [22]:
# Preview the first rows of the label-encoded feature table.
X_encoding.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount,Type,Method,Regionname
0,2,2.5,3067.0,2.0,1.0,202.0,-37.7996,144.9984,4019.0,0,1,2
1,2,2.5,3067.0,2.0,1.0,156.0,-37.8079,144.9934,4019.0,0,1,2
2,3,2.5,3067.0,3.0,2.0,134.0,-37.8093,144.9944,4019.0,0,3,2
3,3,2.5,3067.0,3.0,2.0,94.0,-37.7969,144.9969,4019.0,0,0,2
4,4,2.5,3067.0,3.0,1.0,120.0,-37.8072,144.9941,4019.0,0,4,2


In [23]:
# The label-encoded features are ready to split:
# hold out 20% of the rows for validation, with a fixed seed for reproducibility.
split_parts = train_test_split(
    X_encoding,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
train_x, val_x, train_y, val_y = split_parts


In [24]:
# Score approach 2 (label encoding); 100 is passed as getMAE's first argument.
mae_approach2 = getMAE(
    leaf_nodes=100,
    train_x_sent=train_x,
    val_x_sent=val_x,
    train_y_sent=train_y,
    val_y_sent=val_y,
)
print('Categorical Solution ... (Aproach 2) - Label Encoding them \n' , mae_approach2)

Categorical Solution ... (Aproach 2) - Label Encoding them 
 166176.0778164668


### =====================================================================

### Thirdly : One Hot Encoding - Categorical Columns

In [25]:
# Rebuild the target and the feature table from the loaded frame for this section.
y = house['Price']
X = house.drop(columns=['Price'])

In [26]:
# Helper: fit a random forest and return its validation MAE for a given number of trees
def getMAE(leaf_nodes,train_x_sent,val_x_sent,train_y_sent,val_y_sent):
    '''Fit a RandomForestRegressor and return its mean absolute error on validation data.

    Parameters
    ----------
    leaf_nodes : int
        Number of trees in the forest. Despite its name, this value is passed as
        ``n_estimators``; it does not limit the number of leaves per tree.
    train_x_sent, train_y_sent
        Training features and training target used to fit the model.
    val_x_sent, val_y_sent
        Validation features and the true validation target used for scoring.

    Returns
    -------
    The mean absolute error between the true validation target and the
    model's predictions for ``val_x_sent``.
    '''
    # Use the estimator imported at the top of the notebook instead of reaching
    # through the `sk.ensemble` attribute; random_state=0 keeps runs reproducible.
    model = RandomForestRegressor(n_estimators = leaf_nodes,random_state = 0)
    model.fit(train_x_sent,train_y_sent)
    pred = model.predict(val_x_sent)
    # scikit-learn's documented argument order is (y_true, y_pred).
    mae_cal = mean_absolute_error(val_y_sent,pred)
    return mae_cal

In [27]:
# Names of the feature columns that contain at least one missing value.
has_missing = X.isnull().any()
col_miss_values = X.columns[has_missing.values].tolist()
col_miss_values

['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']

In [28]:
# Discard every feature column that has missing values (no imputation here).
X_full_features = X.drop(columns=col_miss_values)

In [29]:
# Pick the columns used in this section:
#   * numeric columns (int64 / float64), and
#   * categorical columns (object dtype) with fewer than 10 distinct values.
# One-hot encoding adds one column per category, so higher-cardinality
# categorical columns are left out to keep the dataset small.
num_cols = []
cols_categ_encoding = []
for cname in X_full_features.columns:
    column = X_full_features[cname]
    if column.dtype in ['int64','float64']:
        num_cols.append(cname)
    if column.dtype == 'object' and column.nunique() < 10:
        cols_categ_encoding.append(cname)

# Numeric columns first, then the selected categorical ones.
new_cols = num_cols + cols_categ_encoding

# Copy so that later edits do not touch X_full_features.
X_new = X_full_features[new_cols].copy()


In [30]:
# The low-cardinality categorical columns that will be one-hot encoded in this section
cols_categ_encoding

['Type', 'Method', 'Regionname']

In [31]:
# Boolean mask over X_new's columns: True where the column has object dtype.
categ = X_new.dtypes == 'object'
# Keep only the labels flagged True; these are the columns to one-hot encode.
list_object_for_encoding = categ.index[categ.values]

In [32]:
# Display the column labels selected for one-hot encoding.
list_object_for_encoding

Index(['Type', 'Method', 'Regionname'], dtype='object')

In [33]:
# Work on an independent copy so that X_new itself is left unchanged.
X_OH_new = X_new.copy(deep=True)

In [34]:
# Apply one-hot encoder to each column with categorical data

# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4. Try the new keyword first
# and fall back to the old one on installations that do not know it yet.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

X_OH_encoding = pd.DataFrame(OH_encoder.fit_transform(X_OH_new[list_object_for_encoding]))

# Name every indicator column "<column>_<category>". The default integer labels
# (0, 1, 2, ...) would be mixed with the string names of the numeric features once
# the frames are concatenated, and recent scikit-learn versions refuse to fit on a
# frame whose column names are not all strings.
X_OH_encoding.columns = [
    '{}_{}'.format(col, category)
    for col, categories in zip(list_object_for_encoding, OH_encoder.categories_)
    for category in categories
]

In [35]:
# The frame built from the encoder output was created without an index argument,
# so it does not carry X_OH_new's row labels. Copy them over so the one-hot
# columns line up row by row with the rest of X_OH_new.
X_OH_encoding.index = X_OH_new.index


In [36]:
# Keep only the numeric part of the table; the categorical columns are
# replaced by their one-hot encoded counterparts.
X_num_cols = X_OH_new.drop(columns=list_object_for_encoding)

# Place the one-hot columns to the right of the numeric features.
frames_to_join = [X_num_cols, X_OH_encoding]
X_OH_final = pd.concat(frames_to_join, axis = 'columns')


In [37]:
# The one-hot encoded features are ready to split:
# hold out 20% of the rows for validation, with a fixed seed for reproducibility.
split_parts = train_test_split(
    X_OH_final,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)
train_x, val_x, train_y, val_y = split_parts


In [38]:
# Score approach 3 (one-hot encoding); 100 is passed as getMAE's first argument.
mae_approach3 = getMAE(
    leaf_nodes=100,
    train_x_sent=train_x,
    val_x_sent=val_x,
    train_y_sent=train_y,
    val_y_sent=val_y,
)
print('Categorical Solution ... (Aproach 3) - One Hot Encoding \n' , mae_approach3)

Categorical Solution ... (Aproach 3) - One Hot Encoding 
 166089.4893009678


### =====================================================================