## This notebook summarizes notes on machine learning models:
## how to deal with missing values in a dataset

### =====================================================================

### By : Mohammed Agoor

Git : https://github.com/AGOOR97

#### ============================= Coded By AGOOR ===================================

In [1]:
import pandas as pd

# importing the total Library and import its methods below to know what we use
import sklearn as sk   
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer


In [2]:
# Load the housing table; the relative path is resolved against the kernel's working directory.
house = pd.read_csv('house_dataset.csv')

In [3]:
# To make a simple model:
# take only a few numerical features and assign them to X,
# take Price as the target and assign its values to y,
# and drop all rows that contain NaN values.

### Simple Model


In [4]:
# Baseline cleaning: discard every row that has a NaN in any column.
house_dropped = house.dropna(axis="index")

In [5]:
# Numeric predictors for the baseline model (names spelled as in the dataset).
features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

X = house_dropped.loc[:, features]
y = house_dropped['Price']

In [6]:
# 80/20 train/validation split; random_state pins the shuffle so reruns match.
train_x, val_x, train_y, val_y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [7]:
## Helper that fits a random forest of a requested size and returns its validation MAE
def getMAE(leaf_nodes,train_x_sent,val_x_sent,train_y_sent,val_y_sent):
    '''Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    leaf_nodes : int
        Forwarded to ``RandomForestRegressor`` as ``n_estimators``, i.e. the
        number of trees in the forest. Despite its name it does not limit the
        number of leaves; the name is kept so existing calls keep working.
    train_x_sent, train_y_sent
        Features and target used to fit the model.
    val_x_sent, val_y_sent
        Features and target used to score the fitted model.

    Returns
    -------
    float
        Mean absolute error of the predictions on the validation split.
    '''
    # random_state is fixed so repeated calls with the same data give the same score.
    model = RandomForestRegressor(n_estimators=leaf_nodes, random_state=0)
    model.fit(train_x_sent, train_y_sent)
    pred = model.predict(val_x_sent)
    # scikit-learn's signature is (y_true, y_pred); MAE is symmetric, so the
    # value is unchanged, but the documented order keeps the call unambiguous.
    return mean_absolute_error(val_y_sent, pred)

In [8]:
# Baseline score: rows with NaNs dropped, five hand-picked features, 100 passed to getMAE.
# Note that old_model holds the MAE value, not a fitted model.
old_model = getMAE(
    leaf_nodes=100,
    train_x_sent=train_x,
    val_x_sent=val_x,
    train_y_sent=train_y,
    val_y_sent=val_y,
)

print('MAE under old Calculations : \n', old_model)

MAE under old Calculations : 
 202281.60010023043


### =====================================================================

## New Models: Dealing with Missing Values

### Firstly : Missing Values

### Your Options are :
#### 1- drop the columns that contain missing values (the simple model above dropped rows instead)
#### 2- impute these columns 

#### Firstly : I will handle missing values by dropping the columns that contain nulls

In [9]:
# Target: the Price column of the full table (no rows are removed in this section).
y = house['Price']

# Predictors: everything except the target ...
features_drop_Price = house.drop(columns=['Price'])

# ... restricted to non-object columns, because this section works with
# numerical features only and leaves the categorical (object) ones out.
X_features = features_drop_Price.select_dtypes(exclude=['object'])

In [10]:
# Split the numeric feature table 80/20 into training and validation sets,
# with the same fixed random_state as the baseline split.
train_x, val_x, train_y, val_y = train_test_split(
    X_features,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


In [11]:
# Among the remaining numeric columns, list those holding at least one NaN in
# the training split; these are the columns approach 1 removes.
cols_miss_values = train_x.columns[train_x.isnull().any().to_numpy()].tolist()

In [12]:
# Report the numeric columns that approach 1 is about to remove.
print('cols with missing values in it , I will drop them \n',cols_miss_values)

cols with missing values in it , I will drop them 
 ['Car', 'BuildingArea', 'YearBuilt']


In [13]:
# Final data for approach 1: the numeric columns minus those with missing values.
# The same column list is removed from both splits so their schemas stay identical.
train_x_dropped = train_x.drop(columns=cols_miss_values)
val_x_dropped = val_x.drop(columns=cols_miss_values)

In [14]:
# Approach 1 score: categorical columns and NaN-containing numeric columns removed.
# NOTE(review): this call passes 150 to getMAE, while the baseline passes 100 and
# the imputation run passes 10, so the three MAEs are not a like-for-like comparison.
mae_drop_categ_missNum = getMAE(
    leaf_nodes=150,
    train_x_sent=train_x_dropped,
    val_x_sent=val_x_dropped,
    train_y_sent=train_y,
    val_y_sent=val_y,
)
print(
    'MAE after dropping categorical Cols and Numerical which have missing Values (Approach 1):\n',
    mae_drop_categ_missNum,
)

MAE after dropping categorical Cols and Numerical which have missing Values (Approach 1):
 175733.2014838971


### =====================================================================

#### Secondly : I will handle missing values by imputation, filling each gap with the column average (mean)

In [15]:

# Mean imputation: each missing entry is replaced by its column's average.
# "mean" is SimpleImputer's default strategy, spelled out here for the reader.
my_emputer = SimpleImputer(strategy="mean")

In [16]:
# Preview the training features before imputation (NaNs still present).
train_x.head(n=5)

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
12167,1,5.0,3182.0,1.0,1.0,1.0,0.0,,1940.0,-37.85984,144.9867,13240.0
6524,2,8.0,3016.0,2.0,2.0,1.0,193.0,,,-37.858,144.9005,6380.0
8413,3,12.6,3020.0,3.0,1.0,1.0,555.0,,,-37.7988,144.822,3755.0
2919,3,13.0,3046.0,3.0,1.0,1.0,265.0,,1995.0,-37.7083,144.9158,8870.0
6043,3,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [17]:
# Fit the imputer on the training split only, then apply the same learned fill
# values to the validation split.
impute_train_x = pd.DataFrame(data=my_emputer.fit_transform(train_x))
impute_val_x = pd.DataFrame(data=my_emputer.transform(val_x))


# Preview after imputation: as the displayed output shows, the rebuilt frame has
# a fresh 0..n-1 index and integer column labels instead of the original ones.
impute_train_x.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [18]:
# Restore the original column names on both imputed frames; rebuilding the
# frames from the imputer output had replaced them with integer labels.

impute_train_x.columns = train_x.columns
impute_val_x.columns = val_x.columns


# Preview the imputed training data again, now with readable column names.
impute_train_x.head()

Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,1.0,5.0,3182.0,1.0,1.0,1.0,0.0,153.764119,1940.0,-37.85984,144.9867,13240.0
1,2.0,8.0,3016.0,2.0,2.0,1.0,193.0,153.764119,1964.839866,-37.858,144.9005,6380.0
2,3.0,12.6,3020.0,3.0,1.0,1.0,555.0,153.764119,1964.839866,-37.7988,144.822,3755.0
3,3.0,13.0,3046.0,3.0,1.0,1.0,265.0,153.764119,1995.0,-37.7083,144.9158,8870.0
4,3.0,13.3,3020.0,3.0,1.0,2.0,673.0,673.0,1970.0,-37.7623,144.8272,4217.0


In [19]:
# Approach 2 score: categorical columns were excluded earlier and the numeric
# NaNs are now filled by the imputer.
# NOTE(review): this call passes 10 to getMAE, while the baseline passes 100 and
# the column-dropping run passes 150, so the three MAEs are not a like-for-like comparison.
mae_imputation = getMAE(
    leaf_nodes=10,
    train_x_sent=impute_train_x,
    val_x_sent=impute_val_x,
    train_y_sent=train_y,
    val_y_sent=val_y,
)

print('MAE usign imputation (Approach 2) : \n', mae_imputation)

MAE usign imputation (Approach 2) : 
 178166.46269899711


####  ==========================================================================================