## Introduction To SciKit-Learn 
This notebook will walk you through some of the useful functions of Scikit-Learn Library.

What Are you going to Learn:

    1.An end to end  sklearn workflow
    2.getting the data ready
    3.choose the right estimator/algorithm for our problem
    4.Fit the model/algorithm and use it to make prediction on our data
    5.Evaluating a model
    6.Improve a model
    7.Save and load a trained model
    8.Putting it all together

## 0. And End-to-End Scikit-Learn workflow

In [1]:
#1. Get the data ready
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
heart_disease = pd.read_csv('43 - heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
# Create X (Feature Matrix)
X = heart_disease.drop('target',axis=1)

#create Y(labels)
Y = heart_disease['target']


#Chose the right model and hyperparameter

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# We will keep the defult hyperparameters

clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [3]:
# 3. Fit the model to the tranning data 

from sklearn.model_selection import train_test_split

x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size=0.2)

clf.fit(x_train,y_train);

In [4]:
# Make Prediciton
x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
57,45,1,0,115,260,0,0,185,0,0.0,2,0,2
78,52,1,1,128,205,1,1,184,0,0.0,2,0,2
288,57,1,0,110,335,0,1,143,1,3.0,1,1,3
107,45,0,0,138,236,0,0,152,1,0.2,1,0,2
138,57,1,0,110,201,0,1,126,1,1.5,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,66,0,0,178,228,1,1,165,1,1.0,1,2,3
23,61,1,2,150,243,1,1,137,1,1.0,1,0,2
200,44,1,0,110,197,0,0,177,0,0.0,2,1,2
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3


In [5]:
y_predc = clf.predict(x_test)
y_predc

array([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1], dtype=int64)

In [6]:
y_test

57     1
78     1
288    0
107    1
138    1
      ..
260    0
23     1
200    0
246    0
43     1
Name: target, Length: 61, dtype: int64

In [7]:
# 4.Evaluate The model on the tranning data and testing data
clf.score(x_train,y_train)

1.0

In [8]:
clf.score(x_test,y_test)

0.8524590163934426

In [9]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test,y_predc))

              precision    recall  f1-score   support

           0       0.84      0.81      0.82        26
           1       0.86      0.89      0.87        35

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85      0.85      0.85        61



In [10]:
confusion_matrix(y_test, y_predc)

array([[21,  5],
       [ 4, 31]], dtype=int64)

In [11]:
accuracy_score(y_test,y_predc)

0.8524590163934426

In [12]:
# 5. Improve a model 
# Try Different amount of n_estimators
np.random.seed(42)
for i in range(10,100,10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train,y_train)
    print(f"Model accuracy on test set:{clf.score(x_test,y_test)*100:.2f}%")
    print("")

Trying model with 10 estimators...
Model accuracy on test set:83.61%

Trying model with 20 estimators...
Model accuracy on test set:83.61%

Trying model with 30 estimators...
Model accuracy on test set:83.61%

Trying model with 40 estimators...
Model accuracy on test set:86.89%

Trying model with 50 estimators...
Model accuracy on test set:85.25%

Trying model with 60 estimators...
Model accuracy on test set:85.25%

Trying model with 70 estimators...
Model accuracy on test set:86.89%

Trying model with 80 estimators...
Model accuracy on test set:81.97%

Trying model with 90 estimators...
Model accuracy on test set:83.61%



In [13]:
 # 6. Save a model and load it
import pickle

pickle.dump(clf,open("random_forest_model_1.pkl","wb"))

In [14]:
loaded_model = pickle.load(open("random_forest_model_1.pkl",'rb'))
loaded_model.score(x_test,y_test)

0.8360655737704918

## 1. Getting our Data ready to be used with machine learning 

    Three main things we have to do:
        1. Split the data into features and labels (usually'x' and 'y')
        2. Filling (also called imputing) or disregarding missing values
        3. Converting non-numerical values to numerical values (also callled feature encoding)

In [15]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [16]:
x = heart_disease.drop('target',axis=1)
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [17]:
y = heart_disease['target']
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [18]:
# Split the data into tranning and test sets
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(x,y, test_size = 0.2)

In [19]:
X_train.shape, X_test.shape,Y_train.shape,Y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [20]:
x.shape

(303, 13)

In [21]:
len(heart_disease)

303

## 1.1 Make Sure Its all numericals

In [22]:
car_sales = pd.read_csv('car-sales-extended.csv')
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [23]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [24]:
#Split the data into x/y 

x= car_sales.drop('Price', axis=1)
y= car_sales['Price']

# Split into tranning and testing datasets
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)


In [25]:
 # Build Machine learing model 
from sklearn.ensemble import RandomForestRegressor 

model = RandomForestRegressor()
model.fit(x_train,y_train)
model.score(x_test,y_test)

ValueError: could not convert string to float: 'Toyota'

In [26]:
# Trun the data into numerical data
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_feature = ['Make','Colour','Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                one_hot,
                                categorical_feature)],
                               remainder = 'passthrough')
transformed_x = transformer.fit_transform(x)
transformed_x


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [27]:
pd.DataFrame(transformed_x)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [28]:
dummies = pd.get_dummies(car_sales[['Make','Colour','Doors']])
dummies

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,0,1,0,0,0,0,0,0,1
1,5,1,0,0,0,0,1,0,0,0
2,4,0,1,0,0,0,0,0,0,1
3,4,0,0,0,1,0,0,0,0,1
4,3,0,0,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,4,0,0,0,1,1,0,0,0,0
996,3,0,0,1,0,0,0,0,0,1
997,4,0,0,1,0,0,1,0,0,0
998,4,0,1,0,0,0,0,0,0,1


In [29]:
# Let's refit the model
np.random.seed(42)
x_train,x_test,y_train,y_test = train_test_split(transformed_x,y,test_size=0.2)
model.fit(x_train,y_train)

In [30]:
model.score(x_test,y_test)

0.3235867221569877

## 2. What if There was missing values ?
    1.Fill them with some values(also known as imputation)
    2.Remove the sample with the missing data altogether
    

In [31]:
car_sales_missing = pd.read_csv('car-sales-extended-missing-data.csv')
car_sales_missing.head(20)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
5,Honda,Red,42652.0,4.0,23883.0
6,Toyota,Blue,163453.0,4.0,8473.0
7,Honda,White,,4.0,20306.0
8,,White,130538.0,4.0,9374.0
9,Honda,Blue,51029.0,4.0,26683.0


In [32]:
car_sales_missing.isnull().sum()


Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

## !! Fill missing data  with pandas

In [33]:
# Fill the Make Column 
car_sales_missing['Make'].fillna("missing",inplace=True)
# Fill the "Colour" Column
car_sales_missing['Colour'].fillna("missing",inplace=True)
# Fill the "Odometer (KM)" column
car_sales_missing['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(),inplace= True)
#Fill the 'Doors' Column
car_sales_missing['Doors'].fillna(4,inplace=True)

In [34]:
# Remove 'price' rows with missing values
car_sales_missing.dropna(inplace=True)

In [35]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [36]:
# Split our datta into  x and y 
x = car_sales_missing.drop('Price',axis=1)
y = car_sales_missing['Price']

In [37]:
# lets try and convert our data to numbers
#Turn the categories into numbers

categorical_feature = ['Make','Colour','Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                one_hot,
                                categorical_feature)],
                               remainder = 'passthrough')
transformed_x = transformer.fit_transform(car_sales_missing)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [38]:
np.random.seed(42)
x_train,x_test,y_train,y_test = train_test_split(transformed_x,y,test_size=0.2)
model.fit(x_train,y_train)

In [39]:
model.score(x_test,y_test)

0.9998421058539825

## 2.2 Choosing an estimator for a classification problem

In [41]:
heart_disease = pd.read_csv('43 - heart-disease.csv')
heart_disease.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [44]:
 len(heart_disease)

303

Consulting the map and it says to try LinearSVC.

In [50]:
# Importing the LinearSVC estimator
from sklearn.svm import LinearSVC

# Setup random seed
np.random.seed(42)

# Make the data 
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']

# Split the data 

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)


# Instantiate LinearSVC 
clf = LinearSVC()
clf.fit(x_train,y_train)

# Evaluate the LinearSVC 

clf.score(x_test,y_test)



0.8688524590163934

In [47]:
heart_disease['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

# Tidbit:
1. If you have structured data, used ensemble methods
2. If you have unstructured data, use deep learing or transfer learning
    

In [49]:
# Importing the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make the data 
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']

# Split the data 

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)


# Instantiate RandomForestClassifier 
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train,y_train)

# Evaluate the RandomForestClassifier 

clf.score(x_test,y_test)

0.8524590163934426

## 3. Fit the model/algorithm on our data and use it to make predictions 

### 3.1 Fitting the model to the data 

Different names for:
* 'X' = features, features variables, data
* 'Y' = labels, targets, target variables

In [None]:
# Importing the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make the data 
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']

# Split the data 

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)


# Instantiate RandomForestClassifier(tranning  the machine learing model)
clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train,y_train)

# Evaluate the RandomForestClassifier 

clf.score(x_test,y_test)

In [54]:
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [57]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [61]:
y_train

132    1
202    0
196    0
75     1
176    0
      ..
188    0
71     1
106    1
270    0
102    1
Name: target, Length: 242, dtype: int64

In [59]:
x_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
179,57,1,0,150,276,0,0,112,1,0.6,1,1,1
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3
111,57,1,2,150,126,1,1,173,0,0.2,2,1,3
246,56,0,0,134,409,0,0,150,1,1.9,1,2,3
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,69,1,2,140,254,0,0,146,0,2.0,1,3,3
104,50,1,2,129,196,0,1,163,0,0.0,2,0,2
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
193,60,1,0,145,282,0,0,142,1,2.8,1,2,3


In [60]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [62]:
y.tail()

298    0
299    0
300    0
301    0
302    0
Name: target, dtype: int64

## 3.2 Make predictions using a machine learning

2 ways to make predicitons
* predict()
* predict_proba()

In [63]:
# Use a trained model to make predictions

clf.predict(x_test)

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0], dtype=int64)

In [65]:
np.array(y_test)

array([0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0], dtype=int64)

In [69]:
# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(x_test)
np.mean(y_preds == y_test)

0.8688524590163934

In [70]:
clf.score(x_test,y_test)

0.8688524590163934

In [72]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_preds)

0.8688524590163934

## Make predictions with predict_proba()
|This Method did't worked in newer versions


In [75]:
# Make predictions with predict_proba()
clf.predict_proba(x_test[:10])

AttributeError: 'LinearSVC' object has no attribute 'predict_proba'

## NEW: Making Predictions With Our Model (Regression)

In [79]:
housing_df = pd.read_csv('ParisHousing.csv')
housing_df.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081.5
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989.5
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642.1
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561.2
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052.0


In [81]:
housing_df.dtypes

squareMeters           int64
numberOfRooms          int64
hasYard                int64
hasPool                int64
floors                 int64
cityCode               int64
cityPartRange          int64
numPrevOwners          int64
made                   int64
isNewBuilt             int64
hasStormProtector      int64
basement               int64
attic                  int64
garage                 int64
hasStorageRoom         int64
hasGuestRoom           int64
price                float64
dtype: object

In [83]:
housing_df['price']

0       7559081.5
1       8085989.5
2       5574642.1
3       3232561.2
4       7055052.0
          ...    
9995     176425.9
9996    4448474.0
9997    8390030.5
9998    5905107.0
9999     146708.4
Name: price, Length: 10000, dtype: float64

In [85]:
housing_df['price'] = housing_df['price'].replace('[\$\,\.]', '', regex=True).astype(int)
housing_df.head()

Unnamed: 0,squareMeters,numberOfRooms,hasYard,hasPool,floors,cityCode,cityPartRange,numPrevOwners,made,isNewBuilt,hasStormProtector,basement,attic,garage,hasStorageRoom,hasGuestRoom,price
0,75523,3,0,1,63,9373,3,8,2005,0,1,4313,9005,956,0,7,7559081
1,80771,39,1,1,98,39381,8,6,2015,1,0,3653,2436,128,1,2,8085989
2,55712,58,0,1,19,34457,6,8,2021,0,0,2937,8852,135,1,9,5574642
3,32316,47,0,0,6,27939,10,4,2012,0,1,659,7141,359,0,3,3232561
4,70429,19,1,1,90,38045,3,7,1990,1,0,8435,2429,292,1,4,7055052


In [86]:
housing_df.dtypes

squareMeters         int64
numberOfRooms        int64
hasYard              int64
hasPool              int64
floors               int64
cityCode             int64
cityPartRange        int64
numPrevOwners        int64
made                 int64
isNewBuilt           int64
hasStormProtector    int64
basement             int64
attic                int64
garage               int64
hasStorageRoom       int64
hasGuestRoom         int64
price                int32
dtype: object

In [104]:
np.random.seed(42)

#Getting the data ready

x = housing_df.drop('price',axis=1)
y = housing_df['price']

# Spliting the data into train and test data sets 

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)


# Create the model instance
model = RandomForestRegressor(n_estimators=10)

#Fitting the model/ Tranning the model

model.fit(x_train,y_train)


# Make predications

y_preds = model.predict(x_test)



In [105]:
y_preds[:10]

array([7956587.4, 3711579.3, 8552108.6, 6424205.3, 6458068.5,  221783.3,
       1619523.3, 8635410.7, 6197247.5, 3298226.6])

In [107]:
# Compare the predicated values to the truth 
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_preds)

3368.263850000008

In [106]:
model.score(x_test, y_test)

0.9999979427177005