# Task_1

In [1]:
# import libraries

import numpy as np
import pandas as pd

In [2]:
# Import Dataset from Sklearn

from sklearn.datasets import load_boston

In [3]:
# Load the Boston housing data bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on scikit-learn < 1.2.
boston_ds = load_boston()

In [4]:
# Print the dataset's bundled description (attributes, instance count, target).
# print() on the whole string emits the same text as splitting on '\n' and
# printing line by line, without leaving a stray loop variable in the kernel.

print(boston_ds.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [5]:
# Feature frame X: house specification data from the Boston dataset,
# one column per entry in the dataset's feature_names.
data, feature_names = boston_ds.data, boston_ds.feature_names

X = pd.DataFrame(data=data, columns=feature_names)

In [6]:
# Preview the first 10 rows of the feature frame X.

X.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1


In [7]:
# Target frame Y: the prices the models will try to predict,
# stored in a single column named 'price'.
target = boston_ds.target

Y = pd.DataFrame(data=target, columns=['price'])

In [8]:
# Preview the first 10 rows of the target frame Y.

Y.head(10)

Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
5,28.7
6,22.9
7,27.1
8,16.5
9,18.9


### Split DataFrames X & Y
####         into
#### (X_train, y_train) - for training
####         and
#### (X_test, y_test) - for testing

In [9]:
# Using train_test_split

from sklearn.model_selection import train_test_split

In [10]:
# test_size=0.30 holds out 30% of the rows for testing; the rest is used for training.
# random_state=42 seeds the shuffle so the same split is reproduced on every run.

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

### Create Model of Linear regression

In [11]:
# Import the linear regression estimator

from sklearn.linear_model import LinearRegression

# Instantiate the model; no arguments are passed, so library defaults apply.
lr = LinearRegression()

### Train the model

In [12]:
# fit() trains the model on the training split.
# y_train is passed as a one-column DataFrame (column 'price').

lr.fit(X_train, y_train)

LinearRegression()

#### Store results in var y_forecast

In [13]:
# Predict prices for the held-out test features.
y_forecast = lr.predict(X_test)

### Comparing test with results  

In [14]:
# Side-by-side frame of actual ('test') and predicted ('predict') prices,
# indexed like y_test so each row can be traced back to the original data.
check_test = (
    y_test[['price']]
    .rename(columns={'price': 'test'})
    .assign(predict=y_forecast.flatten())
)

#### Show results

In [15]:
# head(10) limits the display to the first 10 rows.

check_test.head(10)

Unnamed: 0,test,predict
173,23.6,28.64896
274,32.4,36.495014
491,13.6,15.411193
72,22.8,25.403213
452,16.1,18.85528
76,20.0,23.146689
316,17.8,17.392124
140,14.0,14.078599
471,19.6,23.036927
500,16.8,20.599433


### Calc R2 Score

In [16]:

# Rule of thumb: an R2 score above 0.6 is a good result; the closer to 1, the better.


# import R2 score
from sklearn.metrics import r2_score

In [17]:
# R2 of the linear model's predictions against the held-out test targets.
r2_score(y_test, y_forecast) 

0.7112260057484943

# Task_2

### Create model with RFR

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
import matplotlib.pyplot as plt

In [20]:
# Same call and arguments as the earlier split (test_size=0.30, random_state=42),
# so it should reproduce the same train/test partitions.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

In [21]:
# Random forest with 1000 estimators, max depth 12, and a fixed random_state for repeatable results.
regressor = RandomForestRegressor(n_estimators=1000, max_depth=12, random_state=42)

In [22]:
# Train the forest; the target is passed as the 1-D array of the 'price' column.
regressor.fit(X_train, y_train['price'].values)

RandomForestRegressor(max_depth=12, n_estimators=1000, random_state=42)

In [23]:
# Predict prices for the held-out test features with the random forest.
y_result = regressor.predict(X_test)

In [24]:
# R2 of the random-forest predictions against the same held-out test targets.
r2_score(y_test, y_result) 

0.87472606157312