# Exploring and Denoising Data

**Notice: This notebook is a modification of [sniff.ipynb](https://mlbook.explained.ai/notebooks/index.html) by Terence Parr and Jeremy Howard. It is being used by permission of the author.**

Please use this notebook to follow along with the lectures this week. Feel free to play around with any of the settings. 

There may be minor differences between the notebook seen in the lecture videos and this one. 

## 1. Building an Initial Model

#### Take a Quick Peek at the Data

In [None]:
import pandas as pd

In [None]:
# Read the rental listings from the CSV that sits next to this notebook
rent = pd.read_csv('rent.csv')

# Peek at the first three rows
rent.head(n=3)

In [None]:
# Same three rows, transposed so each column of `rent` becomes a row
rent.head(n=3).transpose()

In [None]:
# Print pandas' summary of the `rent` frame (columns, dtypes, non-null counts)
rent.info()

#### Select Only the Numeric Columns

In [None]:
# Columns kept for the first model: four features plus the `price` target
numeric_columns = ['bathrooms', 'bedrooms', 'longitude', 'latitude', 'price']
rent_num = rent[numeric_columns]

rent_num.head(n=3)

In [None]:
# Per column: True if the column contains at least one missing value
rent_num.isna().any()

#### Create and Evaluate a Model

In [None]:
# Features are every selected column except the target; the target is `price`
X = rent_num.drop(columns='price')
y = rent_num['price']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np

In [None]:
# Hold out 20% of the rows for validation (the split is random on every run)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# 100-tree forest using all cores; oob_score=True makes `rf.oob_score_` available
rf = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    oob_score=True,
)
rf.fit(X_train, y_train)

# Score on the same rows the forest was trained on
rf.score(X_train, y_train)

In [None]:
# Out-of-bag score of the fitted forest (populated because oob_score=True was passed)
rf.oob_score_

In [None]:
# Score of the fitted forest on the held-out validation split
rf.score(X_val, y_val)

In [None]:
# Baseline "model": predict the mean training price for every validation row
mean_train_price = np.mean(y_train)
avg_preds = [mean_train_price for _row in range(len(y_val))]

print(mean_train_price)

# R^2 of the constant prediction against the validation targets
r2_score(y_val, avg_preds)

In [None]:
# Repeat the split / fit / evaluate cycle 10 times. Each run draws a fresh random
# 80/20 split and trains a fresh forest, so the collected lists show how much the
# metrics move from run to run.
train_r2 = []
train_mae = []
val_r2 = []
val_mae = []
oob_scores = []

for i in range(10):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
    rf.fit(X_train, y_train)
    train_preds = rf.predict(X_train)
    val_preds = rf.predict(X_val)
    # R^2 and OOB are kept to 4 decimals, matching the loop used on the denoised
    # data: rounding to 2 decimals before averaging throws away precision, and an
    # unrounded OOB score prints at full float length next to the rounded metrics.
    train_r2.append(round(r2_score(y_train, train_preds), 4))
    val_r2.append(round(r2_score(y_val, val_preds), 4))
    # MAE is in price units, so whole numbers are enough
    train_mae.append(round(mean_absolute_error(y_train, train_preds), 0))
    val_mae.append(round(mean_absolute_error(y_val, val_preds), 0))
    oob_scores.append(round(rf.oob_score_, 4))

In [None]:
# Per-run metrics from the 10 repetitions, one labelled list per metric,
# separated by blank lines
score_report = [
    ("Train r2 scores: \n", train_r2),
    ("Validation r2 scores: \n", val_r2),
    ("Train MAE scores: \n", train_mae),
    ("Validation MAE scores: \n", val_mae),
    ("Out-of-bag scores: \n", oob_scores),
]

for position, (heading, scores) in enumerate(score_report):
    if position > 0:
        print("")
    print(heading, scores)

In [None]:
# Average of each metric across the 10 runs
mean_report = [
    ("Mean train r2: ", train_r2),
    ("Mean validation r2: ", val_r2),
    ("Mean train MAE: ", train_mae),
    ("Mean validation MAE: ", val_mae),
    ("Mean oob score: ", oob_scores),
]

for heading, scores in mean_report:
    print(heading, np.mean(scores))

## 2. Denoise the Data

In [None]:
# Summary statistics for every column of `rent_num`; a first look for implausible values
rent_num.describe()

In [None]:
# The ten highest prices, shown as a one-column frame
rent_num['price'].sort_values(ascending=False).head(10).to_frame()

In [None]:
# How many listings have each distinct `bathrooms` value
rent_num['bathrooms'].value_counts().to_frame()

In [None]:
# How many listings have each distinct `bedrooms` value
rent_num['bedrooms'].value_counts().to_frame()

#### Noise

In [None]:
# Rows where both coordinates are exactly 0.0
rent_num[(rent_num[['longitude', 'latitude']] == 0.0).all(axis=1)]

In [None]:
# Rows whose longitude is below -80
rent_num[rent_num['longitude'].lt(-80)]

#### Inconsistencies

In [None]:
# Listings inside a small latitude/longitude window that all have 2 bedrooms and
# 1 bathroom, so their prices can be compared side by side
in_lat_band = rent_num['latitude'].gt(40.728) & rent_num['latitude'].lt(40.733)
in_lon_band = rent_num['longitude'].ge(-74.011) & rent_num['longitude'].le(-74.007)
is_2bed_1bath = rent_num['bedrooms'].eq(2) & rent_num['bathrooms'].eq(1)

rent_num[in_lat_band & in_lon_band & is_2bed_1bath]

#### Cleaning up the price

In [None]:
# Keep only listings priced strictly between 1000 and 10000
price_in_range = rent_num['price'].gt(1000) & rent_num['price'].lt(10000)
rent_clean = rent_num[price_in_range]

In [None]:
import matplotlib.pyplot as plt

# Distribution of prices after the price filter, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(rent_clean['price'], bins=45)
ax.set_xlabel('Clipped Price')
ax.set_ylabel('Num Apts at that price')

#### Cleaning up the location

In [None]:
# Drop rows whose longitude AND latitude are both 0; a row with only one zero is kept
at_origin = (rent_clean['longitude'] == 0) & (rent_clean['latitude'] == 0)
rent_clean = rent_clean[~at_origin]

In [None]:
# Keep only listings strictly inside the latitude/longitude box below
lat_in_box = rent_clean['latitude'].gt(40.55) & rent_clean['latitude'].lt(40.94)
lon_in_box = rent_clean['longitude'].gt(-74.1) & rent_clean['longitude'].lt(-73.67)
rent_clean = rent_clean[lat_in_box & lon_in_box]

#### Check Impact on Number of Samples

In [None]:
# (rows, columns) before any cleaning
rent_num.shape

In [None]:
# (rows, columns) after the price and location filters
rent_clean.shape

In [None]:
# Fraction of the original rows that the cleaning steps removed
(len(rent_num) - len(rent_clean)) / len(rent_num)

#### Train Model on Denoised Data

In [None]:
# Features and target taken from the denoised frame
X_clean = rent_clean.drop(columns='price')
y_clean = rent_clean['price']

In [None]:
# Random 80/20 split of the denoised data
X_clean_train, X_clean_val, y_clean_train, y_clean_val = train_test_split(
    X_clean, y_clean, test_size=0.2
)

# Same forest settings as the model trained on the raw data
rf_clean = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    oob_score=True,
)
rf_clean.fit(X_clean_train, y_clean_train)

# Score on the rows the forest was trained on
rf_clean.score(X_clean_train, y_clean_train)

In [None]:
# Out-of-bag score of the forest trained on the denoised data
rf_clean.oob_score_

In [None]:
# Score of the denoised-data forest on its held-out validation split
rf_clean.score(X_clean_val, y_clean_val)

In [None]:
# Ten independent split / fit / evaluate runs on the denoised data.
# The metric lists are reset here, so they replace the raw-data results.
train_r2, train_mae = [], []
val_r2, val_mae = [], []
oob_scores = []

for run in range(10):
    X_clean_train, X_clean_val, y_clean_train, y_clean_val = train_test_split(
        X_clean, y_clean, test_size=0.2
    )

    rf_clean = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
    rf_clean.fit(X_clean_train, y_clean_train)

    train_preds = rf_clean.predict(X_clean_train)
    val_preds = rf_clean.predict(X_clean_val)

    # R^2 and OOB to 4 decimals, MAE to whole numbers
    run_train_r2 = r2_score(y_clean_train, train_preds)
    run_val_r2 = r2_score(y_clean_val, val_preds)
    run_train_mae = mean_absolute_error(y_clean_train, train_preds)
    run_val_mae = mean_absolute_error(y_clean_val, val_preds)

    train_r2.append(round(run_train_r2, 4))
    val_r2.append(round(run_val_r2, 4))
    train_mae.append(round(run_train_mae, 0))
    val_mae.append(round(run_val_mae, 0))
    oob_scores.append(round(rf_clean.oob_score_, 4))


In [None]:
# Per-run metrics from the 10 repetitions on the denoised data.
# The out-of-bag scores are collected in the loop but were never reported;
# they are printed here so this report matches the one for the raw data.
print("After cleaning: \n")
print("Train r2 scores: \n", train_r2)
print("")
print("Validation r2 scores: \n", val_r2)
print("")
print("Train MAE scores: \n", train_mae)
print("")
print("Validation MAE scores: \n", val_mae)
print("")
print("Out-of-bag scores: \n", oob_scores)

In [None]:
# Average of each metric across the 10 runs on the denoised data.
# The mean OOB score is included so the summary lines up with the raw-data summary.
print("Mean train r2: ", round(np.mean(train_r2), 4))
print("Mean validation r2: ", round(np.mean(val_r2), 4))
print("Mean train MAE: ", round(np.mean(train_mae), 0))
print("Mean validation MAE: ", round(np.mean(val_mae), 0))
print("Mean oob score: ", round(np.mean(oob_scores), 4))

## 3. Data Transformations

In [None]:
# Sort the raw prices once (highest first), then take both ends of the ordering
prices_desc = rent_num['price'].sort_values(ascending=False)

# reset_index(drop=True) renumbers 0..9 so the two columns line up row by row
top_ten_orig = prices_desc.head(10).reset_index(drop=True)
bottom_ten_orig = prices_desc.tail(10).reset_index(drop=True)

pd.DataFrame({'Top 10 prices': top_ten_orig, 'Lowest 10 prices': bottom_ten_orig})

The ratio of the highest to lowest price:

In [None]:
# Ratio of the highest to the lowest raw price, computed from the data instead of
# from hand-copied constants (originally 4490000 / 43) so it stays correct if
# rent.csv changes. float() keeps the displayed value a plain Python float.
float(rent_num['price'].max() / rent_num['price'].min())

In [None]:
# Sort the raw prices once (highest first), then take the natural log of each end
prices_desc = rent_num['price'].sort_values(ascending=False)

# reset_index(drop=True) renumbers 0..9 so the two columns line up row by row
top_ten_log = np.log(prices_desc.head(10).reset_index(drop=True))
bottom_ten_log = np.log(prices_desc.tail(10).reset_index(drop=True))

pd.DataFrame({'Top 10 log(prices)': top_ten_log, 'Lowest 10 log(prices)': bottom_ten_log})

The ratio of the highest to lowest `log(price)`:

In [None]:
# Ratio of the highest to the lowest log(price), computed from the data instead of
# from hand-copied constants (originally 15.317363 / 3.7612).
# Assumes every price is positive so that the log is finite.
float(np.log(rent_num['price'].max()) / np.log(rent_num['price'].min()))

In [None]:
# Histogram of the raw, unfiltered prices. The x-axis was labelled
# 'Clipped Price', but `rent_num` has not been clipped, so it is labelled 'Price'.
plt.hist(rent_num['price'], bins=45)
plt.xlabel('Price')
plt.ylabel('Num Apts at that price')

In [None]:
# Histogram of prices after cleaning, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.hist(rent_clean['price'], bins=45)
ax.set_xlabel('Clipped Price')
ax.set_ylabel('Num Apts at that price')

In [None]:
# Histogram of log(price) on the raw data. The x-axis was labelled
# 'Clipped Price', but the values plotted are log-transformed and unclipped.
plt.hist(np.log(rent_num['price']), bins=45)
plt.xlabel('log(Price)')
plt.ylabel('Num Apts at that log(price)')

#### Train Model using `log(price)`

Using the dataset before cleaning (`rent_num`), build and evaluate a model using *log(price)* as the target instead of *price*. Calculate the average $R^2$ on the validation data over 10 runs. Alternatively, calculate the average *out-of-bag score* over the same 10 runs.