# Rental Prices

## Package Import and Configuration

In [0]:
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

## Data Import and Preprocessing

In [0]:
data = pd.read_csv('https://raw.githubusercontent.com/saschaschworm/dsb/master/Data%20Sets/Demos%20and%20Exercises/rental_prices/rental_prices.csv', sep=',')

### Data Preview

In [3]:
data.head()

Unnamed: 0,living_space,rental_price
0,40.38,551.73
1,40.4,528.99
2,41.42,529.86
3,41.74,530.01
4,42.37,533.01


### Data Preprocessing

There is not data preprocessing step in this example. The only thing we do here is to transform the features to a matrix $X$ and the target variable to a vector $y$.

In [0]:
X, y = data[['living_space']].values, data['rental_price'].values

## Modelling

### Training with Linear Regression with Stochastic Gradient Descent

In [5]:
# Set Seed for "Deterministic Randomness".
np.random.seed(1909)

# Initialize the Linear Regression Model.
linear_model = SGDRegressor(max_iter=1000, eta0=0.0001)

# Perform the Learning.
linear_model.fit(X, y)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.0001,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

### Prediction

In [0]:
y_pred = linear_model.predict([[44]])

### Result

In [7]:
print(f'Rental Price Prediction for a 44sqm Appartment: {y_pred[0]:.2f}$')
print(f'Value for Parameter Theta 0: {linear_model.intercept_[0]:.2f}')
print(f'Value for Parameter Theta 1: {linear_model.coef_[0]:.2f}')

Rental Price Prediction for a 44sqm Appartment: 481.46$
Value for Parameter Theta 0: 0.90
Value for Parameter Theta 1: 10.92


## Evaluation

### Holdout Method

#### Training on Training Set with Linear Regression with Stochastic Gradient Descent

In [8]:
# Set Seed for "Deterministic Randomness".
np.random.seed(1909)

# Train-Test-Split with 75% for Training and 25% for Testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Perform the Learning
linear_model.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.0001,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=1000, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

#### Prediction on Test Set

In [0]:
# Perform the Prediction on the Test-Set-Features.
y_pred = linear_model.predict(X_test)

#### Result

In [10]:
# Calculating the Root Mean Squared Error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'RMSE: {rmse:.2f}')

RMSE: 36.49


### 10-Fold-Cross-Validation

#### Algorithm

In [0]:
# Set seed for "deterministic randomness".
np.random.seed(1909)

# Initialize 10-Fold-Cross-Validation.
k_fold = KFold(n_splits=10)

# Empty lists for persisting the performances measures calculated in each iteration. 
rmses = []

for train_idx, test_idx in k_fold.split(X):
  # Split dataset into a train and test set.
  X_train, X_test = X[train_idx], X[test_idx]
  y_train, y_test = y[train_idx], y[test_idx]
  
  # Run the logistic regression on the current training set.
  linear_model.fit(X_train, y_train)
  
  # Perform the Prediction on the Test-Set-Features.
  y_pred = linear_model.predict(X_test)
  
  # Calculate the performance measures on the current test set.
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  
  # Append to performances measures to lists.
  rmses.append(rmse)

#### Result

In [12]:
print(f'Average RMSE: {np.mean(rmses):.2f}')

Average RMSE: 44.90
