In [9]:
# ONLY FOR GOOGLE COLAB
# Clones the private data repository into the Colab working directory and
# changes into it, so the SQLite files read further down are found by their
# relative names.
from getpass import getpass
import os

#os.environ['USER'] = input('Enter the username of your Github account: ')
# The secret is prompted for without echo and handed to the shell commands
# below through environment variables.
# NOTE(review): GitHub no longer accepts account passwords for git over
# HTTPS; a personal access token is presumably what has to be entered here.
os.environ['PASSWORD'] = getpass('Enter the password of your Github account: ')
#os.environ['REPOSITORY'] = input('Enter the name of the Github repository: ')

# Account and repository are fixed; this overwrites whatever USER held before.
os.environ['USER'] = 'DimiDR'
os.environ['REPOSITORY'] = 'RentPredData'


# "<user>:<secret>" pair that is spliced into the clone URL below.
os.environ['GITHUB_AUTH'] = os.environ['USER'] + ':' + os.environ['PASSWORD']

!rm -rf $REPOSITORY # To remove the previous clone of the Github repository
# NOTE(review): the credentials are part of the clone URL; git normally
# stores that URL as the remote of the new clone (.git/config) -- confirm and
# strip it if the checkout is ever shared.
!git clone https://$GITHUB_AUTH@github.com/$USER/$REPOSITORY.git

# Blank out all four variables (they are set to "", not removed) so the
# secret does not stay in the kernel's environment. USER is left empty too.
os.environ['USER'] = os.environ['PASSWORD'] = os.environ['REPOSITORY'] = os.environ['GITHUB_AUTH'] = ""
! ls
# The directory name is hard-coded and must match REPOSITORY set above.
%cd RentPredData
! ls

In [26]:
#XGBoost
# https://stackabuse.com/gradient-boosting-classifiers-in-python-with-scikit-learn/
# https://blogs.sas.com/content/subconsciousmusings/2017/04/12/machine-learning-algorithm-use/
import pandas as pd
import sqlite3
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.externals import joblib
from sklearn.preprocessing import OneHotEncoder

# Data Selection from DB

In [5]:
# Load the complete `immoscout` table of the rent and of the buy database
# into one DataFrame each.
# Every connection is closed in a `finally` block so that it is released
# even when the query raises (for example because the table is missing).
cnx_rent = sqlite3.connect('real-estate-rent.db')
try:
    df_rent = pd.read_sql_query("SELECT * FROM immoscout", cnx_rent)
finally:
    cnx_rent.close()

cnx_buy = sqlite3.connect('real-estate-buy.db')
try:
    df_buy = pd.read_sql_query("SELECT * FROM immoscout", cnx_buy)
finally:
    cnx_buy.close()

# Quick sanity check of what was loaded: (rows, columns) per data set.
print('Rent Shape: ', df_rent.shape)
print('Buy Shape: ', df_buy.shape)


Rent Shape:  (90637, 37)
Buy Shape:  (109, 37)


# Methods

In [34]:
def encode_data_for_training(df):
    """Build the training matrix ``X`` and the target ``y`` from raw listings.

    Steps:
      1. The string flags ``balcony``, ``cellar``, ``garden`` and ``lift``
         become 1 where the value is exactly the string ``"true"`` and 0
         otherwise.
      2. ``city`` and ``quarter`` are one-hot encoded. Both fitted encoders
         are written to ``encoder_city.joblib`` / ``encoder_quarter.joblib``
         in the current working directory so prediction can reuse them.
      3. The keyword indicator columns from ``bag_of_words`` are appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``title``, ``city``, ``quarter``, ``balcony``,
        ``cellar``, ``garden``, ``lift``, ``livingSpace``, ``numberOfRooms``
        and ``value``. The frame passed in is not modified.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the feature DataFrame and ``y`` is a
        one-column DataFrame holding ``value``.
    """
    # Work on a private copy: the caller's frame (often a slice of a larger
    # frame) must not be altered, and assigning into a slice triggers
    # pandas' SettingWithCopyWarning.
    df = df.copy()

    # "true" -> 1, anything else -> 0
    for flag in ('balcony', 'cellar', 'garden', 'lift'):
        df[flag] = (df[flag] == "true").astype(int)

    # NOTE(review): `get_feature_names` was superseded by
    # `get_feature_names_out` in newer scikit-learn releases; it is kept
    # because this notebook imports joblib from `sklearn.externals`, which
    # implies an older release -- revisit both when upgrading.

    # one hot city encoding
    enc_city = OneHotEncoder(handle_unknown='ignore')
    enc_city.fit(df[['city']])
    # index=df.index keeps the encoded rows aligned with `df` in the concat
    # below even when `df` does not carry a default 0..n-1 index.
    df_city_enc = pd.DataFrame(
        data=enc_city.transform(df[['city']]).toarray(),
        columns=enc_city.get_feature_names(['city']),
        index=df.index,
        dtype=bool,
    ) * 1
    joblib.dump(enc_city, 'encoder_city.joblib')

    # one hot quarter encoding -- fitted on the frame that was passed in
    # (previously this read the notebook-global `df2` by accident).
    enc_quarter = OneHotEncoder(handle_unknown='ignore')
    enc_quarter.fit(df[['quarter']])
    df_quarter_enc = pd.DataFrame(
        data=enc_quarter.transform(df[['quarter']]).toarray(),
        columns=enc_quarter.get_feature_names(['quarter']),
        index=df.index,
        dtype=bool,
    ) * 1
    joblib.dump(enc_quarter, 'encoder_quarter.joblib')

    # concatenate training data; the column order here is what the model is
    # trained on, so predict_data has to build its matrix the same way.
    X = pd.concat(
        (
            df[['balcony', 'cellar', 'garden', 'lift', 'livingSpace', 'numberOfRooms']],
            df_city_enc,
            df_quarter_enc,
        ),
        axis=1,
    )
    y = df[["value"]]
    X = bag_of_words(X, df)
    return (X, y)

def predict_data(df, model, enc_city, enc_quarter):
    """Predict ``value`` for the listings in ``df`` with a trained model.

    The feature matrix is assembled with the same steps and the same column
    order as in ``encode_data_for_training``: 0/1 flags, the two numeric
    columns, one-hot ``city`` and ``quarter`` columns (from the already
    fitted encoders), then the ``bag_of_words`` keyword columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``title``, ``city``, ``quarter``, ``balcony``,
        ``cellar``, ``garden``, ``lift``, ``livingSpace`` and
        ``numberOfRooms``. The frame passed in is not modified.
    model
        Fitted estimator; only its ``predict(X)`` method is used.
    enc_city, enc_quarter
        Fitted one-hot encoders for ``city`` and ``quarter``; their
        ``transform`` and ``get_feature_names`` methods are used.

    Returns
    -------
    Whatever ``model.predict`` returns for the assembled feature matrix.
    """
    # Private copy: do not alter the caller's frame and avoid pandas'
    # SettingWithCopyWarning when a slice is passed in.
    df = df.copy()

    # "true" -> 1, anything else -> 0
    for flag in ('balcony', 'cellar', 'garden', 'lift'):
        df[flag] = (df[flag] == "true").astype(int)

    # index=df.index keeps the encoded rows aligned with `df` in the concat
    # below even when `df` does not carry a default 0..n-1 index.
    hot_city = pd.DataFrame(
        data=enc_city.transform(df[['city']]).toarray(),
        columns=enc_city.get_feature_names(['city']),
        index=df.index,
        dtype=bool,
    ) * 1
    hot_quarter = pd.DataFrame(
        data=enc_quarter.transform(df[['quarter']]).toarray(),
        columns=enc_quarter.get_feature_names(['quarter']),
        index=df.index,
        dtype=bool,
    ) * 1

    # data for prediction (the target column is not needed here)
    X = pd.concat(
        (
            df[['balcony', 'cellar', 'garden', 'lift', 'livingSpace', 'numberOfRooms']],
            hot_city,
            hot_quarter,
        ),
        axis=1,
    )
    X = bag_of_words(X, df)
    return model.predict(X)
    
def bag_of_words(X, df):
    """Add one 0/1 keyword column per vocabulary word to ``X``.

    For every word of the fixed vocabulary a column named after the word is
    written into ``X``: 1 where the lower-cased ``title`` of the matching row
    in ``df`` contains the word as a substring, 0 otherwise.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; it is modified in place and also returned.
    df : pandas.DataFrame
        Frame with a ``title`` column. Rows are matched to ``X`` by index
        label. Titles that are not strings (``None``/NaN) count as empty
        text instead of raising.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object with the keyword columns added.
    """
    # NOTE(review): "terasse" does not occur in the usual spelling
    # "Terrasse", and "dach" already matches every "dachgeschoss" title. The
    # list is left untouched because the column names are model features.
    vocabulary = [ "uni", "modern", "dach", "loft", "pool", "wg", "altbau", "luxu", "terasse",
                  "neubau", "maisonet", "penthouse", "erstbezug", "kamin", "langzeit", "renoviert", "dachgeschoss"]

    # Lower-case every title once instead of once per (word, row) pair.
    titles = df["title"].map(lambda text: text.lower() if isinstance(text, str) else "")
    same_index = titles.index.equals(X.index)

    for word in vocabulary:
        needle = word.lower()
        flags = titles.map(lambda text, needle=needle: needle in text).astype("int64")
        if same_index:
            # Usual case: X was built from df, so rows already line up.
            X[word] = flags.to_numpy()
        else:
            # Align by label; rows of X without a title in df get 0.
            X[word] = flags.reindex(X.index, fill_value=0)
    return X

# Rent Analysis
## Data Cleaning

In [7]:
%%capture
df2 = df_rent[["title", "city", "quarter", "balcony", "cellar",
         "garden", "lift", "livingSpace", "numberOfRooms", "value"]]
X, y = encode_data_for_training(df2)

## Training

In [9]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
#XGBBoosting -------------------
xgb_reg = XGBRegressor(objective ='reg:squarederror')
xgb_reg.fit(X_train, y_train)

Wall time: 19min 20s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [10]:
# Score of the model trained on the 80 % split, evaluated on the held-out
# 20 % (for a regressor, `score` is the coefficient of determination R^2).
score = xgb_reg.score(X_test, y_test)
print(score)

0.7209947784436077


## Save Model

In [11]:
# Persist the model trained on the 80 % split to 'model_1.sav' in the
# current working directory.
joblib.dump(xgb_reg, 'model_1.sav')

## Max Learning

Train on 100% of the data (no hold-out test set).

In [12]:
%%time
# Second regressor with the same settings, fitted on the complete data set
# (X, y). No rows are held out, so this model has no unseen data left to be
# evaluated on.
xgb_reg_max = XGBRegressor(objective ='reg:squarederror')
xgb_reg_max.fit(X, y)

Wall time: 24min 38s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

## Save Max Model

In [14]:
# Persist the model trained on all data to 'model_1_max.sav' in the current
# working directory.
joblib.dump(xgb_reg_max, 'model_1_max.sav')

In [40]:
# NOTE(review): X and y are exactly the data xgb_reg_max was fitted on, so
# this is a training-set score. It says nothing about performance on unseen
# listings and must not be compared with the held-out score of xgb_reg.
# This also overwrites the `score` variable of the earlier evaluation cell.
score = xgb_reg_max.score(X, y)
print(score)

0.9997453215629898


# Buy Analysis
## Load Model

In [15]:
# Restore the model that was trained on all data from disk.
# joblib files are pickle-based: only load files from a trusted source.
filename = 'model_1_max.sav'
loaded_model = joblib.load(filename)
# Bare last expression so the notebook displays the restored estimator.
loaded_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

## Predict Data

In [37]:
%%capture
df2 = df_buy[["title", "city", "quarter", "balcony", "cellar",
         "garden", "lift", "livingSpace", "numberOfRooms", "value"]]

encoder_city = joblib.load('encoder_city.joblib')
encoder_quarter = joblib.load('encoder_quarter.joblib')

y_pred = predict_data(df2, xgb_reg, encoder_city, encoder_quarter)

In [39]:
# Sanity check: number of predictions returned for the buy listings.
y_pred.shape

(109,)

# Save to CSV

In [44]:
# NOTE(review): df_output is the same object as df_buy (no copy is made), so
# the 'prediction' column added below also appears on df_buy.
df_output = df_buy
df_output['prediction'] = y_pred
# NOTE(review): the file is written as semicolon-separated text with decimal
# commas and a UTF-8 BOM, although its name ends in ".xls" -- it is a CSV
# file, not an Excel workbook.
df_output.to_csv("data_predict.xls", sep=';', decimal=",", encoding='utf-8-sig')