In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics
import json
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import GridSearchCV


def load_from_disk(name):
    try:
        with open(name, 'r') as f:
            tree = json.load(f)
            return tree
    except:
        return None

In [2]:
lbl2word = load_from_disk("lbl2word.json")
word2lbl = load_from_disk("word2lbl.json")
symbol2name = load_from_disk("symbol2name.json")
name2symbol = load_from_disk("name2symbol.json")

In [3]:
expcheb = pd.read_csv("expcheb.csv")

In [4]:
expcheb.head()

Unnamed: 0,sName,sSymbol,eligble voters,voters,droped,valid,1,2,3,4,...,Arabs,Founding year,Current type of locality,Organizational affiliation,Coordinates,height,Planning Commission,Police space,year,Cluster Local Authorities
0,Tirosh,10,333,237,2,235,3,0,0,0,...,0.0,1955,310,2.0,1892863000.0,130.0,151.0,15000597.0,2018,0.0
1,Mevasseret Ziyyon,1015,18871,13883,37,13846,81,2,0,2,...,98.0,1951,160,0.0,2144363000.0,583.0,152.0,10002475.0,2018,0.0
2,Me'ir Shefeya,102,106,54,0,54,0,0,0,0,...,38.0,1923,340,0.0,1975772000.0,63.0,303.0,10004333.0,2018,0.0
3,Or Aqiva,1020,15520,9773,72,9701,119,0,0,0,...,93.0,1951,170,0.0,1927371000.0,10.0,353.0,10004261.0,2018,0.0
4,Haruzim,1024,584,442,3,439,3,0,0,0,...,0.0,1951,350,0.0,1874468000.0,39.0,401.0,15000060.0,2018,0.0


In [5]:
expcheb2 = pd.read_csv("expcheb2.csv")

In [6]:
expcheb3 = pd.read_csv("expcheb3.csv")

In [7]:
data = expcheb.append(expcheb2, ignore_index=True)

In [8]:
del expcheb2
del expcheb

In [9]:
def drop_cols(data):
    cols = ["sName", "voters", "valid", "droped", "subdistrict", "Natural area", "Metropolitan affiliation", "Founding year", "Organizational affiliation", "Coordinates", "height", "Planning Commission", "Police space", "year", "Cluster Local Authorities"]
    for col in cols:
        data = data.drop(col, axis=1)
        
    return data

In [10]:
train_label = data["droped"]
data = drop_cols(data)
test_label = expcheb3["droped"]
expcheb3 = drop_cols(expcheb3)

In [11]:
RSEED = 50

In [12]:
adb = AdaBoostRegressor()
adb_param_grid = {'n_estimators':[50,100,150,200,250], #Number of weak learners to train iteratively., 
                'learning_rate':[0.001, 0.01, 0.1, 1], #It contributes to the weights of weak learners. It uses 1 as a default value.,
                'random_state': [1]}

gsADB = GridSearchCV(adb,param_grid = adb_param_grid, cv=5, n_jobs= -1)

gsADB.fit(data,train_label)

ADB_best = gsADB.best_estimator_
    
print("Best Parameters:\n", gsADB.best_params_)
    
adb = AdaBoostRegressor(ADB_best)
adb.fit(data, train_label)
print(adb)

Best Parameters:
 {'learning_rate': 0.1, 'n_estimators': 250, 'random_state': 1}
AdaBoostRegressor(base_estimator=AdaBoostRegressor(base_estimator=None,
                                                   learning_rate=0.1,
                                                   loss='linear',
                                                   n_estimators=250,
                                                   random_state=1),
                  learning_rate=1.0, loss='linear', n_estimators=50,
                  random_state=None)


In [13]:
train_rf_predictions = adb.predict(data)
# train_rf_probs = best_model.predict_proba(train)[:, 1]

rf_predictions = adb.predict(expcheb3)
metrics.mean_squared_error(rf_predictions, test_label)

# rf_probs = best_model.predict_proba(test)[:, 1]

606.5875873933468

In [14]:
df = pd.DataFrame({"Predictions":rf_predictions})
names = [symbol2name[str(value)] for value in expcheb3["sSymbol"]]
df.insert(0,"sName", names)
df = df.sort_values('Predictions', ascending = False)
df.head(10)

Unnamed: 0,sName,Predictions
493,Jerusalem,1840.0
196,Bene Beraq,1238.913924
405,Haifa,773.08
987,Petah Tiqwa,772.346667
1187,Tel Aviv - Yafo,771.737705
114,Ashdod,767.64
129,Be'er Sheva,763.842975
859,Netanya,757.626506
1056,Rishon LeZiyyon,733.464286
392,Holon,554.789474
