In [1]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

import pandas as pd
import json

def load_from_disk(name):
    try:
        with open(name, 'r') as f:
            tree = json.load(f)
            return tree
    except:
        return None

In [2]:
lbl2word = load_from_disk("lbl2word.json")
word2lbl = load_from_disk("word2lbl.json")
symbol2name = load_from_disk("symbol2name.json")
name2symbol = load_from_disk("name2symbol.json")

In [3]:
expcheb = pd.read_csv("expcheb.csv")

In [4]:
expcheb.head()

Unnamed: 0,sName,sSymbol,eligble voters,voters,droped,valid,1,2,3,4,...,Arabs,Founding year,Current type of locality,Organizational affiliation,Coordinates,height,Planning Commission,Police space,year,Cluster Local Authorities
0,Tirosh,10,333,237,2,235,3,0,0,0,...,0.0,1955,310,2.0,1892863000.0,130.0,151.0,15000597.0,2018,0.0
1,Mevasseret Ziyyon,1015,18871,13883,37,13846,81,2,0,2,...,98.0,1951,160,0.0,2144363000.0,583.0,152.0,10002475.0,2018,0.0
2,Me'ir Shefeya,102,106,54,0,54,0,0,0,0,...,38.0,1923,340,0.0,1975772000.0,63.0,303.0,10004333.0,2018,0.0
3,Or Aqiva,1020,15520,9773,72,9701,119,0,0,0,...,93.0,1951,170,0.0,1927371000.0,10.0,353.0,10004261.0,2018,0.0
4,Haruzim,1024,584,442,3,439,3,0,0,0,...,0.0,1951,350,0.0,1874468000.0,39.0,401.0,15000060.0,2018,0.0


In [5]:
expcheb2 = pd.read_csv("expcheb2.csv")

In [6]:
expcheb3 = pd.read_csv("expcheb3.csv")

In [7]:
data = expcheb.append(expcheb2, ignore_index=True)

In [8]:
del expcheb2
del expcheb

In [9]:
def drop_cols(data):
    cols = ["sName", "voters", "valid", "eligble voters", "droped", "subdistrict", "Natural area", "Metropolitan affiliation", "Founding year", "Organizational affiliation", "Coordinates", "height", "Planning Commission", "Police space", "year", "Cluster Local Authorities"]
    for col in cols:
        data = data.drop(col, axis=1)
        
    return data

In [10]:
data = drop_cols(data)
expcheb3 = drop_cols(expcheb3)

In [11]:
data.head()

Unnamed: 0,sSymbol,1,2,3,4,5,6,7,8,9,...,59,60,district,Municipal status,religion,Total Population 2018,Jews and others,Thereof: Jews,Arabs,Current type of locality
0,10,3,0,0,0,0,0,0,0,0,...,0,1,1,26.0,1.0,501.0,501.0,500.0,0.0,310
1,1015,81,2,0,2,0,0,5,3,3,...,0,299,1,99.0,1.0,23962.0,23864.0,23114.0,98.0,160
2,102,0,0,0,0,0,0,0,0,0,...,0,2,3,15.0,1.0,286.0,248.0,187.0,38.0,340
3,1020,119,0,0,0,0,0,0,4,11,...,0,339,3,0.0,1.0,18236.0,18143.0,16787.0,93.0,170
4,1024,3,0,0,0,0,0,0,0,0,...,0,9,4,19.0,1.0,866.0,866.0,859.0,0.0,350


In [12]:
def classify(data, label):
    adb = AdaBoostRegressor()
    adb_param_grid = {'n_estimators':[50,100,150,200,250], #Number of weak learners to train iteratively., 
                  'learning_rate':[0.001, 0.01, 0.1, 1], #It contributes to the weights of weak learners. It uses 1 as a default value.,
                  'random_state': [1]}

    gsADB = GridSearchCV(adb,param_grid = adb_param_grid, cv=5, n_jobs= -1)

    gsADB.fit(data,label)

    ADB_best = gsADB.best_estimator_
    
    print("Best Parameters:\n", gsADB.best_params_)
    
    adb = AdaBoostRegressor(ADB_best)
    adb.fit(data, label)
    print(adb)
    return adb

In [13]:
selected_patrties = [word2lbl["מחל"], word2lbl["אמת"], word2lbl["ודעם"]  ]
training_labels =  [data[str(lbl)] for lbl in selected_patrties]

In [14]:
training_labels[0]

0        124
1       4940
2         15
3       5107
4        131
        ... 
2419    5785
2420    7385
2421    8626
2422    6954
2423    1404
Name: 45, Length: 2424, dtype: int64

In [15]:
def drop_party_cols(data, cols):
    for col in cols:
        data = data.drop(str(col), axis=1)
            
    return data

In [16]:
# data = drop_party_cols(data, selected_patrties)

In [17]:
data.shape

(2424, 69)

In [18]:
selected_patrties == 5

False

In [19]:
models = []
for party_num, lbl in zip(selected_patrties, training_labels):
    loc = data.columns.get_loc(str(party_num))
    data = data.drop(str(party_num), axis=1)
    print(f"classifying party {lbl2word[str(party_num)]}")
    models.append(classify(data, lbl))
    data.insert(loc, str(party_num), lbl)

classifying party מחל
Best Parameters:
 {'learning_rate': 0.01, 'n_estimators': 50, 'random_state': 1}
AdaBoostRegressor(base_estimator=AdaBoostRegressor(base_estimator=None,
                                                   learning_rate=0.01,
                                                   loss='linear',
                                                   n_estimators=50,
                                                   random_state=1),
                  learning_rate=1.0, loss='linear', n_estimators=50,
                  random_state=None)
classifying party אמת
Best Parameters:
 {'learning_rate': 0.1, 'n_estimators': 150, 'random_state': 1}
AdaBoostRegressor(base_estimator=AdaBoostRegressor(base_estimator=None,
                                                   learning_rate=0.1,
                                                   loss='linear',
                                                   n_estimators=150,
                                                   random_state=1)

In [20]:
data.head()

Unnamed: 0,sSymbol,1,2,3,4,5,6,7,8,9,...,59,60,district,Municipal status,religion,Total Population 2018,Jews and others,Thereof: Jews,Arabs,Current type of locality
0,10,3,0,0,0,0,0,0,0,0,...,0,1,1,26.0,1.0,501.0,501.0,500.0,0.0,310
1,1015,81,2,0,2,0,0,5,3,3,...,0,299,1,99.0,1.0,23962.0,23864.0,23114.0,98.0,160
2,102,0,0,0,0,0,0,0,0,0,...,0,2,3,15.0,1.0,286.0,248.0,187.0,38.0,340
3,1020,119,0,0,0,0,0,0,4,11,...,0,339,3,0.0,1.0,18236.0,18143.0,16787.0,93.0,170
4,1024,3,0,0,0,0,0,0,0,0,...,0,9,4,19.0,1.0,866.0,866.0,859.0,0.0,350


In [21]:
for party_num, model in zip(selected_patrties, models):
    print(f"feature importances for  {lbl2word[str(party_num)]}")
    print(model.feature_importances_)

feature importances for  מחל
[3.15356143e-03 1.08029488e-03 6.65290803e-05 0.00000000e+00
 4.16255760e-01 1.92979360e-05 7.61920516e-07 1.37890966e-03
 2.31455244e-05 3.08554292e-05 1.36837081e-04 9.26881234e-06
 2.40530751e-04 8.75911594e-03 5.47807367e-07 2.23997683e-04
 1.47667117e-05 6.31703556e-03 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 3.15499117e-04
 3.26341697e-04 8.52008180e-04 0.00000000e+00 5.47849722e-05
 6.02045014e-02 0.00000000e+00 7.89593050e-05 1.89165620e-05
 2.87684365e-06 6.70873468e-06 6.17583038e-03 0.00000000e+00
 7.20470178e-02 0.00000000e+00 1.24991018e-05 2.89726628e-06
 9.83348922e-07 8.52206023e-05 2.18120254e-04 0.00000000e+00
 1.28778217e-04 6.92180162e-03 1.60172687e-02 0.00000000e+00
 7.53916294e-04 1.53982114e-05 3.60092048e-06 1.22544003e-06
 8.20915606e-04 0.00000000e+00 0.00000000e+00 1.49176044e-04
 3.79231893e-04 1.44939928e-06 0.00000000e+00 3.43918704e-03
 1.89511272e-03 0.00000000e+00 3.81443452e-05 1.04528668

In [22]:
test_labels = [expcheb3[str(lbl)] for lbl in selected_patrties]
# expcheb3 = drop_party_cols(expcheb3)

In [23]:
expcheb3.head()

Unnamed: 0,sSymbol,1,2,3,4,5,6,7,8,9,...,59,60,district,Municipal status,religion,Total Population 2018,Jews and others,Thereof: Jews,Arabs,Current type of locality
0,967,0,0,0,2,0,1,0,1,0,...,0,0,6,0.0,3.0,0.0,0.0,0.0,0.0,460
1,472,4,0,0,0,0,0,0,0,0,...,0,0,1,99.0,2.0,7543.0,97.0,79.0,7446.0,280
2,473,6,0,0,0,0,2,0,2,0,...,0,0,2,99.0,2.0,13915.0,28.0,11.0,13887.0,270
3,958,0,0,0,0,0,0,0,0,0,...,0,0,6,0.0,3.0,0.0,0.0,0.0,0.0,460
4,968,0,0,0,0,0,0,0,1,0,...,0,0,6,0.0,3.0,0.0,0.0,0.0,0.0,460


In [24]:
predictions = {}
for party_num, lbl, model in zip(selected_patrties, test_labels, models):
    loc = expcheb3.columns.get_loc(str(party_num))
    expcheb3 = expcheb3.drop(str(party_num), axis=1)
    y_pred = model.predict(expcheb3)
    predictions[lbl2word[str(party_num)]] = y_pred
    print(f"error rate: {metrics.mean_squared_error(lbl, y_pred)}")
    expcheb3.insert(loc, str(party_num), lbl)

error rate: 1145608.8411293395
error rate: 443986.53841359034
error rate: 1044606.454613275


In [25]:
expcheb3.head()

Unnamed: 0,sSymbol,1,2,3,4,5,6,7,8,9,...,59,60,district,Municipal status,religion,Total Population 2018,Jews and others,Thereof: Jews,Arabs,Current type of locality
0,967,0,0,0,2,0,1,0,1,0,...,0,0,6,0.0,3.0,0.0,0.0,0.0,0.0,460
1,472,4,0,0,0,0,0,0,0,0,...,0,0,1,99.0,2.0,7543.0,97.0,79.0,7446.0,280
2,473,6,0,0,0,0,2,0,2,0,...,0,0,2,99.0,2.0,13915.0,28.0,11.0,13887.0,270
3,958,0,0,0,0,0,0,0,0,0,...,0,0,6,0.0,3.0,0.0,0.0,0.0,0.0,460
4,968,0,0,0,0,0,0,0,1,0,...,0,0,6,0.0,3.0,0.0,0.0,0.0,0.0,460


In [26]:
cities = ["Daliyat Al-Karmel", "Karmi'el", "Bene Beraq", "Jerusalem", "Sakhnin"]

In [27]:
df = pd.DataFrame(predictions)
names = [symbol2name[str(value)] for value in expcheb3["sSymbol"]]
df.insert(0,"sName", names)
df.loc[df["sName"].isin(cities)]

Unnamed: 0,sName,מחל,אמת,ודעם
196,Bene Beraq,5293.833333,1394.803653,326.159938
321,Daliyat Al-Karmel,610.85206,1834.411688,1804.55597
493,Jerusalem,62204.707071,17905.666667,7234.156342
606,Karmi'el,8538.911392,1486.813953,544.831338
868,Sakhnin,222.877056,731.811663,14020.660494
