In [1]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

import pandas as pd
import json

def load_from_disk(name):
    try:
        with open(name, 'r') as f:
            tree = json.load(f)
            return tree
    except:
        return None

In [2]:
lbl2word = load_from_disk("lbl2word.json")
word2lbl = load_from_disk("word2lbl.json")
symbol2name = load_from_disk("symbol2name.json")
name2symbol = load_from_disk("name2symbol.json")

In [3]:
expcheb = pd.read_csv("expcheb.csv")

In [4]:
expcheb.head()

Unnamed: 0,sName,sSymbol,eligble voters,voters,droped,valid,2,3,4,5,...,Arabs,Founding year,Current type of locality,Organizational affiliation,Coordinates,height,Planning Commission,Police space,year,Cluster Local Authorities\n
0,Tirosh,10,333,237,2,235,0,0,0,10,...,0.0,1955,310,2.0,1892863000.0,130.0,151.0,15000597.0,2018,0.0
1,Mevasseret Ziyyon,1015,18871,13883,37,13846,0,0,3,409,...,98.0,1951,160,0.0,2144363000.0,583.0,152.0,10002475.0,2018,0.0
2,Me'ir Shefeya,102,106,54,0,54,0,0,0,0,...,38.0,1923,340,0.0,1975772000.0,63.0,303.0,10004333.0,2018,0.0
3,Or Aqiva,1020,15520,9773,72,9701,0,0,6,153,...,93.0,1951,170,0.0,1927371000.0,10.0,353.0,10004261.0,2018,0.0
4,Haruzim,1024,584,442,3,439,0,0,3,1,...,0.0,1951,350,0.0,1874468000.0,39.0,401.0,15000060.0,2018,0.0


In [5]:
expcheb2 = pd.read_csv("expcheb2.csv")

In [6]:
expcheb3 = pd.read_csv("expcheb3.csv")

In [7]:
data = expcheb.append(expcheb2, ignore_index=True)

In [8]:
del expcheb2
del expcheb

In [9]:
def drop_cols(data):
    data = data.drop("sName", axis=1).drop("voters", axis=1).drop("valid", axis=1).drop("eligble voters", axis=1).drop("droped", axis=1).drop("subdistrict", axis=1).drop("Natural area", axis=1).drop("Metropolitan affiliation", axis=1).drop("Founding year", axis=1).drop("Organizational affiliation", axis=1).drop("Coordinates", axis=1).drop("height", axis=1).drop("Planning Commission", axis=1).drop("Police space", axis=1).drop("year", axis=1)
    return data

In [10]:
data = drop_cols(data)
expcheb3 = drop_cols(expcheb3)

In [11]:
data.head()

Unnamed: 0,sSymbol,2,3,4,5,6,7,8,9,10,...,61,district,Municipal status,religion,Total Population 2018,Jews and others,Thereof: Jews,Arabs,Organizational affiliation,Cluster Local Authorities\n
0,10,0,0,0,10,0,0,0,0,0,...,0,1,26.0,1.0,501.0,501.0,500.0,0.0,2.0,0.0
1,1015,0,0,3,409,110,3,5,0,0,...,0,1,99.0,1.0,23962.0,23864.0,23114.0,98.0,0.0,0.0
2,102,0,0,0,0,2,0,0,0,0,...,0,3,15.0,1.0,286.0,248.0,187.0,38.0,0.0,0.0
3,1020,0,0,6,153,891,11,0,0,0,...,0,3,0.0,1.0,18236.0,18143.0,16787.0,93.0,0.0,0.0
4,1024,0,0,3,1,1,0,0,0,0,...,0,4,19.0,1.0,866.0,866.0,859.0,0.0,0.0,0.0


In [12]:
def classify(data, label):
    adb = AdaBoostRegressor()
    adb_param_grid = {'n_estimators':[50,100,150,200,250], #Number of weak learners to train iteratively., 
                  'learning_rate':[0.001, 0.01, 0.1, 1], #It contributes to the weights of weak learners. It uses 1 as a default value.,
                  'random_state': [1]}

    gsADB = GridSearchCV(adb,param_grid = adb_param_grid, cv=5, n_jobs= -1)

    gsADB.fit(data,label)

    ADB_best = gsADB.best_estimator_
    
    print("Best Parameters:\n", gsADB.best_params_)
    
    adb = AdaBoostRegressor(ADB_best)
    adb.fit(data, label)
    print(adb)
    return adb

In [13]:
selected_patrties = [word2lbl["מחל"], word2lbl["אמת"], word2lbl["ודעם"]  ]
training_labels =  [data[str(lbl)] for lbl in selected_patrties]

In [14]:
training_labels[0]

0        124
1       4940
2         15
3       5107
4        131
        ... 
2419    5785
2420    7385
2421    8626
2422    6954
2423    1404
Name: 36, Length: 2424, dtype: int64

In [15]:
def drop_party_cols(data):
    for col in data.columns:
        if col.isnumeric():
            data = data.drop(col, axis=1)
            
    return data

In [16]:
data = drop_party_cols(data)

In [17]:
data.shape

(2424, 10)

In [18]:
models = []
for party_num, lbl in zip(selected_patrties, training_labels):
    print(f"classifying party {lbl2word[str(party_num)]}")
    models.append(classify(data, lbl))

classifying party מחל
Best Parameters:
 {'learning_rate': 0.1, 'n_estimators': 250, 'random_state': 1}
AdaBoostRegressor(base_estimator=AdaBoostRegressor(base_estimator=None,
                                                   learning_rate=0.1,
                                                   loss='linear',
                                                   n_estimators=250,
                                                   random_state=1),
                  learning_rate=1.0, loss='linear', n_estimators=50,
                  random_state=None)
classifying party אמת
Best Parameters:
 {'learning_rate': 0.1, 'n_estimators': 50, 'random_state': 1}
AdaBoostRegressor(base_estimator=AdaBoostRegressor(base_estimator=None,
                                                   learning_rate=0.1,
                                                   loss='linear',
                                                   n_estimators=50,
                                                   random_state=1),


In [19]:
for party_num, model in zip(selected_patrties, models):
    print(f"feature importances for  {lbl2word[str(party_num)]}")
    print(model.feature_importances_)

feature importances for  מחל
[9.85239683e-03 3.86486274e-03 0.00000000e+00 1.03965944e-04
 4.37624726e-01 4.07163564e-01 1.05173111e-01 3.62173733e-02
 0.00000000e+00 0.00000000e+00]
feature importances for  אמת
[0.01758568 0.0053922  0.         0.00329762 0.28423772 0.30595229
 0.3399903  0.04354418 0.         0.        ]
feature importances for  ודעם
[9.86990960e-03 2.92920926e-03 5.02556227e-04 3.74481760e-03
 1.02570645e-02 3.64219330e-03 6.92351700e-03 9.60049233e-01
 0.00000000e+00 2.08149917e-03]


In [20]:
test_labels = [expcheb3[str(lbl)] for lbl in selected_patrties]
expcheb3 = drop_party_cols(expcheb3)

In [21]:
expcheb3.head()

Unnamed: 0,sSymbol,district,Municipal status,religion,Total Population 2018,Jews and others,Thereof: Jews,Arabs,Organizational affiliation,Cluster Local Authorities\n
0,967,6,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1,472,1,99.0,2.0,7543.0,97.0,79.0,7446.0,0.0,0.0
2,473,2,99.0,2.0,13915.0,28.0,11.0,13887.0,0.0,0.0
3,958,6,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,968,6,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
predictions = {}
for party_num, lbl, model in zip(selected_patrties, test_labels, models):
    y_pred = model.predict(expcheb3)
    predictions[lbl2word[str(party_num)]] = y_pred
    print(f"error rate: {metrics.mean_squared_error(lbl, y_pred)}")

error rate: 986191.2557381871
error rate: 634777.0540017324
error rate: 617457.762583764


In [23]:
cities = ["Daliyat Al-Karmel", "Karmi'el", "Bene Beraq", "Jerusalem", "Sakhnin"]

In [24]:
df = pd.DataFrame(predictions)
names = [symbol2name[str(value)] for value in expcheb3["sSymbol"]]
df.insert(0,"sName", names)
df.loc[df["sName"].isin(cities)]

Unnamed: 0,sName,מחל,אמת,ודעם
196,Bene Beraq,4247.363636,3522.104167,5.548297
321,Daliyat Al-Karmel,217.150118,554.882567,2913.48227
493,Jerusalem,62052.634921,20127.0,3582.0
606,Karmi'el,6987.102662,1012.180995,5.548297
868,Sakhnin,217.150118,2489.557604,12435.482759
