In [67]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

np.random.seed(42)

import tensorflow as tf 
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
 

In [24]:
mass = pd.read_pickle('./data/03_dummy_df.pickle') 

In [25]:
mass.head()

Unnamed: 0,id,country,ccode,region,protestnumber,protesterviolence,location,protesteridentity,sources,notes,...,labor_wage_dispute,land_farm_issue,police_brutality,political_behavior_process,price increases_tax_policy,removal_of_politician,social_restrictions,start_date,end_date,target
0,201990001,Canada,20,North America,1,0.0,national,unspecified,1. great canadian train journeys into history;...,canada s railway passenger system was finally ...,...,1,0,0,1,0,0,0,1990-01-15,1990-01-15,"[0, 0, 0, 0, 1, 0, 0]"
1,201990002,Canada,20,North America,2,0.0,"Montreal, Quebec",unspecified,1. autonomy s cry revived in quebec the new yo...,protestors were only identified as young peopl...,...,0,0,0,1,0,0,0,1990-06-25,1990-06-25,"[0, 0, 0, 0, 1, 0, 0]"
2,201990003,Canada,20,North America,3,0.0,"Montreal, Quebec",separatist parti quebecois,1. quebec protest after queen calls for unity ...,"the queen, after calling on canadians to remai...",...,0,0,0,1,0,0,0,1990-07-01,1990-07-01,"[0, 0, 0, 0, 1, 0, 0]"
3,201990004,Canada,20,North America,4,1.0,"Montreal, Quebec",mohawk indians,1. indians gather as siege intensifies; armed ...,canada s federal government has agreed to acqu...,...,0,1,0,0,0,0,0,1990-07-12,1990-09-06,"[0, 1, 0, 0, 0, 0, 0]"
4,201990005,Canada,20,North America,5,1.0,"Montreal, Quebec",local residents,1. dozens hurt in mohawk blockade protest the ...,protests were directed against the state due t...,...,0,0,0,1,0,0,0,1990-08-14,1990-08-15,"[1, 1, 0, 1, 0, 0, 0]"


In [38]:
mass.columns[0:20]

Index(['id', 'country', 'ccode', 'region', 'protestnumber',
       'protesterviolence', 'location', 'protesteridentity', 'sources',
       'notes', 'protester_id_type', 'partipants_number',
       'protest_size_category', 'pop_male', 'pop_female', 'pop_total',
       'pop_density', 'prosperity_2020', 'country_Afghanistan',
       'country_Albania'],
      dtype='object')

In [39]:
mass.columns[200:228]

Index(['protester_id_type_locals_residents',
       'protester_id_type_pensioners_retirees',
       'protester_id_type_political_group', 'protester_id_type_prisoners',
       'protester_id_type_protestors_generic',
       'protester_id_type_religious_group',
       'protester_id_type_soldiers_veterans',
       'protester_id_type_students_youth',
       'protester_id_type_victims_families', 'protester_id_type_women',
       'protester_id_type_workers_unions', 'arrests', 'accomodation',
       'beatings', 'crowddispersal', 'ignore', 'killings', 'shootings',
       'labor_wage_dispute', 'land_farm_issue', 'police_brutality',
       'political_behavior_process', 'price increases_tax_policy',
       'removal_of_politician', 'social_restrictions', 'start_date',
       'end_date', 'target'],
      dtype='object')

In [27]:
mass['protester_id_type'].unique()

array(['workers_unions', 'students_youth', 'political_group',
       'ethnic_group', 'locals_residents', 'pensioners_retirees',
       'protestors_generic', 'civil_human_rights', 'women',
       'religious_group', 'soldiers_veterans', 'victims_families',
       'prisoners'], dtype=object)

In [66]:
mass['partipants_number']

0        1000
1        1000
2         500
3         100
4         950
         ... 
15220     100
15221    1000
15222      50
15223      50
15224     100
Name: partipants_number, Length: 15225, dtype: int64

In [70]:
mlb = MultiLabelBinarizer()
mlb.fit(mass[['arrests', 'accomodation',
       'beatings', 'crowddispersal', 'ignore', 'killings', 'shootings']])

MultiLabelBinarizer()

In [71]:
mlb.classes_

array(['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o',
       'p', 'r', 's', 't', 'w'], dtype=object)

In [46]:
X = mass.drop(columns=['country', 'ccode', 'region', 'location','protesteridentity', 'sources',
       'notes', 'protester_id_type', 'protest_size_category', 'start_date',
       'end_date', 'target'])
y = mass['target']

In [47]:
X

Unnamed: 0,id,protestnumber,protesterviolence,partipants_number,pop_male,pop_female,pop_total,pop_density,prosperity_2020,country_Afghanistan,...,ignore,killings,shootings,labor_wage_dispute,land_farm_issue,police_brutality,political_behavior_process,price increases_tax_policy,removal_of_politician,social_restrictions
0,201990001,1,0.0,1000,13656.649,13884.674,27541.323,3.029,79.819703,0,...,1,0,0,1,0,0,1,0,0,0
1,201990002,2,0.0,1000,13656.649,13884.674,27541.323,3.029,79.819703,0,...,1,0,0,0,0,0,1,0,0,0
2,201990003,3,0.0,500,13656.649,13884.674,27541.323,3.029,79.819703,0,...,1,0,0,0,0,0,1,0,0,0
3,201990004,4,1.0,100,13656.649,13884.674,27541.323,3.029,79.819703,0,...,0,0,0,0,1,0,0,0,0,0
4,201990005,5,1.0,950,13656.649,13884.674,27541.323,3.029,79.819703,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15220,9102014001,1,1.0,100,4050.613,3896.120,7946.733,17.548,46.065203,0,...,0,0,1,0,0,0,1,0,0,0
15221,9102016001,1,1.0,1000,4219.461,4052.305,8271.766,18.266,46.065203,0,...,0,1,1,0,0,0,0,0,1,0
15222,9102017001,1,0.0,50,4305.516,4132.522,8438.038,18.633,46.065203,0,...,0,0,0,0,1,0,1,0,0,0
15223,9102017002,2,1.0,50,4305.516,4132.522,8438.038,18.633,46.065203,0,...,0,0,0,0,0,0,1,0,0,0


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [52]:
X_train

Unnamed: 0,id,protestnumber,protesterviolence,partipants_number,pop_male,pop_female,pop_total,pop_density,prosperity_2020,country_Afghanistan,...,ignore,killings,shootings,labor_wage_dispute,land_farm_issue,police_brutality,political_behavior_process,price increases_tax_policy,removal_of_politician,social_restrictions
14823,8201999007,7,0.0,200,11539.672,11121.621,22661.293,68.974,67.491327,0,...,0,0,0,0,0,1,0,0,0,0
8260,4392002001,1,0.0,2000,6047.905,6245.192,12293.097,44.931,44.824098,0,...,1,0,0,0,0,0,0,1,0,0
14949,8302014004,4,0.0,100,2891.999,2633.629,5525.628,7893.754,79.512408,0,...,1,0,0,0,0,0,1,1,0,0
11334,6451995001,1,1.0,1000,10159.693,9989.649,20149.342,46.393,44.557454,0,...,0,1,1,0,0,1,0,0,0,0
1458,1102014001,1,1.0,500,380.976,382.395,763.371,3.878,55.871701,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,3252005006,6,1.0,40000,28295.735,29985.474,58281.209,198.141,71.833298,0,...,0,0,0,0,1,0,0,0,0,0
13418,7502006001,1,1.0,100,606386.475,559099.816,1165486.291,391.999,53.640353,0,...,0,1,0,0,1,0,0,0,0,0
5390,3422018004,4,0.0,25000,4313.121,4489.620,8802.741,100.649,62.470991,0,...,1,0,0,0,0,0,1,0,1,0
860,932015002,2,0.0,50,3067.180,3156.054,6223.234,51.714,51.380974,0,...,1,0,0,0,0,0,1,0,0,0


In [53]:
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)

In [61]:
n_inputs = Z_train.shape[1]
print(f'Number of inputs: {Z_train.shape[1]}')

Number of inputs: 216


In [62]:
model = Sequential()
model.add(Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
model.add(Dense(7, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam')

In [65]:
y_pred = model.predict(Z_test)
y_pred = y_pred.round()
accuracy = accuracy_score(y_test, y_pred)

ValueError: You appear to be using a legacy multi-label data representation. Sequence of sequences are no longer supported; use a binary array or sparse matrix instead - the MultiLabelBinarizer transformer can convert to this format.

In [None]:
# output layer 
model.add(Dens(7, activation='sigmoid'))

In [None]:
# compile 
model.compile(loss='binary_crossentropy', 
             optimizer='adam', 
             metrics='accuracy')