In [None]:
import pandas as pd
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import AdamW
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

#Roey

In [None]:
def read_data(filepath):
  """Load the companies CSV and keep only rows with a usable "About" text.

  A row is dropped when its "About" value is missing or is the literal
  string "0".

  Args:
    filepath: Location of the CSV file, passed straight to `pd.read_csv`.

  Returns:
    DataFrame holding the surviving rows; their original index labels are kept.
  """
  frame = pd.read_csv(filepath)
  about = frame["About"]
  # A missing value compares as "not equal" to "0", so both conditions can be
  # combined into a single boolean mask without changing which rows survive.
  keep = about.notna() & (about != "0")
  return frame[keep]

# Load the SDG companies dataset; the path is relative to the working directory.
data = read_data("data/SDG_Companies_ddbb.csv")
# Preview the first rows of the filtered frame.
data.head()


Unnamed: 0.1,Unnamed: 0,Company,Source,1,2,3,4,5,6,7,...,12,13,14,15,16,17,SDG Goal,About,linkedin,Twitter Handle
0,1.0,https://www.oko.finance/,https://cfamedia.ng/meet-the-eleven-start-ups-...,1.0,,,,,,,...,,,,,,,1.0,OKO (Israel) is crop insurance designed for em...,,
1,2.0,http://www.solarfreeze.co.ke/,https://cfamedia.ng/meet-the-eleven-start-ups-...,1.0,,,,,,,...,,,,,,,1.0,Solar Freeze (Kenya) is pioneering mobile cold...,,
2,3.0,Leaf,tech2impact,1.0,,,,,,,...,,,,,,,1.0,"Leaf is a global, virtual bank for vulnerable ...",,
3,4.0,eSolidar,tech2impact,1.0,,,,,,,...,,,,,,,1.0,Platform that brings together those who need h...,,
5,5.0,briteforcesoftware.com,Rainmaking,1.0,,,,,,,...,,,,,,,1.0,"About Us\r\n\r\nBriteforce, Inc was incorporat...",,


In [None]:
# Drop rows without an SDG goal first: str(NaN) would otherwise produce a
# meaningless "nan_goal" class that ends up in the label list.
data = data[data["SDG Goal"].notna()]
# Build the string class label, e.g. 1.0 -> "1.0_goal". `assign` returns a new
# frame, which avoids writing a column into a filtered slice of another frame.
data = data.assign(Target=data["SDG Goal"].apply(lambda x: str(x) + "_goal"))
# To experiment with a subset of the goals, filter `data` on "Target" here.

# Class names ordered from most to least frequent. A label's position in this
# list is the integer class id used when training and decoding predictions.
target_counts = data["Target"].value_counts()
labels = target_counts.index.tolist()
print(target_counts)
print(labels)

print(data.shape)

3.0_goal     44
9.0_goal     36
2.0_goal     35
1.0_goal     30
7.0_goal     29
4.0_goal     28
6.0_goal     27
10.0_goal    26
5.0_goal     25
11.0_goal    19
16.0_goal    13
17.0_goal    13
8.0_goal     11
12.0_goal     8
15.0_goal     7
13.0_goal     5
14.0_goal     1
nan_goal      1
Name: Target, dtype: int64
['3.0_goal', '9.0_goal', '2.0_goal', '1.0_goal', '7.0_goal', '4.0_goal', '6.0_goal', '10.0_goal', '5.0_goal', '11.0_goal', '16.0_goal', '17.0_goal', '8.0_goal', '12.0_goal', '15.0_goal', '13.0_goal', '14.0_goal', 'nan_goal']
(358, 25)


In [None]:
# Hold out 15% of the rows for evaluation, with a fixed random_state for the split.
# X_* keep every column of `data` (including "About" and "Target"); y_* are the "Target" values.
X_train, X_test, y_train, y_test = train_test_split(data, data["Target"], test_size=0.15, random_state=42)

In [None]:
# Sanity check: features and targets must have matching lengths in each split.
print(len(X_train), len(y_train))
print(len(X_test), len(y_test))

304 304
54 54


In [None]:
class SDGModel(torch.nn.Module):
  """BERT encoder followed by a two-layer classification head.

  The model receives one raw text string per call, tokenizes it itself and
  returns one unnormalized score (logit) per class.
  """

  def __init__(self, output_size, max_tokens=500):
    """Build the tokenizer, the encoder and the classification head.

    Args:
      output_size: Number of classes, i.e. the length of the returned vector.
      max_tokens: Maximum sequence length fed to the encoder, counting the
        [CLS] and [SEP] markers. Keep it within the encoder's position limit
        (512 for BERT-base).
    """
    super(SDGModel, self).__init__()
    self.max_tokens = max_tokens
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    # The encoder is fine-tuned together with the head. To train the head only,
    # set `requires_grad = False` on every parameter of `self.bert`.
    self.out1 = torch.nn.Linear(768, 400)
    self.drop = torch.nn.Dropout(0.5)
    self.out2 = torch.nn.Linear(400, output_size)

  def forward(self, text):
    """Classify a single text.

    Args:
      text: The raw input string (a single example, not a batch).

    Returns:
      1-D tensor of `output_size` logits, on the same device as the model.
    """
    # Truncate the word pieces *before* adding the markers so that an over-long
    # text still ends with [SEP] instead of having it sliced away.
    room_for_text = max(self.max_tokens - 2, 0)
    pieces = self.tokenizer.tokenize(text)[:room_for_text]
    tokenized_text = [self.tokenizer.cls_token] + pieces + [self.tokenizer.sep_token]
    indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

    # Create the ids on whatever device the weights live on, so the model works
    # on CPU as well as on GPU instead of requiring CUDA.
    device = next(self.parameters()).device
    tokens_tensor = torch.tensor([indexed_tokens], device=device)

    # First element of the encoder output holds the per-token hidden states;
    # [0][0] selects the vector of the leading [CLS] token of the only sequence.
    outputs = self.bert(tokens_tensor)[0]
    o = torch.relu(self.out1(outputs[0][0]))
    o = self.drop(o)
    return self.out2(o)


In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# building the model does not require CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SDGModel(len(labels)).to(device)

In [None]:
# Smoke test: one forward pass on a short string; the result holds one score per entry of `labels`.
model("Hello World")

tensor([ 0.0045,  0.0309, -0.0524, -0.0609,  0.1948, -0.0229,  0.2815,  0.1341,
        -0.0686,  0.1043, -0.1395, -0.1412,  0.0971,  0.0119, -0.1364,  0.0178,
         0.0427, -0.0292], device='cuda:0', grad_fn=<AddBackward0>)

In [None]:
# Optimize every parameter of the model (encoder and classification head) with a small learning rate.
# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of `torch.optim.AdamW` — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=1e-5)
# Cross-entropy computed on the raw scores returned by the model; targets are integer class ids.
criterion = torch.nn.CrossEntropyLoss()

In [None]:
n_epochs = 50
log_every = 30  # number of optimizer steps between two progress reports

# Map each class name to its integer id once instead of scanning `labels`
# on every training step.
label_to_id = {name: class_id for class_id, name in enumerate(labels)}

for e in range(1, n_epochs+1):
  current_loss = 0
  # Reshuffle the training rows at the start of every epoch.
  curr_data = X_train.sample(frac=1)
  print(len(curr_data))
  # `index` is the row's DataFrame label, which is arbitrary after filtering and
  # shuffling, so progress is tracked with a separate 1-based step counter.
  for step, (index, instance) in enumerate(curr_data.iterrows(), start=1):
    optimizer.zero_grad()
    out = model(instance["About"].lower())
    # Create the target on the same device as the logits instead of assuming CUDA.
    target = torch.tensor([label_to_id[instance["Target"]]], device=out.device)
    # The model returns a 1-D vector; the loss expects a (batch, classes) shape.
    loss = criterion(out.view(1, -1), target)
    loss.backward()
    optimizer.step()
    current_loss += loss.item()
    if step % log_every == 0:
      # Exactly `log_every` losses were accumulated, so this is a true mean.
      print(f"Epoch={e}, step={step}, index={index}, loss={current_loss/log_every}")
      print(f"{instance['Target']} --> {labels[torch.argmax(out).item()]}")
      current_loss = 0
  # NOTE: `evaluate_mode` is defined in a later cell of this notebook; that cell
  # has to be executed before this loop on a fresh kernel.
  print(evaluate_mode(X_test, model))



304
Epoch=1, index=480, loss=0.099591859181722
3.0_goal --> 17.0_goal
Epoch=1, index=660, loss=1.588361112276713
4.0_goal --> 1.0_goal
Epoch=1, index=1530, loss=0.08351851304372152
10.0_goal --> 10.0_goal
Epoch=1, index=0, loss=0.6562719980875651
1.0_goal --> 6.0_goal
Epoch=1, index=1710, loss=1.4168092648188273
12.0_goal --> 1.0_goal
Epoch=1, index=30, loss=5.553674427668254
1.0_goal --> 1.0_goal
Epoch=1, index=210, loss=2.7190701882044475
2.0_goal --> 6.0_goal
Epoch=1, index=2190, loss=0.8338195006052653
17.0_goal --> 9.0_goal
Epoch=1, index=720, loss=12.728995275497436
5.0_goal --> 3.0_goal
(array([[2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 6, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 

Epoch=7, index=1530, loss=0.6619347214698792
10.0_goal --> 10.0_goal
Epoch=7, index=1710, loss=2.137202529112498
12.0_goal --> 11.0_goal
Epoch=7, index=2190, loss=1.4521557648976644
17.0_goal --> 17.0_goal
Epoch=7, index=0, loss=0.1969880183537801
1.0_goal --> 1.0_goal
Epoch=7, index=30, loss=0.29873613913853964
1.0_goal --> 1.0_goal
Epoch=7, index=480, loss=1.28864643573761
3.0_goal --> 11.0_goal
Epoch=7, index=660, loss=0.33876471122105917
4.0_goal --> 4.0_goal
Epoch=7, index=210, loss=1.0551485419273376
2.0_goal --> 1.0_goal
Epoch=7, index=720, loss=0.34591393868128456
5.0_goal --> 3.0_goal
(array([[3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 1, 

Epoch=13, index=210, loss=0.7402397712071737
2.0_goal --> 7.0_goal
Epoch=13, index=1530, loss=0.615283211072286
10.0_goal --> 10.0_goal
Epoch=13, index=480, loss=0.32208350896835325
3.0_goal --> 11.0_goal
Epoch=13, index=1710, loss=0.18135212262471517
12.0_goal --> 11.0_goal
Epoch=13, index=720, loss=1.0838652968406677
5.0_goal --> 5.0_goal
Epoch=13, index=0, loss=0.2570438543955485
1.0_goal --> 1.0_goal
Epoch=13, index=660, loss=0.08753062884012858
4.0_goal --> 4.0_goal
Epoch=13, index=2190, loss=0.1187823494275411
17.0_goal --> 17.0_goal
Epoch=13, index=30, loss=0.008237632115681966
1.0_goal --> 1.0_goal
(array([[3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       

Epoch=19, index=660, loss=0.291936469078064
4.0_goal --> 4.0_goal
Epoch=19, index=0, loss=0.8287092367808024
1.0_goal --> 1.0_goal
Epoch=19, index=720, loss=0.10219024817148845
5.0_goal --> 3.0_goal
Epoch=19, index=1530, loss=0.03973823388417562
10.0_goal --> 10.0_goal
Epoch=19, index=30, loss=0.24196157455444336
1.0_goal --> 1.0_goal
Epoch=19, index=480, loss=0.48445975383122764
3.0_goal --> 11.0_goal
Epoch=19, index=210, loss=0.08727020025253296
2.0_goal --> 7.0_goal
Epoch=19, index=1710, loss=0.29134542544682823
12.0_goal --> 11.0_goal
Epoch=19, index=2190, loss=0.3231196482976278
17.0_goal --> 17.0_goal
(array([[3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
      

Epoch=25, index=0, loss=0.01202543576558431
1.0_goal --> 1.0_goal
Epoch=25, index=480, loss=0.0646044651667277
3.0_goal --> 3.0_goal
Epoch=25, index=30, loss=0.002048206329345703
1.0_goal --> 1.0_goal
Epoch=25, index=210, loss=0.19256671269734701
2.0_goal --> 2.0_goal
Epoch=25, index=660, loss=0.006074587504069011
4.0_goal --> 4.0_goal
Epoch=25, index=2190, loss=0.357697327931722
17.0_goal --> 17.0_goal
Epoch=25, index=1710, loss=0.1010529597600301
12.0_goal --> 11.0_goal
Epoch=25, index=1530, loss=0.9616732875506083
10.0_goal --> 10.0_goal
Epoch=25, index=720, loss=0.9128072818120321
5.0_goal --> 3.0_goal
(array([[4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,

Epoch=30, index=1530, loss=0.05298233826955159
10.0_goal --> 10.0_goal
Epoch=30, index=2190, loss=0.007571125030517578
17.0_goal --> 17.0_goal
(array([[3, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 1, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
 

Epoch=36, index=480, loss=0.13634098370869954
3.0_goal --> 11.0_goal
Epoch=36, index=2190, loss=0.3924279769261678
17.0_goal --> 17.0_goal
Epoch=36, index=210, loss=0.05903430779774984
2.0_goal --> 7.0_goal
Epoch=36, index=1710, loss=0.06771987676620483
12.0_goal --> 11.0_goal
Epoch=36, index=0, loss=0.4141910513242086
1.0_goal --> 1.0_goal
Epoch=36, index=720, loss=0.2313892443974813
5.0_goal --> 3.0_goal
Epoch=36, index=30, loss=0.13602800369262696
1.0_goal --> 1.0_goal
Epoch=36, index=660, loss=0.39563987255096433
4.0_goal --> 4.0_goal
Epoch=36, index=1530, loss=0.10277310212453207
10.0_goal --> 10.0_goal
(array([[4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 

Epoch=41, index=1530, loss=0.08011656602223714
10.0_goal --> 10.0_goal
(array([[3, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 3, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
       [0, 0, 1, 1, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 1]]), 0.46296296296296297)
304
Epoch=42, index=2190, loss=0.8018990675608317
17.0_goal --> 17.0_goal
Epoch=42, index=30, l

Epoch=47, index=30, loss=0.04516469637552897
1.0_goal --> 1.0_goal
Epoch=47, index=2190, loss=0.06197237968444824
17.0_goal --> 17.0_goal
Epoch=47, index=660, loss=0.15397459665934246
4.0_goal --> 4.0_goal
Epoch=47, index=210, loss=0.4361654837926229
2.0_goal --> 1.0_goal
Epoch=47, index=1710, loss=0.06624868710835775
12.0_goal --> 13.0_goal
Epoch=47, index=720, loss=0.047379064559936526
5.0_goal --> 3.0_goal
Epoch=47, index=480, loss=0.3358813762664795
3.0_goal --> 3.0_goal
Epoch=47, index=0, loss=0.18609813451766968
1.0_goal --> 1.0_goal
Epoch=47, index=1530, loss=0.14738958676656086
10.0_goal --> 10.0_goal
(array([[3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,

In [None]:
def evaluate_mode(test_data, model):
    """Score `model` on `test_data` and report a confusion matrix and accuracy.

    Args:
        test_data: DataFrame with an "About" text column and a "Target" label
            column.
        model: Module mapping one lower-cased text string to a vector of class
            scores whose positions line up with the global `labels` list.

    Returns:
        Tuple `(confusion_matrix, accuracy)` computed from the true and the
        predicted label strings.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            y_pred = [
                labels[torch.argmax(model(about.lower())).item()]
                for about in test_data["About"]
            ]
    finally:
        # Put the model back into the mode the caller had it in, even when a
        # prediction raises, instead of unconditionally forcing train mode.
        model.train(was_training)

    y_test = test_data["Target"]
    # NOTE(review): without an explicit `labels=` argument the matrix only
    # covers classes present in y_test or y_pred, so its size can differ
    # between calls.
    return confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred)


In [None]:
# Evaluate on the held-out split; displays the (confusion matrix, accuracy) tuple.
evaluate_mode(X_test, model)

(array([[1, 0, 0, 0, 0, 3, 0, 0, 0],
        [0, 4, 0, 0, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0],
        [0, 3, 0, 0, 0, 2, 0, 0, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 3, 0, 0, 0, 0, 0, 0, 0],
        [0, 3, 0, 0, 0, 2, 0, 0, 0],
        [0, 1, 0, 0, 0, 1, 0, 0, 0],
        [0, 3, 0, 0, 0, 3, 0, 0, 0]]), 0.15151515151515152)