In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/My Drive/SDG/project/sdg-codebase

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/My Drive/SDG/project/sdg-codebase'
/content/drive/My Drive/SDG/project/sdg-codebase


# Imports and functions

In [None]:
import pandas as pd
import numpy as np
!pip install transformers
from transformers import BertTokenizer, BertModel, BertForMaskedLM
from transformers import AdamW
import torch
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score



In [None]:
import sys
!test -d bertviz_repo && echo "FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo"
# !rm -r bertviz_repo # Uncomment if you need a clean pull from repo
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']
!pip install regex
from bertviz import head_view
from bertviz import model_view
from bertviz.neuron_view import show
# !pip install transformers

FYI: bertviz_repo directory already exists, to pull latest version uncomment this line: !rm -r bertviz_repo


In [None]:
def call_html_head_view(attention, tokens):
  """Show bertviz's head view for `attention` / `tokens`.

  First emits an HTML snippet that loads require.js and configures the
  requirejs paths for d3 (3.5.8 here) and jquery, then delegates to
  `head_view(attention, tokens)`.
  """
  import IPython
  requirejs_setup = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''
  setup_widget = IPython.core.display.HTML(requirejs_setup)
  display(setup_widget)
  head_view(attention, tokens)

In [None]:
def call_html_model_view(attention, tokens):
  """Show bertviz's model view for `attention` / `tokens`.

  First emits an HTML snippet that loads require.js and configures the
  requirejs paths for d3 (5.7.0 here) and jquery, then delegates to
  `model_view(attention, tokens)`.
  """
  import IPython
  requirejs_setup = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/5.7.0/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''
  setup_widget = IPython.core.display.HTML(requirejs_setup)
  display(setup_widget)
  model_view(attention, tokens)

In [None]:
def call_html_neuron_view(model, model_type, tokenizer, sentence_a):
  """Show bertviz's neuron view for `sentence_a`.

  First emits an HTML snippet that loads require.js and configures the
  requirejs paths for d3 (5.7.0 here) and jquery, then delegates to
  `show(model, model_type, tokenizer, sentence_a)`.
  """
  import IPython
  requirejs_setup = '''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/5.7.0/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''
  setup_widget = IPython.core.display.HTML(requirejs_setup)
  display(setup_widget)
  show(model, model_type, tokenizer, sentence_a)

## Plot metrics

In [None]:
def plotAccPrecRecall(y_test, y_pred, predicted_prob, verbose=False):
  """Print the accuracy of `y_pred` against `y_test` and return it.

  The printed value is rounded to two decimals; the returned value is not.
  `predicted_prob` and `verbose` are accepted but currently unused (the AUC /
  classification-report output that would consume them is disabled).
  """
  score = metrics.accuracy_score(y_test, y_pred)
  rounded_score = round(score, 2)
  print("Accuracy:", rounded_score)
  return score

In [None]:
def plotConfusionMatrix(y_test, y_pred, labels):
  """Draw the confusion matrix of `y_pred` vs `y_test` as an annotated heatmap.

  `labels` is passed to `confusion_matrix` to fix the row/column order and is
  reused for the tick labels on both axes.
  """
  matrix = metrics.confusion_matrix(y_test, y_pred, labels=labels)
  figure, axis = plt.subplots()

  heatmap_options = dict(annot=True, fmt='d', ax=axis, cmap=plt.cm.Blues,
                         cbar=False)
  sns.heatmap(matrix, **heatmap_options)

  axis_options = dict(xlabel="Pred", ylabel="True", xticklabels=labels,
                      yticklabels=labels, title="Confusion matrix")
  axis.set(**axis_options)
  # Keep the class names on the y axis horizontal.
  plt.yticks(rotation=0)

In [None]:
def plotRocPrecRecallCurve(y_test, predicted_prob, labels):
  """Plot per-class ROC and precision-recall curves side by side.

  `y_test` is one-hot encoded with `pd.get_dummies`; column `i` of that
  encoding is scored against column `i` of `predicted_prob` and the curve is
  labelled with `labels[i]` plus its area.
  NOTE(review): this assumes the dummy-column order matches `labels` — confirm.
  """
  one_hot = pd.get_dummies(y_test, drop_first=False).values
  n_classes = len(labels)
  legend_text = '{0} (area={1:0.2f})'
  fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
  roc_ax = axes[0]
  pr_ax = axes[1]

  ## ROC: one curve per class, plus the chance diagonal
  for cls in range(n_classes):
    truth = one_hot[:,cls]
    scores = predicted_prob[:,cls]
    fpr, tpr, _ = metrics.roc_curve(truth, scores)
    area = metrics.auc(fpr, tpr)
    roc_ax.plot(fpr, tpr, lw=3, label=legend_text.format(labels[cls], area))
  roc_ax.plot([0,1], [0,1], color='navy', lw=3, linestyle='--')
  roc_ax.set(xlim=[-0.05,1.0], ylim=[0.0,1.05],
             xlabel='False Positive Rate',
             ylabel="True Positive Rate (Recall)",
             title="Receiver operating characteristic")
  roc_ax.legend(loc="lower right")
  roc_ax.grid(True)

  ## Precision-recall: one curve per class
  for cls in range(n_classes):
    truth = one_hot[:,cls]
    scores = predicted_prob[:,cls]
    precision, recall, _ = metrics.precision_recall_curve(truth, scores)
    area = metrics.auc(recall, precision)
    pr_ax.plot(recall, precision, lw=3, label=legend_text.format(labels[cls], area))
  pr_ax.set(xlim=[0.0,1.05], ylim=[0.0,1.05], xlabel='Recall',
            ylabel="Precision", title="Precision-Recall curve")
  pr_ax.legend(loc="best")
  pr_ax.grid(True)

  plt.show()

In [None]:
def SDGToFivePs(x):
  """Map an SDG goal identifier to the "5 Ps" group used in this notebook.

  Args:
    x: the goal number, either as a string optionally containing "_goal"
      (e.g. "7_goal" or "7") or as an integer (plain int or NumPy integer).

  Returns:
    'People' for 0-6, 'Prosperity' for 7-12, 'Planet' for 13-15,
    'Peace' for 16, 'Partnerships' for 17, and None for any other number.

  Raises:
    ValueError: if a string input does not contain a parseable integer.
    AttributeError: for inputs that are neither integers nor string-like
      (unchanged from the string-only implementation).
  """
  import numbers

  if isinstance(x, numbers.Integral):
    # Integer labels (e.g. a numeric label column) are accepted directly.
    goal = int(x)
  else:
    goal = int(x.replace("_goal",""))

  # Checked in order; the ranges do not overlap.
  groups = (
    (range(0,7), 'People'),
    (range(7,13), 'Prosperity'),
    (range(13,16), 'Planet'),
    (range(16,17), 'Peace'),
    (range(17,18), 'Partnerships'),
  )
  for goals, group_name in groups:
    if goal in goals:
      return group_name
  return None

# Model

In [None]:
class SDGModel(torch.nn.Module):
  """BERT encoder followed by a small classification head.

  The head is Linear(768, 400) -> ReLU -> Dropout(0.5) -> Linear(400, output_size)
  applied to the hidden state of the first token of a single input text.
  The model owns its tokenizer, so `forward` and `getAttention` take raw strings.
  Inputs are placed on whatever device the model's parameters live on.
  """

  # Upper bound on the number of token ids `forward` feeds to BERT.
  MAX_TOKENS = 500

  def __init__(self, output_size):
    """Build the tokenizer, the pretrained encoder and the head.

    Args:
      output_size: number of output logits (one per class).
    """
    super(SDGModel, self).__init__()
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    # output_attentions=True makes the encoder also return its attention
    # weights, which `getAttention` reads. The encoder is left trainable.
    self.bert = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
    self.out1 = torch.nn.Linear(768, 400)
    self.drop = torch.nn.Dropout(0.5)
    self.out2 = torch.nn.Linear(400, output_size)

  def _device(self):
    """Return the device of the model's parameters."""
    return next(self.parameters()).device

  def forward(self, text):
    """Classify one text.

    Args:
      text: a single raw string (not a batch).

    Returns:
      A 1-D tensor of `output_size` unnormalised logits.
    """
    tokenized_text = self.tokenizer.tokenize("[CLS] " + text + " [SEP]")
    indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
    # Keep only the first MAX_TOKENS ids; for longer inputs this also drops
    # the trailing [SEP].
    indexed_tokens = indexed_tokens[:self.MAX_TOKENS]
    # Follow the model's own device instead of assuming CUDA.
    tokens_tensor = torch.tensor([indexed_tokens], device=self._device())

    hidden_states = self.bert(tokens_tensor)[0]
    # Batch item 0, position 0: the vector of the leading [CLS] token.
    first_token = hidden_states[0][0]
    o = torch.relu(self.out1(first_token))
    o = self.drop(o)
    return self.out2(o)

  def getAttention(self, text):
    """Return the encoder's attention output and the tokens for `text`.

    Side effect: switches the model to eval mode and does not switch it back.

    Args:
      text: a single raw string.

    Returns:
      (attention, tokens): the last element of the encoder output (the
      attention weights, given `output_attentions=True`) and the list of
      token strings for the encoded input.
    """
    self.eval()
    inputs = self.tokenizer.encode_plus(text, return_tensors='pt', add_special_tokens=True)
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    device = self._device()
    # The tokenizer already returns tensors; move them rather than re-wrapping
    # them with torch.tensor(), which copies and emits a warning.
    attention = self.bert(input_ids.to(device), token_type_ids=token_type_ids.to(device))[-1]
    input_id_list = input_ids[0].tolist() # Batch index 0
    tokens = self.tokenizer.convert_ids_to_tokens(input_id_list)
    return attention, tokens


In [None]:
from sklearn import metrics
import seaborn as sns
import numpy as np

def evaluate_mode(X_test, model, verbose=True):
  """Run `model` over every row of `X_test`, report metrics and collect errors.

  Reads the module-level `labels` list to turn the argmax index of the model
  output into a class label.

  Args:
    X_test: DataFrame with "Description", "Target" and "Company" columns.
    model: callable taking one lower-cased description string and returning a
      1-D tensor of logits; must support `eval()` / `train(mode)`.
    verbose: forwarded to `plotAccPrecRecall`; when true the ROC and
      precision-recall curves are plotted as well.

  Returns:
    (errors, accuracy): `errors` is a list of
    (company, true label, predicted label) tuples for misclassified rows;
    `accuracy` is the value returned by `plotAccPrecRecall`.
  """
  softmax = torch.nn.Softmax(dim=0)
  errors = []
  y_pred = []
  predicted_prob = []
  y_test = X_test["Target"]

  # Remember the caller's mode and restore it afterwards (also on error),
  # instead of unconditionally forcing the model back into training mode.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for index, instance in X_test.iterrows():
        out = model(instance["Description"].lower())
        predicted_prob.append(softmax(out).tolist())
        predicted_label = labels[torch.argmax(out).item()]
        y_pred.append(predicted_label)
        # collect companies with errors
        if instance['Target'] != predicted_label:
          errors.append((instance['Company'], instance['Target'], predicted_label))
  finally:
    model.train(was_training)

  predicted_prob = np.array(predicted_prob)

  ## plot Accuracy, Precision, Recall
  accuracy = plotAccPrecRecall(y_test, y_pred, predicted_prob, verbose)

  ## Plot confusion matrix
  plotConfusionMatrix(y_test, y_pred, labels)

  ## Plot roc, precision recall curve
  if verbose:
    plotRocPrecRecallCurve(y_test, predicted_prob, labels)

  return errors, accuracy

# Analyze labeled data

## Loading Model

In [None]:
ls models

model_5Ps_checkpoint_12.tar  model_SDG_onlyNewRain_checkpoint_13.tar
model_SDG_checkpoint_14.tar  model_with_attention_SDG_checkpoint_8.tar


In [None]:
filename = "model_with_attention_SDG_checkpoint_8.tar"
# filename = "model_check.tar"

In [None]:
# load model from checkpoint
# Prefer the GPU but fall back to CPU so the checkpoint can still be loaded
# on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# map_location remaps tensors that were saved from a GPU onto `device`.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(f"models/{filename}", map_location=device)
labels = checkpoint['labels']

model = SDGModel(len(labels))
optimizer = AdamW(model.parameters(), lr=1e-5)

model.load_state_dict(checkpoint['model_state_dict'])
# Move the model before restoring the optimizer state so both end up on `device`.
model.to(device)
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
losses = checkpoint['losses']
errors = checkpoint['errors']

model.eval()
# - or -
# model.train()

## Loading Data

In [None]:
ls data

 5Ps_multi_label_to_1_rare.csv
 5Ps_multi_label_to_1_rare_Planet_Peace_Partnerships.csv
 AnnotatedByDalia.csv
 difDaliaOurs.csv
 impact_columns_as_dbb.csv
'Impact Startup Database - SDG Compass.xlsx'
 newAll_withspreadsheet.csv
 plusImpact_SDGs_Companies.csv
 pure_rainmaking_old_manually.csv
 rainmaking_content_Countries_Companies.txt
 rainmaking_db_oct.csv
 rainmaking_db_oct_full.csv
 rainmaking_db_oct_train.csv
 rainmaking_impact_db_171dups.csv
 rainmaking_impact_db.csv
 rand_predict_Israel_companies_to_SDG_5Ps_16_10.csv
 SDG_1_label_companies.csv
 SDG_5Ps_1_label_companies.csv
 SDG_5Ps_rainmaking_impact_others_1_label.csv
 SDG_5Ps_rainmaking_impact_others.csv
 SDG_Companies_ddbb.csv
'SDG Companies - DDBB _impact.csv'
 SDG_Companies_ddbb_old1.csv
'SDG_Companies_plusImpact (1).csv'
 SDG_Companies_plusImpact.csv
 SDG_Compnies_rainmaking.io_impact_compass.csv
 SDG_israel_no_tags.xlsx
 SDG_newRainmaking_others.csv
 sncCompaniesNotSdg.csv
 spreadsheet_companies.csv
 spreadsheetSDGsEurope.

In [None]:
def read_data(path="data/SDG_5Ps_rainmaking_impact_others_1_label.csv"):
  """Load a company dataset from a CSV file.

  Args:
    path: CSV file to read. Defaults to the dataset file analysed in this
      notebook, so `read_data()` keeps working unchanged.

  Returns:
    The file contents as a pandas DataFrame.
  """
  return pd.read_csv(path)

data = read_data()
data.shape

(2472, 14)

## Evaluation

In [None]:
# Use the SDG goal column as the ground-truth label.
data['Target'] = data['SDG Goal']
# Draw one random row (unseeded, so the sample changes on every run).
instance = data.sample(1).iloc[0]
text = instance['Description']
# Inference only: no autograd graph is needed for a single prediction.
with torch.no_grad():
  out = model(text)
print(f"{instance['Target']} --> {labels[torch.argmax(out).item()]}")

7 --> 7


In [None]:
text

'aega asa is a solar utility company that acquires and operates solar power plants. the company currently owns a portfolio of seven individual solar parks in the umbria, lazio, abruzzo, emilia-romagna and lombardia regions in italy with a combined production capacity of 7mwp, and an initial annual electricity production of approximately 10gwh depending on the annual solar irradiation. the company focuses on acquisitions of smaller existing solar parks'

In [None]:
# Keep only the first 100 characters of the description before extracting
# attention (presumably to keep the visualisations small — TODO confirm).
# Note that this overwrites `text`, so later cells see the shortened string.
text = text[:100]
attention, tokens = model.getAttention(text)



In [None]:
call_html_head_view(attention, tokens)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
call_html_model_view(attention, tokens)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): this calls bertviz's `show` directly, bypassing the
# `call_html_neuron_view` wrapper defined earlier in this notebook, which emits
# the requirejs setup before calling `show` — confirm whether that is intended.
# NOTE(review): `model` is the SDGModel wrapper, whose forward takes a raw
# string; presumably `show` expects a bertviz-compatible BERT model — verify.
show(model, 'bert', model.tokenizer, text)