In [3]:
%%capture
!pip install transformers

In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

In [5]:

#import os
#import json
 
#from google.colab import drive
#drive.mount('/content/gdrive')
#df2 = pd.read_csv('gdrive/My Drive/germeval2017clean.csv', delimiter=',')


Mounted at /content/gdrive


## Importing the dataset


In [49]:
df2 = pd.read_csv('https://raw.githubusercontent.com/ARIC-NLP/TwitterNLTKSentiment/main/germeval2017clean.csv', delimiter=',')

Dataset is somewhat unbalanced

In [50]:
df2['sentiment'].value_counts()

neutral     1253
negative     610
positive     155
Name: sentiment, dtype: int64

## Loading the Pre-trained BERT model 

In [51]:

model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'deepset/bert-base-german-cased-sentiment-Germeval17')


# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

##Tokenize, pad up to max len of 30 and create attention mask

In [13]:
tokenized = df2['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True,max_length=30)))

#pad up to 30 tokens
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)

##Feed data into transfomer and get last hidden state

In [55]:
%%time

input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

CPU times: user 2min 58s, sys: 1.41 s, total: 3min
Wall time: 1min 30s


Labels are our sentiments and features are the vectors of the last hidden state of BERT

In [57]:
features = last_hidden_states[0][:,0,:].numpy()
labels = df2['sentiment']

 Train/Test Split


In [58]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)


<img src="https://jalammar.github.io/images/distilBERT/bert-distilbert-train-test-split-sentence-embedding.png" />

### Grid Search for Parameters

.

In [24]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scrores: ', grid_search.best_score_)

best parameters:  {'C': 0.0001}
best scrores:  0.7422464100714707


In [100]:
from sklearn.tree import DecisionTreeClassifier
#clf = LogisticRegression(C=0.0001)
#lr_clf = DecisionTreeClassifier()

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, random_state=0)


clf.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [101]:
clf.score(test_features, test_labels)

0.7564356435643564

In [102]:
pred= clf.predict(test_features)



In [103]:
from sklearn.metrics import classification_report
class_names = ['negative', 'neutral', 'positive']
print(classification_report(test_labels, pred, target_names=class_names))

              precision    recall  f1-score   support

    negative       0.81      0.55      0.66       164
     neutral       0.74      0.93      0.82       306
    positive       0.88      0.20      0.33        35

    accuracy                           0.76       505
   macro avg       0.81      0.56      0.60       505
weighted avg       0.77      0.76      0.73       505



In [104]:
n = 203
start  = len(test_features)+n
print(start,df2.iloc[start]['text'],df2.iloc[start]['sentiment'])
pred[n]

708 Sich betrinkende Menschen in der Bahn mag ich positive


'negative'

In [125]:
#Test with sentences

In [105]:
def infer(sent,model):
  tokened= tokenizer.encode(sent, add_special_tokens=True, truncation=True,max_length=30)
  #pad up
  N=30
  tokened += [0] * (N - len(tokened))
  pad0 = np.reshape(tokened, (1, 30))
  attention_mask0 = np.where(pad0 != 0, 1, 0)

  input_ids = torch.tensor(pad0)  
  attention_mask1 = torch.tensor(attention_mask0)

  with torch.no_grad():
    last_hidden_state0 = model(input_ids, attention_mask=attention_mask1)
  feature0 = last_hidden_state0[0][:,0,:].numpy()
  pred0=lr_clf.predict(feature0)
  return pred0[0]
  

In [106]:

infer('Was soll der Quatsch? falsches Gleis und Verspätung',model)

'negative'