In [1]:
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


In [2]:
%cd /content/gdrive/My Drive/

/content/gdrive/My Drive


In [3]:
import pandas as pd
import numpy as np

data = pd.read_csv("ner_dataset.csv", encoding="latin1")

In [4]:
data = data.fillna(method="ffill")

In [5]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [6]:
words = list(set(data["Word"].values))

In [7]:
n_words = len(words); n_words

35178

In [8]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [9]:
set(data["Tag"].values.tolist())

{'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O'}

In [11]:
def tagging_stanza(tag):
    if tag == 'B-geo' or tag == 'I-geo':
        new_tag = 'GPE'
    elif tag == 'B-org' or tag == 'I-org':
        new_tag = 'ORG'
    elif tag == 'O':
        new_tag = 'O'
    elif tag == 'B-per' or tag == 'I-per':
        new_tag = 'PERSON'
    elif tag == 'B-tim' or tag == 'I-tim':
        new_tag = 'DATE'
    elif tag == 'B-eve' or tag == 'I-eve':
        new_tag = 'EVENT'
    elif tag == 'B-gpe' or tag == 'I-gpe':
        new_tag = 'GPE'
    elif tag == 'B-art' or tag == 'I-art':
        new_tag = 'WORK_OF_ART'
    else:
        new_tag = 'MISC'
        
    return new_tag 

In [13]:
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas('PROGRESS>>>')
data['Tag_stanza'] = data['Tag'].progress_apply(tagging_stanza)

HBox(children=(FloatProgress(value=0.0, max=1048575.0), HTML(value='')))




In [14]:
data.head(15)

Unnamed: 0,Sentence #,Word,POS,Tag,Tag_flair,Tag_stanza
0,Sentence: 1,Thousands,NNS,O,O,O
1,Sentence: 1,of,IN,O,O,O
2,Sentence: 1,demonstrators,NNS,O,O,O
3,Sentence: 1,have,VBP,O,O,O
4,Sentence: 1,marched,VBN,O,O,O
5,Sentence: 1,through,IN,O,O,O
6,Sentence: 1,London,NNP,B-geo,LOC,GPE
7,Sentence: 1,to,TO,O,O,O
8,Sentence: 1,protest,VB,O,O,O
9,Sentence: 1,the,DT,O,O,O


In [15]:
getter = SentenceGetter(data)

In [16]:
sent = getter.get_next()

In [17]:
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [18]:
sentences = getter.sentences

In [19]:
texts = []
for sent in sentences:
    text = ' '.join([str(s[0]) for s in sent])
    texts.append(text)

In [20]:
texts[:5]

['Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .',
 'Iranian officials say they expect to get access to sealed sensitive parts of the plant Wednesday , after an IAEA surveillance system begins functioning .',
 'Helicopter gunships Saturday pounded militant hideouts in the Orakzai tribal region , where many Taliban militants are believed to have fled to avoid an earlier military offensive in nearby South Waziristan .',
 'They left after a tense hour-long standoff with riot police .',
 'U.N. relief coordinator Jan Egeland said Sunday , U.S. , Indonesian and Australian military helicopters are ferrying out food and supplies to remote areas of western Aceh province that ground crews can not reach .']

In [21]:
len(texts)

47959

## CRF классификатор

In [22]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [23]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [28]:
len(X_train), len(y_train)

(42959, 42959)

In [27]:
X_train = X[5000:]
X_test = X[:5000]
y_train = y[5000:]
y_test = y[:5000]

In [29]:
pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading https://files.pythonhosted.org/packages/25/74/5b7befa513482e6dee1f3dd68171a6c9dfc14c0eaa00f885ffeba54fe9b0/sklearn_crfsuite-0.3.6-py2.py3-none-any.whl
Collecting python-crfsuite>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/95/99/869dde6dbf3e0d07a013c8eebfb0a3d30776334e0097f8432b631a9a3a19/python_crfsuite-0.9.7-cp36-cp36m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 4.4MB/s 
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6


In [30]:
from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [31]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [32]:
from sklearn_crfsuite import metrics

In [33]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-geo',
 'I-geo',
 'B-org',
 'I-org',
 'B-eve',
 'I-eve',
 'B-gpe',
 'B-tim',
 'B-per',
 'I-per',
 'I-tim',
 'B-art',
 'I-art',
 'I-gpe',
 'B-nat',
 'I-nat']

In [34]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.8384334414259855

In [35]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-art      0.200     0.022     0.040        45
       I-art      0.000     0.000     0.000        35
       B-eve      0.435     0.263     0.328        38
       I-eve      0.278     0.135     0.182        37
       B-geo      0.834     0.896     0.864      3746
       I-geo      0.775     0.797     0.786       710
       B-gpe      0.968     0.943     0.956      1653
       I-gpe      1.000     0.250     0.400        16
       B-nat      0.750     0.391     0.514        23
       I-nat      1.000     0.200     0.333        10
       B-org      0.791     0.707     0.746      2206
       I-org      0.800     0.791     0.796      1719
       B-per      0.838     0.815     0.827      1732
       I-per      0.853     0.896     0.874      1758
       B-tim      0.921     0.887     0.903      2100
       I-tim      0.820     0.783     0.801       650

   micro avg      0.847     0.837     0.842     16478
   macro avg      0.704   

In [39]:
hand_test = y_pred[:100]
summ = 0
for h in hand_test:
    summ += len(h)

In [41]:
new_df = data[:summ]

In [44]:
test = []
for h in hand_test:
    test += h

In [45]:
len(test)

2264

In [46]:
new_df['crf_results'] = test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Sentence #,Word,POS,Tag,Tag_flair,Tag_stanza,crf_results
0,Sentence: 1,Thousands,NNS,O,O,O,O
1,Sentence: 1,of,IN,O,O,O,O
2,Sentence: 1,demonstrators,NNS,O,O,O,O
3,Sentence: 1,have,VBP,O,O,O,O
4,Sentence: 1,marched,VBN,O,O,O,O
...,...,...,...,...,...,...,...
2259,Sentence: 99,a,DT,O,O,O,O
2260,Sentence: 99,third,JJ,O,O,O,O
2261,Sentence: 99,day,NN,O,O,O,O
2262,Sentence: 99,",",",",O,O,O,O


## stanza

In [49]:
pip install stanza

Collecting stanza
[?25l  Downloading https://files.pythonhosted.org/packages/e7/8b/3a9e7a8d8cb14ad6afffc3983b7a7322a3a24d94ebc978a70746fcffc085/stanza-1.1.1-py3-none-any.whl (227kB)
[K     |█▍                              | 10kB 12.7MB/s eta 0:00:01[K     |██▉                             | 20kB 17.4MB/s eta 0:00:01[K     |████▎                           | 30kB 11.1MB/s eta 0:00:01[K     |█████▊                          | 40kB 9.0MB/s eta 0:00:01[K     |███████▏                        | 51kB 4.8MB/s eta 0:00:01[K     |████████▋                       | 61kB 5.3MB/s eta 0:00:01[K     |██████████                      | 71kB 5.2MB/s eta 0:00:01[K     |███████████▌                    | 81kB 5.3MB/s eta 0:00:01[K     |█████████████                   | 92kB 5.8MB/s eta 0:00:01[K     |██████████████▍                 | 102kB 6.1MB/s eta 0:00:01[K     |███████████████▉                | 112kB 6.1MB/s eta 0:00:01[K     |█████████████████▎              | 122kB 6.1MB/s eta 0:

In [50]:
import stanza

In [52]:
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 17.4MB/s]                    
2020-12-21 17:21:01 INFO: Downloading default packages for language: en (English)...
Downloading http://nlp.stanford.edu/software/stanza/1.1.0/en/default.zip: 100%|██████████| 428M/428M [02:27<00:00, 2.91MB/s]
2020-12-21 17:23:37 INFO: Finished downloading models and saved to /root/stanza_resources.


In [53]:
dictionary = []
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')
for text in texts[:5000]:
    doc = nlp(text)
    ner = {}
    for sent in doc.sentences:
        for ent in sent.ents:
            ner[ent.text] = ent.type
    
    for word in text.split(' '):
        d = {}
        if word in ner.keys():
            d[word] = ner[word]
        else:
            d[word] = 'O'
        
        dictionary.append(d)

2020-12-21 17:24:21 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| ner       | ontonotes |

2020-12-21 17:24:22 INFO: Use device: cpu
2020-12-21 17:24:22 INFO: Loading: tokenize
2020-12-21 17:24:22 INFO: Loading: ner
2020-12-21 17:24:22 INFO: Done loading processors!


In [54]:
dictionary

[{'Thousands': 'CARDINAL'},
 {'of': 'O'},
 {'demonstrators': 'O'},
 {'have': 'O'},
 {'marched': 'O'},
 {'through': 'O'},
 {'London': 'GPE'},
 {'to': 'O'},
 {'protest': 'O'},
 {'the': 'O'},
 {'war': 'O'},
 {'in': 'O'},
 {'Iraq': 'GPE'},
 {'and': 'O'},
 {'demand': 'O'},
 {'the': 'O'},
 {'withdrawal': 'O'},
 {'of': 'O'},
 {'British': 'NORP'},
 {'troops': 'O'},
 {'from': 'O'},
 {'that': 'O'},
 {'country': 'O'},
 {'.': 'O'},
 {'Iranian': 'NORP'},
 {'officials': 'O'},
 {'say': 'O'},
 {'they': 'O'},
 {'expect': 'O'},
 {'to': 'O'},
 {'get': 'O'},
 {'access': 'O'},
 {'to': 'O'},
 {'sealed': 'O'},
 {'sensitive': 'O'},
 {'parts': 'O'},
 {'of': 'O'},
 {'the': 'O'},
 {'plant': 'O'},
 {'Wednesday': 'DATE'},
 {',': 'O'},
 {'after': 'O'},
 {'an': 'O'},
 {'IAEA': 'ORG'},
 {'surveillance': 'O'},
 {'system': 'O'},
 {'begins': 'O'},
 {'functioning': 'O'},
 {'.': 'O'},
 {'Helicopter': 'O'},
 {'gunships': 'O'},
 {'Saturday': 'DATE'},
 {'pounded': 'O'},
 {'militant': 'O'},
 {'hideouts': 'O'},
 {'in': 'O'},
 

In [55]:
ner_stanza = [d.values() for d in dictionary]

In [56]:
len(ner_stanza), len(dictionary)

(109843, 109843)

In [57]:
data = data[0:109843]

In [58]:
len(data) # это количество слов в 5000 предложениях

109843

In [59]:
str(ner_stanza[2])[14:-3]

'O'

In [60]:
ner_stanza = [str(n)[14:-3] for n in ner_stanza]

In [61]:
data['Tag_Stanza_results'] = ner_stanza

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [62]:
set(data["Tag_Stanza_results"].values.tolist())

{'CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LOC',
 'MONEY',
 'NORP',
 'O',
 'ORDINAL',
 'ORG',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [63]:
from sklearn.metrics import classification_report

In [64]:
true = data['Tag_stanza']
pred = data['Tag_Stanza_results']
print(classification_report(true, pred))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    CARDINAL       0.00      0.00      0.00         0
        DATE       0.03      0.02      0.02      2609
       EVENT       0.00      0.00      0.00       100
         FAC       0.00      0.00      0.00         0
         GPE       0.07      0.03      0.05      6424
    LANGUAGE       0.00      0.00      0.00         0
         LOC       0.00      0.00      0.00         0
        MISC       0.00      0.00      0.00        46
       MONEY       0.00      0.00      0.00         0
        NORP       0.00      0.00      0.00         0
           O       0.85      0.91      0.88     92990
     ORDINAL       0.00      0.00      0.00         0
         ORG       0.04      0.01      0.01      3743
      PERSON       0.03      0.01      0.01      3813
     PRODUCT       0.00      0.00      0.00         0
    QUANTITY       0.00      0.00      0.00         0
        TIME       0.00      0.00      0.00         0
 WORK_OF_ART       0.00    

In [66]:
new_df['stanza_eng_results'] = ner_stanza[:2264]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [67]:
new_df.head()

Unnamed: 0,Sentence #,Word,POS,Tag,Tag_flair,Tag_stanza,crf_results,stanza_eng_results
0,Sentence: 1,Thousands,NNS,O,O,O,O,CARDINAL
1,Sentence: 1,of,IN,O,O,O,O,O
2,Sentence: 1,demonstrators,NNS,O,O,O,O,O
3,Sentence: 1,have,VBP,O,O,O,O,O
4,Sentence: 1,marched,VBN,O,O,O,O,O


In [68]:
new_df.to_csv('CRF.csv')

In [70]:
from google.colab import files
files.download("CRF.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>