In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.tokenize import  word_tokenize
import json
import ast

In [2]:

# File the trained CRF model is written to and later re-opened from.
model_name = 'ner_crf_trial3.model'

# True  -> positive-only corpus, first 14000 documents used for training;
# False -> full corpus, first 25000 documents used for training.
positive = True
train_len = 14000 if positive else 25000
raw_data = pd.read_csv(
    '/home/pratik/NER/data/large_positive.csv'
    if positive
    else '/home/pratik/NER/data/large_all.csv'
)

In [3]:
# Drop rows whose `body` value is duplicated. This modifies raw_data in place,
# so the unfiltered frame is no longer available after this cell has run.
raw_data.drop_duplicates(subset=['body'], inplace=True)
# Number of rows left after de-duplication.
len(raw_data)

16623

In [4]:
def convert_to_list(original_text_list):
    """Parse a stringified Python sequence and strip whitespace from each item.

    Args:
        original_text_list: Text of a Python literal such as "['a ', ' b']".

    Returns:
        A list with one whitespace-stripped string per element of the literal.
    """
    return [entry.strip() for entry in ast.literal_eval(original_text_list)]
# Replace each stringified list in `original_text` with the parsed list of
# stripped entity strings (the column must still hold the raw strings).
raw_data['original_text'] = raw_data['original_text'].apply(convert_to_list)

In [5]:
# Inspect the parsed entity lists (one list of strings per row).
raw_data['original_text']

0                [swimming pools]
1                   [sports club]
2                 [medical store]
3                          [food]
5                           [atm]
6                   [petrol pump]
7            [kvbank deposit atm]
8          [kvb bank deposit atm]
9                [online booking]
10             [paytm debit card]
11                    [something]
12                  [paytm debit]
13                     [obc bank]
14                   [Night Club]
15          [registration center]
16                    [locations]
17                     [madikere]
18                  [picnic spot]
19                [bhaddd me jau]
20                    [wesa kuvh]
21              [cabinet painter]
22                    [axis bank]
23               [tourist places]
24                     [jabalpur]
25              [barbeque nation]
26                      [batches]
27                       [sendwa]
28                  [croma store]
29                      [anymore]
30            

In [6]:
def pre_process_text(body, original_text_list, tokenizer=None):
    """Label every token of ``body`` as entity ('N') or non-entity ('I').

    A token is labelled 'N' when its lower-cased form equals any lower-cased
    token of any string in ``original_text_list``. Matching is per token and
    ignores position, so an entity word is tagged wherever it occurs in the
    body.

    Args:
        body: Message text; coerced with ``str()`` before tokenizing.
        original_text_list: Iterable of entity strings annotated for this body.
        tokenizer: Optional callable mapping a string to a list of tokens.
            Defaults to NLTK's ``word_tokenize``.

    Returns:
        A list of ``(token, label)`` tuples in token order. If tokenizing or
        lower-casing fails, the offending inputs are printed and an empty list
        is returned, so the caller still gets exactly one result per row.
    """
    if tokenizer is None:
        tokenizer = word_tokenize

    try:
        body_tokens = tokenizer(str(body))
        # A set gives O(1) membership tests and avoids repeated list copying.
        entity_tokens = set()
        for original_text in original_text_list:
            entity_tokens.update(tokenizer(original_text.lower()))
    except Exception:
        # Best effort: report the bad row but let KeyboardInterrupt/SystemExit
        # propagate (a bare ``except:`` would have swallowed those too).
        print(body, original_text_list)
        return []

    return [
        (token, 'N' if token.lower() in entity_tokens else 'I')
        for token in body_tokens
    ]

In [7]:
# Label every (body, entity list) pair, preserving the row order of raw_data.
processed_list = [
    pre_process_text(text, entities)
    for text, entities in zip(raw_data['body'], raw_data['original_text'])
]

In [8]:
# Store the labelled token sequences next to their source rows; list() makes a
# shallow copy of processed_list before it is assigned as the new column.
raw_data['processed'] = list(processed_list)

In [9]:
# Plain Python list of the per-document (token, label) sequences.
docs = list(raw_data['processed'])
# Bare expression: echoes the entire list as the cell output.
docs

[[('Swimming', 'N'), ('pools', 'N'), ('near', 'I'), ('me', 'I')],
 [('Sports', 'N'), ('club', 'N')],
 [('Hey', 'I'),
  (',', 'I'),
  ('can', 'I'),
  ('you', 'I'),
  ('find', 'I'),
  ('me', 'I'),
  ('the', 'I'),
  ('nearest', 'I'),
  ('medical', 'N'),
  ('store', 'N'),
  ('?', 'I')],
 [('More', 'I'), ('Nearby', 'I'), (':', 'I'), ('food', 'N')],
 [('Hey', 'I'),
  (',', 'I'),
  ('can', 'I'),
  ('you', 'I'),
  ('find', 'I'),
  ('me', 'I'),
  ('the', 'I'),
  ('nearest', 'I'),
  ('ATM', 'N'),
  ('?', 'I')],
 [('Hey', 'I'),
  (',', 'I'),
  ('can', 'I'),
  ('you', 'I'),
  ('find', 'I'),
  ('me', 'I'),
  ('the', 'I'),
  ('nearest', 'I'),
  ('petrol', 'N'),
  ('pump', 'N'),
  ('?', 'I')],
 [('Kvbank', 'N'), ('deposit', 'N'), ('atm', 'N')],
 [('KVB', 'N'), ('bank', 'N'), ('deposit', 'N'), ('atm', 'N')],
 [('Online', 'N'), ('booking', 'N')],
 [('Hey', 'I'),
  ('I', 'I'),
  ('want', 'I'),
  ('Paytm', 'N'),
  ('debit', 'N'),
  ('card', 'N'),
  ('queries', 'I'),
  ('number', 'I')],
 [('Look', 'I'), (

In [10]:
import nltk
def post_tag(docs):
    """Add a part-of-speech tag to every labelled token.

    Args:
        docs: Iterable of documents, each a list of ``(token, label)`` pairs.

    Returns:
        A list with one entry per document, each a list of
        ``(token, pos_tag, label)`` triples in the original token order.
    """
    tagged_docs = []
    for doc in docs:
        # nltk.pos_tag works on the bare token strings of one document.
        words = [word for word, _ in doc]
        pos_pairs = nltk.pos_tag(words)

        # Re-attach each token's label to the POS tag computed for it.
        tagged_docs.append([
            (token, pos, label)
            for (token, label), (_, pos) in zip(doc, pos_pairs)
        ])
    return tagged_docs

In [11]:
# POS-tag every document: each item becomes a (token, pos_tag, label) triple.
data = post_tag(docs)

In [12]:
def is_nearest(x):
    """Return True when 'near' is contained in ``x``, otherwise False."""
    return 'near' in x

def word2features(doc, i):
    """Build the list of string features for the token at position ``i``.

    Args:
        doc: Sequence of tuples whose first two items are the token text and
            its POS tag.
        i: Index of the token to describe.

    Returns:
        A list of feature strings: features of the token itself, then either
        features of the previous token or 'BOS', then either features of the
        next token or 'EOS'.
    """
    token, pos_tag = doc[i][0], doc[i][1]

    # Features of the token itself.
    feats = [
        'bias',
        'word.lower=' + token.lower(),
        'word[-3:]=' + token[-3:],
        'word[-2:]=' + token[-2:],
        'word.isupper=' + str(token.isupper()),
        'word.istitle=' + str(token.istitle()),
        'word.isdigit=' + str(token.isdigit()),
        'postag=' + pos_tag,
    ]

    # Left context, or a marker when the token opens the document.
    if i <= 0:
        feats.append('BOS')
    else:
        prev_token, prev_pos = doc[i - 1][0], doc[i - 1][1]
        feats += [
            '-1:word.lower=' + prev_token.lower(),
            '-1:word.istitle=' + str(prev_token.istitle()),
            '-1:word.isupper=' + str(prev_token.isupper()),
            '-1:word.isdigit=' + str(prev_token.isdigit()),
            '-1:postag=' + prev_pos,
            # Case-sensitive substring cues on the preceding token.
            '-1:is_near=' + str('near' in prev_token),
            '-1:is_close=' + str('close' in prev_token),
        ]

    # Right context, or a marker when the token closes the document.
    if i >= len(doc) - 1:
        feats.append('EOS')
    else:
        next_token, next_pos = doc[i + 1][0], doc[i + 1][1]
        feats += [
            '+1:word.lower=' + next_token.lower(),
            '+1:word.istitle=' + str(next_token.istitle()),
            '+1:word.isupper=' + str(next_token.isupper()),
            '+1:word.isdigit=' + str(next_token.isdigit()),
            '+1:postag=' + next_pos,
        ]

    return feats

In [13]:
from sklearn.model_selection import train_test_split

def extract_features(doc):
    """Return one feature list per token of ``doc``, in token order."""
    return [word2features(doc, position) for position, _ in enumerate(doc)]

def get_labels(doc):
    """Return the label of every ``(token, postag, label)`` triple in ``doc``."""
    tags = []
    for _token, _postag, tag in doc:
        tags.append(tag)
    return tags

# Feature sequences and label sequences, one entry per document.
X = list(map(extract_features, data))
y = list(map(get_labels, data))

# Positional hold-out (no shuffling): the first `train_len` documents are used
# for training and everything after them for testing.
X_train, X_test = X[:train_len], X[train_len:]
y_train, y_test = y[:train_len], y[train_len:]


In [14]:
len(X_train)

14000

In [15]:
import pycrfsuite

def train_metrics():
    """Print and return the saved model's classification report on the training split.

    Reads the module-level ``model_name``, ``X_train`` and ``y_train``.

    Returns:
        The text report produced by ``sklearn.metrics.classification_report``
        for the token-level 'I' / 'N' labels.
    """
    from sklearn.metrics import classification_report

    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open(model_name)
    predicted_rows = [crf_tagger.tag(features) for features in X_train]

    # 'I' -> 0 and 'N' -> 1, matching the order of target_names below.
    tag_index = {"N": 1, "I": 0}
    print(len(predicted_rows))

    # Flatten the per-document tag sequences into 1-D integer arrays.
    flat_predicted = np.array([tag_index[tag] for seq in predicted_rows for tag in seq])
    flat_expected = np.array([tag_index[tag] for seq in y_train for tag in seq])

    report = classification_report(
        flat_expected, flat_predicted,
        target_names=["I", "N"])
    print('train', report)
    return report

    

def train_crf_model(c1=10, c2=10, max_iterations=1000):
    """Train a CRF on the training split, save it, and report test-split metrics.

    Reads the module-level ``X_train``, ``y_train``, ``X_test``, ``y_test`` and
    ``model_name``; the trained model is written to ``model_name`` and then
    re-opened from that file for tagging.

    Args:
        c1: Coefficient for the L1 penalty.
        c2: Coefficient for the L2 penalty.
        max_iterations: Maximum number of training iterations.

    Returns:
        The text report produced by ``sklearn.metrics.classification_report``
        for the token-level 'I' / 'N' labels of the test split.
    """
    from sklearn.metrics import classification_report

    trainer = pycrfsuite.Trainer(verbose=False)

    # Submit training data to the trainer
    for xseq, yseq in zip(X_train, y_train):
        trainer.append(xseq, yseq)

    # Set the parameters of the model
    trainer.set_params({
        # coefficient for L1 penalty
        'c1': c1,

        # coefficient for L2 penalty
        'c2': c2,

        # maximum number of iterations
        'max_iterations': max_iterations,

        # whether to include transitions that
        # are possible, but not observed
        'feature.possible_transitions': True
    })

    # Provide a file name as a parameter to the train function, such that
    # the model will be saved to the file when training is finished
    trainer.train(model_name)

    tagger = pycrfsuite.Tagger()
    tagger.open(model_name)

    y_pred = [tagger.tag(xseq) for xseq in X_test]

    # NOTE: a former no-op debugging loop over ``y_pred[i]`` was removed here;
    # it read a global ``i`` and so failed whenever no suitable ``i`` existed.

    # Create a mapping of labels to indices
    labels = {"N": 1, "I": 0}
    print(len(y_pred))
    # Convert the sequences of tags into a 1-dimensional array
    predictions = np.array([labels[tag] for row in y_pred for tag in row])
    truths = np.array([labels[tag] for row in y_test for tag in row])

    # Build the report once, print it, and hand the same text back.
    report = classification_report(
        truths, predictions,
        target_names=["I", "N"])
    print(report)
    return report


In [16]:
# Column-oriented record of the grid search: each trained model contributes two
# entries to every list, one for the test split and one for the training split.
result_analysis = {'c1': [], 'c2': [], 'result': [], 'max_iterations': [], 'train/test': [], 'model_name': []}

# i -> c1 in {0, 5, 10}; j -> c2 in {0, 5, 10};
# k -> max_iterations in 1000..9000 (step 1000).
for i in range(0,15,5):
    for j in range(0,15,5):
        for k in range(1000,10000,1000):
            print('C1: ', i, 'C2: ', j, 'MAX_ITERS:', k)
            # train_crf_model() and train_metrics() read the module-level
            # model_name, so it is rebound here before each call.
            model_name = '/home/pratik/crf_model/pos' + str(i) + str(j) + str(k) + '.model'
            # Train, save to model_name, and get the test-split report.
            result = train_crf_model(c1=i, c2=j, max_iterations=k)
            result_analysis['c1'].append(i)
            result_analysis['c2'].append(j)
            result_analysis['result'].append(result)
            result_analysis['max_iterations'].append(k)
            result_analysis['train/test'].append('test')
            result_analysis['model_name'].append(model_name)
            # Re-open the model just saved and get the training-split report.
            result = train_metrics()
            result_analysis['c1'].append(i)
            result_analysis['c2'].append(j)
            result_analysis['result'].append(result)
            result_analysis['max_iterations'].append(k)
            result_analysis['train/test'].append('train')
            result_analysis['model_name'].append(model_name)
            print('--------------------------')

C1:  0 C2:  0 MAX_ITERS: 1000
2623
             precision    recall  f1-score   support

          I       0.92      0.88      0.90      7148
          N       0.83      0.88      0.85      4681

avg / total       0.88      0.88      0.88     11829

14000
train              precision    recall  f1-score   support

          I       0.97      0.98      0.97     29866
          N       0.97      0.96      0.97     24231

avg / total       0.97      0.97      0.97     54097

--------------------------
C1:  0 C2:  0 MAX_ITERS: 2000
2623
             precision    recall  f1-score   support

          I       0.92      0.87      0.89      7148
          N       0.82      0.88      0.85      4681

avg / total       0.88      0.87      0.87     11829

14000
train              precision    recall  f1-score   support

          I       0.97      0.97      0.97     29866
          N       0.97      0.96      0.97     24231

avg / total       0.97      0.97      0.97     54097

-------------------

14000
train              precision    recall  f1-score   support

          I       0.92      0.90      0.91     29866
          N       0.88      0.90      0.89     24231

avg / total       0.90      0.90      0.90     54097

--------------------------
C1:  0 C2:  5 MAX_ITERS: 9000
2623
             precision    recall  f1-score   support

          I       0.93      0.89      0.91      7148
          N       0.84      0.89      0.87      4681

avg / total       0.89      0.89      0.89     11829

14000
train              precision    recall  f1-score   support

          I       0.92      0.90      0.91     29866
          N       0.88      0.90      0.89     24231

avg / total       0.90      0.90      0.90     54097

--------------------------
C1:  0 C2:  10 MAX_ITERS: 1000
2623
             precision    recall  f1-score   support

          I       0.92      0.89      0.91      7148
          N       0.84      0.89      0.86      4681

avg / total       0.89      0.89      0.89   

2623
             precision    recall  f1-score   support

          I       0.92      0.88      0.90      7148
          N       0.83      0.89      0.86      4681

avg / total       0.89      0.89      0.89     11829

14000
train              precision    recall  f1-score   support

          I       0.91      0.89      0.90     29866
          N       0.87      0.89      0.88     24231

avg / total       0.89      0.89      0.89     54097

--------------------------
C1:  5 C2:  0 MAX_ITERS: 8000
2623
             precision    recall  f1-score   support

          I       0.92      0.88      0.90      7148
          N       0.83      0.89      0.86      4681

avg / total       0.89      0.89      0.89     11829

14000
train              precision    recall  f1-score   support

          I       0.91      0.89      0.90     29866
          N       0.87      0.89      0.88     24231

avg / total       0.89      0.89      0.89     54097

--------------------------
C1:  5 C2:  0 MAX_ITER

14000
train              precision    recall  f1-score   support

          I       0.90      0.88      0.89     29866
          N       0.86      0.88      0.87     24231

avg / total       0.88      0.88      0.88     54097

--------------------------
C1:  5 C2:  10 MAX_ITERS: 6000
2623
             precision    recall  f1-score   support

          I       0.92      0.88      0.90      7148
          N       0.83      0.88      0.85      4681

avg / total       0.88      0.88      0.88     11829

14000
train              precision    recall  f1-score   support

          I       0.90      0.88      0.89     29866
          N       0.86      0.88      0.87     24231

avg / total       0.88      0.88      0.88     54097

--------------------------
C1:  5 C2:  10 MAX_ITERS: 7000
2623
             precision    recall  f1-score   support

          I       0.92      0.88      0.90      7148
          N       0.83      0.88      0.85      4681

avg / total       0.88      0.88      0.88  

2623
             precision    recall  f1-score   support

          I       0.92      0.88      0.90      7148
          N       0.83      0.88      0.85      4681

avg / total       0.88      0.88      0.88     11829

14000
train              precision    recall  f1-score   support

          I       0.90      0.88      0.89     29866
          N       0.85      0.88      0.86     24231

avg / total       0.88      0.88      0.88     54097

--------------------------
C1:  10 C2:  5 MAX_ITERS: 5000
2623
             precision    recall  f1-score   support

          I       0.92      0.88      0.90      7148
          N       0.83      0.88      0.85      4681

avg / total       0.88      0.88      0.88     11829

14000
train              precision    recall  f1-score   support

          I       0.90      0.88      0.89     29866
          N       0.85      0.88      0.86     24231

avg / total       0.88      0.88      0.88     54097

--------------------------
C1:  10 C2:  5 MAX_IT

In [17]:
# One row per (model, split); the columns are the keys of result_analysis.
result_pandas = pd.DataFrame(result_analysis)

In [18]:
# Each metric is taken from a fixed position among the whitespace-separated
# tokens of the report text: 5-7 for the 'I' row, 10-12 for the 'N' row and
# 17-19 for the 'avg / total' row. The extracted values stay strings.
for metric_column, token_index in [
    ('Iprecision', 5), ('Irecall', 6), ('If_score', 7),
    ('Nprecision', 10), ('Nrecall', 11), ('Nf_score', 12),
    ('precision', 17), ('recall', 18), ('f_score', 19),
]:
    result_pandas[metric_column] = result_pandas['result'].apply(
        lambda report, idx=token_index: report.split()[idx]
    )

In [19]:
# Display the grid-search table with the extracted metric columns.
result_pandas

Unnamed: 0,c1,c2,max_iterations,model_name,result,train/test,Iprecision,Irecall,If_score,Nprecision,Nrecall,Nf_score,precision,recall,f_score
0,0,0,1000,/home/pratik/crf_model/pos001000.model,precision recall f1-score s...,test,0.92,0.88,0.90,0.83,0.88,0.85,0.88,0.88,0.88
1,0,0,1000,/home/pratik/crf_model/pos001000.model,precision recall f1-score s...,train,0.97,0.98,0.97,0.97,0.96,0.97,0.97,0.97,0.97
2,0,0,2000,/home/pratik/crf_model/pos002000.model,precision recall f1-score s...,test,0.92,0.87,0.89,0.82,0.88,0.85,0.88,0.87,0.87
3,0,0,2000,/home/pratik/crf_model/pos002000.model,precision recall f1-score s...,train,0.97,0.97,0.97,0.97,0.96,0.97,0.97,0.97,0.97
4,0,0,3000,/home/pratik/crf_model/pos003000.model,precision recall f1-score s...,test,0.91,0.87,0.89,0.81,0.87,0.84,0.87,0.87,0.87
5,0,0,3000,/home/pratik/crf_model/pos003000.model,precision recall f1-score s...,train,0.97,0.97,0.97,0.97,0.96,0.97,0.97,0.97,0.97
6,0,0,4000,/home/pratik/crf_model/pos004000.model,precision recall f1-score s...,test,0.91,0.87,0.89,0.81,0.87,0.84,0.87,0.87,0.87
7,0,0,4000,/home/pratik/crf_model/pos004000.model,precision recall f1-score s...,train,0.97,0.97,0.97,0.97,0.96,0.97,0.97,0.97,0.97
8,0,0,5000,/home/pratik/crf_model/pos005000.model,precision recall f1-score s...,test,0.91,0.87,0.89,0.81,0.87,0.84,0.87,0.87,0.87
9,0,0,5000,/home/pratik/crf_model/pos005000.model,precision recall f1-score s...,train,0.97,0.97,0.97,0.97,0.96,0.97,0.97,0.97,0.97


In [29]:
# Test-split rows only, reduced to the entity-class ('N') metrics and the
# hyper-parameters that produced them.
test_result = result_pandas.loc[
    result_pandas['train/test'] == 'test',
    ['Nprecision', 'Nrecall', 'Nf_score', 'c1', 'c2', 'max_iterations'],
]

In [30]:
# Display the test-split metrics per hyper-parameter combination.
test_result

Unnamed: 0,Nprecision,Nrecall,Nf_score,c1,c2,max_iterations
0,0.83,0.88,0.85,0,0,1000
2,0.82,0.88,0.85,0,0,2000
4,0.81,0.87,0.84,0,0,3000
6,0.81,0.87,0.84,0,0,4000
8,0.81,0.87,0.84,0,0,5000
10,0.81,0.87,0.84,0,0,6000
12,0.81,0.87,0.84,0,0,7000
14,0.81,0.87,0.84,0,0,8000
16,0.81,0.87,0.84,0,0,9000
18,0.84,0.89,0.87,0,5,1000


In [22]:
# Re-open the model file currently named by model_name and tag the test split.
tagger = pycrfsuite.Tagger()
tagger.open(model_name)

y_pred = [tagger.tag(xseq) for xseq in X_test]

# One record per test token: predicted tag, expected tag, the lower-cased word
# and whether the two tags agree.
predictions_analysis = {'status': [], 'expected': [], 'predicted': [], 'word': []}
for pred_seq, feature_seq, true_seq in zip(y_pred, X_test, y_test):
    # Distinct loop names are used so the module-level X / y lists are not
    # overwritten by per-token values.
    for pred_tag, token_features, true_tag in zip(pred_seq, feature_seq, true_seq):
        # token_features[1] is 'word.lower=<token>'; split on the first '='
        # only, so tokens that themselves contain '=' are recovered intact.
        token_text = token_features[1].split("=", 1)[1]
        predictions_analysis['status'].append(pred_tag == true_tag)
        predictions_analysis['predicted'].append(pred_tag)
        predictions_analysis['expected'].append(true_tag)
        predictions_analysis['word'].append(token_text)

import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}
print(len(y_pred))
# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))


2623
             precision    recall  f1-score   support

          I       0.91      0.88      0.90      7148
          N       0.83      0.87      0.85      4681

avg / total       0.88      0.88      0.88     11829



In [23]:
# One row per test token, with columns status / expected / predicted / word.
pandas_prediction_analysis = pd.DataFrame(predictions_analysis)

In [24]:
# Encode the tags as integers ('I' -> 0, 'N' -> 1) for the confusion matrix.
# NOTE: this rebinds `labels`, which an earlier cell used for the tag->index
# dict, and `predictions`, which held the flattened predicted indices.
labels = np.array(pandas_prediction_analysis['expected'].map({'N': 1, 'I': 0 } ))
predictions = np.array(pandas_prediction_analysis['predicted'].map({'N': 1, 'I': 0 } ))

In [25]:
from sklearn.metrics import confusion_matrix
# Flatten the 2x2 confusion matrix (rows = expected, columns = predicted, with
# class 0 = 'I' and class 1 = 'N') into its four counts.
tn, fp, fn, tp  = confusion_matrix(labels, predictions).ravel()

In [26]:
# Display the four confusion-matrix counts as a tuple.
tn, fp, fn, tp

(6284, 864, 594, 4087)

In [31]:
# Row with the highest entity-class F1 on the test split.
# Nf_score holds text tokens taken from the report, so compare them as floats.
# idxmax() returns an index *label*, which is what .loc expects; a positional
# argmax() would be wrong here because the index is non-contiguous (0, 2, 4, ...).
test_result.loc[test_result['Nf_score'].astype(float).idxmax()]

Nprecision        0.84
Nrecall           0.89
Nf_score          0.87
c1                   0
c2                   5
max_iterations    1000
Name: 18, dtype: object

In [28]:
#result_pandas.to_csv()
#pandas_prediction_analysis.to_csv()


In [32]:
# Write the full grid-search table (including the raw report text) to
# crf_positive.csv, a path relative to the current working directory.
result_pandas.to_csv('crf_positive.csv')

Unnamed: 0,c1,c2,max_iterations,model_name,result,train/test,Iprecision,Irecall,If_score,Nprecision,Nrecall,Nf_score,precision,recall,f_score
0,0,0,1000,/home/pratik/crf_model/pos001000.model,precision recall f1-score s...,test,0.92,0.88,0.90,0.83,0.88,0.85,0.88,0.88,0.88
1,0,0,1000,/home/pratik/crf_model/pos001000.model,precision recall f1-score s...,train,0.97,0.98,0.97,0.97,0.96,0.97,0.97,0.97,0.97
2,0,0,2000,/home/pratik/crf_model/pos002000.model,precision recall f1-score s...,test,0.92,0.87,0.89,0.82,0.88,0.85,0.88,0.87,0.87
3,0,0,2000,/home/pratik/crf_model/pos002000.model,precision recall f1-score s...,train,0.97,0.97,0.97,0.97,0.96,0.97,0.97,0.97,0.97
4,0,0,3000,/home/pratik/crf_model/pos003000.model,precision recall f1-score s...,test,0.91,0.87,0.89,0.81,0.87,0.84,0.87,0.87,0.87
5,0,0,3000,/home/pratik/crf_model/pos003000.model,precision recall f1-score s...,train,0.97,0.97,0.97,0.97,0.96,0.97,0.97,0.97,0.97
6,0,0,4000,/home/pratik/crf_model/pos004000.model,precision recall f1-score s...,test,0.91,0.87,0.89,0.81,0.87,0.84,0.87,0.87,0.87
7,0,0,4000,/home/pratik/crf_model/pos004000.model,precision recall f1-score s...,train,0.97,0.97,0.97,0.97,0.96,0.97,0.97,0.97,0.97
8,0,0,5000,/home/pratik/crf_model/pos005000.model,precision recall f1-score s...,test,0.91,0.87,0.89,0.81,0.87,0.84,0.87,0.87,0.87
9,0,0,5000,/home/pratik/crf_model/pos005000.model,precision recall f1-score s...,train,0.97,0.97,0.97,0.97,0.96,0.97,0.97,0.97,0.97
