## Import and Setup

In [1]:
from itertools import chain
import glob
import pandas as pd
import re
import numpy as np
import nltk
import math
from nltk.tag import CRFTagger
import pycrfsuite
from hmmlearn import hmm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import classification_report, confusion_matrix
#import dateparser
#from dateparser.search import search_dates

#nltk.download()

## Functions

In [2]:
def save_as_csv(filename, dataframe) :
    dataframe.to_csv(filename, encoding='utf-8')

In [3]:
def read_csv(path, last_column):
    data_input = pd.read_csv(path, header=0, usecols=range(1,last_column))
    data_frame_input = pd.DataFrame(data = data_input)
    return data_frame_input

In [4]:
def get_promo_data_index(data):
    numpy_data = data.copy(deep=True)
    numpy_data = np.array(numpy_data)
    new_data = []
    for i in range(len(numpy_data)):
        if (numpy_data[i][2] == 1):
            new_data.append(i)
    new_data = np.array(new_data)
    return (new_data)

def get_promo_data(data):
    numpy_data = data.copy(deep=True)
    numpy_data = np.array(numpy_data)
    new_data = []
    for i in range(len(numpy_data)):
        if (numpy_data[i][2] == 1):
            new_data.append(numpy_data[:,[0,1]][i])
    new_data = np.array(new_data)
    new_data = pd.DataFrame(data=new_data,columns=["tweet","username"])
    return (new_data)

In [5]:
def get_url(text):
    result = re.findall(r"http\S+",text)
    return(result)

def get_url_data(data):
    numpy_data = data.copy(deep=True)
    numpy_data = np.array(numpy_data)
    new_data = []
    for i in range(len(numpy_data)):
        url_result = get_url(numpy_data[i][0])
        new_data.append(url_result)
    new_data = np.array(new_data)
    new_data = pd.DataFrame(new_data,columns=["urls"])
    return(new_data)

In [6]:
def get_number(text):
    filter_text = re.sub(r"http\S+", "", text)
    result = re.findall(r"[0-9]\S+",filter_text)
    return (result)

def get_number_data(data):
    numpy_data = data.copy(deep=True)
    numpy_data = np.array(numpy_data)
    new_data = []
    for i in range(len(numpy_data)):
        number_result = get_number(numpy_data[i][0])
        new_data.append(number_result)
    new_data = np.array(new_data)
    new_data = pd.DataFrame(new_data,columns=["number"])
    return(new_data)

print(get_number("15rb"))

['15rb']


In [7]:
def get_discount(text):
    #result = re.findall(r"(?:[a-zA-Z'-]+[^a-zA-Z'-]+){0,2}\S+%(?:[^a-zA-Z'-]+[a-zA-Z'-]+){0,2}",text)
    result = re.findall(r"\S+%",text)
    return (result)

def get_discount_data(data):
    numpy_data = data.copy(deep=True)
    numpy_data = np.array(numpy_data)
    new_data = []
    for i in range(len(numpy_data)):
        result = get_discount(numpy_data[i][0])
        new_data.append(result)
    new_data = np.array(new_data)
    new_data = pd.DataFrame(new_data,columns=["discount"])
    return(new_data)

print(get_discount("Diskon ini 10000% dari biasanya"))

['10000%']


In [8]:
def get_price(text):
    filter_text = re.sub(r"http\S+", "", text)
    filter_text = re.sub(r"08[0-9]\S+", "", filter_text)
    filter_text = re.sub(r"\s*k", ".000", filter_text)
    filter_text = re.sub(r"\s*rb\w*", ".000", filter_text)
    filter_text = re.sub(r"\s*ribu\w*", ".000", filter_text)
    filter_text = re.sub(r"\s*rebu\w*", ".000", filter_text)
    filter_text = re.sub(r"\s*jt\w*", ".000.000", filter_text)
    filter_text = re.sub(r"\s*juta\w*", ".000.000", filter_text)    
    filter_text = re.sub(r"[^\s0-9.]", "", filter_text)
    
    result = re.findall(r"[1-9]+\S+.[0-9]+[0]{2}",filter_text)
    return (result)

def get_price_data(data):
    numpy_data = data.copy(deep=True)
    numpy_data = np.array(numpy_data)
    new_data = []
    for i in range(len(numpy_data)):
        result = get_price(numpy_data[i][0])
        new_data.append(result)
    new_data = np.array(new_data)
    new_data = pd.DataFrame(new_data,columns=["price"])
    return(new_data)

print(get_price("0857-000 rp15rb rp15ribuan rp15juta 15 juta 15 puluh puluh juta 15jt 15.000 15.00"))

['15.000', '15.000', '15.000.000', '15.000.000', '15.000.000', '15.000']


In [9]:
def get_date(text):
    result = ""
    return result
    
def get_date_data(data):
    numpy_data = data.copy(deep=True)
    numpy_data = np.array(numpy_data)
    new_data = []
    for i in range(len(numpy_data)):
        date_result = get_date(numpy_data[i][0])
        new_data.append(date_result)
    new_data = np.array(new_data)
    new_data = pd.DataFrame(new_data,columns=["date"])
    return(new_data)

print(get_date("1 1 2018"))




In [10]:
def merge_data(datas1, datas2):
    np1 = np.array(datas1)
    np2 = np.array(datas2)
    column1 = datas1.columns.values
    column2 = datas2.columns.values
    columnresult = np.append(column1,column2, axis=0)
    result = np.append(np1, np2, axis=1)
    result = pd.DataFrame(data=result, columns=columnresult)
    return (result)

In [11]:
def get_by_index(data,index_array):
    numpy_data = data.copy(deep=True)
    numpy_data = np.array(numpy_data)
    new_data = []
    check_index = 0
    for i in range(len(numpy_data)):
        if (check_index < len(index_array)) and (i == index_array[check_index]):
            check_index = check_index + 1
            new_data.append(numpy_data[i])
    new_data = np.array(new_data)
    new_data = pd.DataFrame(data=new_data,columns=["tweet","username"])
    return new_data

## Main

In [12]:
raw_data = read_csv("integrated_data/integrated_with_label(full).csv",4)
print(raw_data)

                                                      0                1    2
0      ya anjir cpt bgt ya abis barang manusia diskon:(         SOHEEDCH  0.0
1     lazada 11.11.2019 diskon terbesar 24 jam https...        Vallno_07  1.0
2     selama kalo liat promo diskon 70% + 30% dalem ...         bayu_joo  0.0
3     apa bedanya matahari sama bulan?  matahari ada...       Hanifati_f  0.0
4     @garudacares bagaimana cara tukar mileage disk...           Nug_QA  1.0
5     @gencamon nahiya benerrr hahahaha kalo makan c...         primetam  0.0
6            fak gmarket kenapa diskon nya sekarang sih       singularyv  0.0
7     sejarah asia tenggara karya m.c. ricklefs rp 3...   KomunitasBambu  1.0
8     @amingcoffee transmart kuburaya diskon, kapan ...      kalbarinfo_  0.0
9     tah diskon 50% cenah..  yuk diserbu..   https:...   AmaAbiyyunShoh  1.0
10    @wongalas901 @prabowo @aniesbaswedan rika wis ...     Dianhandoko6  0.0
11    sebenarnya bahagia itu sederhana... - beli ini...        a

In [13]:
raw_promo_data = get_promo_data(raw_data)
raw_promo_data_index = get_promo_data_index(raw_data)
print(raw_promo_data_index)
print(raw_promo_data)

[  1   4   7   9  15  17  18  20  22  23  24  25  27  30  33  35  36  38
  39  40  42  43  44  45  47  48  49  52  53  54  55  56  58  59  60  61
  62  63  64  68  70  72  73  77  78  79  80  81  83  84  85  87  89  90
  94  95  96  97 100 103 104 105 106 107 108 110 112 114 116 118 119 121
 122 123 124 126 128 129 131 134 135 136 139 143 145 146 151 152 154 155
 156 158 162 163 164 167 169 171 172 173 174 175 176 177 179 180 181 187
 188 190 195 196 198]
                                                 tweet         username
0    lazada 11.11.2019 diskon terbesar 24 jam https...        Vallno_07
1    @garudacares bagaimana cara tukar mileage disk...           Nug_QA
2    sejarah asia tenggara karya m.c. ricklefs rp 3...   KomunitasBambu
3    tah diskon 50% cenah..  yuk diserbu..   https:...   AmaAbiyyunShoh
4    yaampun kak aulll. eh tapi itu make over lagi ...      tyoungjaems
5    pesta harga akhir tahun di giias medan auto sh...      IcanIndopro
6    hanya hari ini! shopee promo 11

In [14]:
url_data = get_url_data(raw_promo_data)

In [15]:
number_data = get_number_data(raw_promo_data)
for i in range(len(number_data)):
    print(number_data["number"][i])

['11.11.2019', '24']
['70%']
['350.000', '210.000', '0813-8543-0505', '08.00-17.00']
['50%']
[]
['2018!']
['11.11', '11.11', '90%']
['20%']
['20%', '100rb.']
['01-15', '2018']
['50%']
['84%!!']
['30rebu']
['2018']
['10']
['25%']
['120rb/pcs']
['25.500', '34.000', '15.800', '20.000']
['70%...benar']
['10%']
['50ribuan', '50%.']
['25%']
['1.000.000.']
[]
['11', '2018.', '2018goestojungleland']
['30%']
['25', '2018', '121']
['30%']
['120.000', '84.000', '36.000', '30%', '10.800,', '25.200']
[]
['2..', '10%', '2256ea97']
['2018:', '90%']
['2018:', '90%']
[]
[]
['110rb/pcs']
['7.500']
[]
[]
['790k', '560k']
[]
['15%', '1,2', '04']
['15%', '1,2', '04']
[]
['100%', '70%.']
[]
[]
['75%']
['150.000']
[]
['085701512000']
[]
['12', '30%']
['20%']
['60-80%', '20rb']
['10%', '15%', '20%', '082324126647']
['11/11', '12/12', '11/11', '12/12']
['59rb', '1602', '60rb']
['13', '89.000', '71.200']
['15%', '10%']
['11.11']
['15%']
['10%']
['97q9anm2xnt']
['50%', '31', '2018']
['50%']
['15%.', '2018!']
['2

In [16]:
discount_data = get_discount_data(raw_promo_data)
print(discount_data)

       discount
0            []
1         [70%]
2            []
3         [50%]
4            []
5            []
6         [90%]
7         [20%]
8         [20%]
9            []
10        [50%]
11        [84%]
12           []
13           []
14           []
15        [25%]
16           []
17           []
18        [70%]
19        [10%]
20        [50%]
21        [25%]
22           []
23           []
24           []
25        [30%]
26           []
27        [30%]
28        [30%]
29           []
..          ...
83           []
84           []
85        [25%]
86        [50%]
87   [10%, 15%]
88           []
89           []
90        [35%]
91        [65%]
92           []
93           []
94        [50%]
95   [10%, 15%]
96        [60%]
97        [50%]
98        [50%]
99        [50%]
100       [50%]
101       [50%]
102       [50%]
103   [10%, 0%]
104  [70%, 10%]
105       [50%]
106       [25%]
107       [50%]
108          []
109          []
110          []
111       [50%]
112          []

[113 ro

In [17]:
price_data = get_price_data(raw_promo_data)
print(price_data)

                                         price
0                                           []
1                                           []
2                           [350.000, 210.000]
3                                           []
4                                           []
5                                           []
6                                           []
7                                           []
8                                    [100.000]
9                                           []
10                                          []
11                                          []
12                                    [30.000]
13                                          []
14                                          []
15                                          []
16                                   [120.000]
17            [25.500, 34.000, 15.800, 20.000]
18                                          []
19                                          []
20           

In [18]:
ie_data = merge_data(raw_promo_data, url_data)
ie_data = merge_data(ie_data, discount_data)
ie_data = merge_data(ie_data, price_data)
print(ie_data)

                                                 tweet         username  \
0    lazada 11.11.2019 diskon terbesar 24 jam https...        Vallno_07   
1    @garudacares bagaimana cara tukar mileage disk...           Nug_QA   
2    sejarah asia tenggara karya m.c. ricklefs rp 3...   KomunitasBambu   
3    tah diskon 50% cenah..  yuk diserbu..   https:...   AmaAbiyyunShoh   
4    yaampun kak aulll. eh tapi itu make over lagi ...      tyoungjaems   
5    pesta harga akhir tahun di giias medan auto sh...      IcanIndopro   
6    hanya hari ini! shopee promo 11.11 big sale! 1...   katalogpromosi   
7    cgv cinemas kini hadir di icon mall #gresik . ...         GoersApp   
8    beli tiket kereta api di @tokopedia diskon 20%...        promo_BRI   
9    buat kalian yang bisa jajan di alfamart - yuk ...   katalogpromosi   
10   starbucks promo diskon 50% untuk pembelian min...   katalogpromosi   
11   sempurnakan alat dapurmu yuk! peralatan dapur ...       alfacartID   
12   @yrachmania temen gu

In [19]:
raw_normalized_data = read_csv("integrated_data/normalized.csv",3)
raw_normalized_data = get_by_index(raw_normalized_data, raw_promo_data_index)
print(raw_normalized_data)

                                                 tweet         username
0             lazada 11 11 2019 diskon terbesar 24 jam        Vallno_07
1     bagaimana cara tukar mileage diskon 70 % garu...           Nug_QA
2     sejarah asia tenggara karya m c ricklefs rupi...   KomunitasBambu
3                      nah diskon 50 % lho yuk diserbu   AmaAbiyyunShoh
4     astaga kak aul eh tapi itu make over lagi dis...      tyoungjaems
5     pesta harga akhir tahun di giias medan auto s...      IcanIndopro
6     hanya hari ini shopee promo 11 11 big sale 11...   katalogpromosi
7     cgv cinemas kini hadir di icon mall dapat dis...         GoersApp
8     beli tiket kereta api di diskon 20 % pakai ep...        promo_BRI
9     untuk kalian yang bisa jajan di alfamart yuk ...   katalogpromosi
10    starbucks promo diskon 50 % untuk beli minuma...   katalogpromosi
11    sempurna alat dapur yuk peralatan dapur yang ...       alfacartID
12    teman saya titip teman saat di hongkong lagi ...      urpo

## Eksperimen Post Tag dengan Gradient descent using the L-BFGS method

In [20]:
def read_pos_tag_corpus(file_name,sample_count=1000):
    with open(file_name, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    pair = []
    all_pair = []
    for line in lines[: min(sample_count, len(lines))]:
        if line == '':
            all_pair.append(pair)
            pair = []
        else:
            kata, tag = line.split('\t')
            p = (kata.lower(),tag)
            pair.append(p)
    return all_pair

pos_tag_corpus = read_pos_tag_corpus("text_corpus/Indonesian_Manually_Tagged_Corpus.txt")
print(pos_tag_corpus)

[[('kera', 'NN'), ('untuk', 'SC'), ('amankan', 'VB'), ('pesta olahraga', 'NN')], [('pemerintah', 'NNP'), ('kota', 'NNP'), ('delhi', 'NNP'), ('mengerahkan', 'VB'), ('monyet', 'NN'), ('untuk', 'SC'), ('mengusir', 'VB'), ('monyet-monyet', 'NN'), ('lain', 'JJ'), ('yang', 'SC'), ('berbadan', 'VB'), ('lebih', 'RB'), ('kecil', 'JJ'), ('dari', 'IN'), ('arena', 'NN'), ('pesta olahraga', 'NNP'), ('persemakmuran', 'NNP'), ('.', 'Z')], [('beberapa', 'CD'), ('laporan', 'NN'), ('menyebutkan', 'VB'), ('setidaknya', 'RB'), ('10', 'CD'), ('monyet', 'NN'), ('ditempatkan', 'VB'), ('di', 'IN'), ('luar', 'NN'), ('arena', 'NN'), ('lomba', 'NN'), ('dan', 'CC'), ('pertandingan', 'NN'), ('di', 'IN'), ('ibukota', 'NNP'), ('india', 'NNP'), ('.', 'Z')], [('pemkot', 'NNP'), ('delhi', 'NNP'), ('memiliki', 'VB'), ('28', 'CD'), ('monyet', 'NN'), ('dan', 'CC'), ('berencana', 'VB'), ('mendatangkan', 'VB'), ('10', 'CD'), ('monyet', 'NN'), ('sejenis', 'NN'), ('dari', 'IN'), ('negara bagian', 'NNP'), ('rajasthan', 'NNP'),

In [21]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        #'postag=' + postag,
        #'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [postag for token, postag in sent]

def sent2tokens(sent):
    return [token for token, postag in sent]

print(sent2features(pos_tag_corpus[0])[0])

['bias', 'word.lower=kera', 'word[-3:]=era', 'word[-2:]=ra', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'BOS', '+1:word.lower=untuk', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=SC', '+1:postag[:2]=SC']


In [22]:
X = [sent2features(s) for s in pos_tag_corpus]
y = [sent2labels(s) for s in pos_tag_corpus]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

print(X_train)

[[['bias', 'word.lower=salah satu', 'word[-3:]=atu', 'word[-2:]=tu', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'BOS', '+1:word.lower=dari', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=IN', '+1:postag[:2]=IN'], ['bias', 'word.lower=dari', 'word[-3:]=ari', 'word[-2:]=ri', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', '-1:word.lower=salah satu', '-1:word.istitle=False', '-1:word.isupper=False', '-1:postag=CD', '-1:postag[:2]=CD', '+1:word.lower=pihak', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=NN', '+1:postag[:2]=NN'], ['bias', 'word.lower=pihak', 'word[-3:]=hak', 'word[-2:]=ak', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', '-1:word.lower=dari', '-1:word.istitle=False', '-1:word.isupper=False', '-1:postag=IN', '-1:postag[:2]=IN', '+1:word.lower=yang', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=SC', '+1:postag[:2]=SC'], ['bias', 'word.lower=yang', 'word[-3:]=ang', 'word[-2

In [23]:
trainer = pycrfsuite.Trainer(algorithm='lbfgs',verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
    
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

print(trainer.params())

trainer.train('conll2002-esp.crfsuite')

['feature.minfreq', 'feature.possible_states', 'feature.possible_transitions', 'c1', 'c2', 'max_iterations', 'num_memories', 'epsilon', 'period', 'delta', 'linesearch', 'max_linesearch']


In [24]:
trainer.logparser.last_iteration

{'active_features': 528,
 'error_norm': 12.68808,
 'feature_norm': 26.273679,
 'linesearch_step': 1.0,
 'linesearch_trials': 1,
 'loss': 809.239624,
 'num': 50,
 'scores': {},
 'time': 0.002}

In [25]:
!ls -lh ./conll2002-esp.crfsuite

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [26]:
tagger = pycrfsuite.Tagger()
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x277f6d0a0b8>

In [27]:
example_sent = pos_tag_corpus[0]
print(example_sent)
print(' '.join(sent2tokens(example_sent)), end='\n\n')

print("Predicted:", ' '.join(tagger.tag(sent2features(example_sent))))
print("Correct:  ", ' '.join(sent2labels(example_sent)))

[('kera', 'NN'), ('untuk', 'SC'), ('amankan', 'VB'), ('pesta olahraga', 'NN')]
kera untuk amankan pesta olahraga

Predicted: NN SC VB NN
Correct:   NN SC VB NN


In [28]:
def postag_classification_report(y_true, y_pred):
    lb = preprocessing.LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [29]:
y_pred = [tagger.tag(xseq) for xseq in X_test]

In [30]:
print(postag_classification_report(y_test, y_pred))

             precision    recall  f1-score   support

         CC       1.00      0.75      0.86         4
         CD       1.00      0.50      0.67         4
         FW       0.00      0.00      0.00         1
         IN       0.75      0.75      0.75         4
         JJ       0.60      0.43      0.50         7
         MD       0.67      1.00      0.80         4
        NEG       1.00      1.00      1.00         1
         NN       0.57      0.95      0.71        21
        NNP       0.75      0.86      0.80         7
         PR       1.00      1.00      1.00         2
        PRP       1.00      1.00      1.00         7
         RB       0.00      0.00      0.00         3
         RP       0.00      0.00      0.00         1
         SC       0.75      0.43      0.55         7
         VB       0.86      0.67      0.75        18
          Z       1.00      1.00      1.00         7

avg / total       0.74      0.74      0.72        98



  'precision', 'predicted', average, warn_for)


## Input NNP ke Information Extraction

In [31]:
def words_list(text):
    return(nltk.word_tokenize(text))

def generate_post_tag(text):
    word_array = np.array(words_list(text))
    return tagger.tag(sent2features(word_array))
    #return ct.tag_sents(word_array)
    
generate_post_tag("lazada 11 11 2019 diskon terbesar 24 jam")

['NN', 'CD', 'CD', 'CD', 'NN', 'NN', 'CD', 'NN']

In [32]:
def get_nnp_index(text):
    index_array = []
    filter_text = filter_one_character(text)
    filter_text = filter_not_alphanumeric(filter_text)
    #filter_text = filter_number(filter_text)
    tag_result = generate_post_tag(filter_text)
    for i in range(len(tag_result)):
        if (tag_result[i] == 'NNP'):
            index_array.append(i)
    return index_array

def filter_one_character(text):
    return(re.sub(r"\b[%a-zA-Z0-9]\b", "", text))

def filter_not_alphanumeric(text):
    return(re.sub(r'[^\w]', ' ', text))

def filter_number(text):
    return(re.sub(r'[0-9]', '', text))

def get_nnp(text):
    nnp_index = get_nnp_index(text)
    word_array = words_list(text)
    result = []
    for i in range(len(nnp_index)):
        result.append([word_array[nnp_index[i]]])
    return result

def get_nnp_data(data):
    numpy_data = data.copy(deep=True)
    numpy_data = np.array(numpy_data)
    new_data = []
    for i in range(len(numpy_data)):
        result = get_nnp(numpy_data[i][0])
        new_data.append(result)
    new_data = np.array(new_data)
    if (new_data.size > 0):
        new_data = pd.DataFrame(new_data,columns=["nnp"])
    return(new_data)

In [33]:
nnp_data = get_nnp_data(raw_normalized_data)
if (nnp_data.size > 0):
    ie_data = merge_data(ie_data, nnp_data)


In [34]:
save_as_csv("info_extract_data/info_extract.csv",ie_data)