In [1]:
import pandas as pd
from src.utils import extract_quintuplet
from src.evaluator import Evaluator
from src.postprocessor import PostProcessor
from Levenshtein import distance

# Using the postprocessor

In [22]:
# Read the test data CSV into `df` and print its schema summary
# (column names, non-null counts and dtypes).
df = pd.read_csv(filepath_or_buffer='../Data/test_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   aoriginal_id                          200 non-null    int64 
 1   content                               200 non-null    object
 2   final_sentiment                       200 non-null    object
 3   baseline_aspect_category              195 non-null    object
 4   tweet_type                            200 non-null    object
 5   clean_tweet                           200 non-null    object
 6   label                                 200 non-null    object
 7   corrected_label                       25 non-null     object
 8   keterangan                            8 non-null      object
 9   quintuplet_label                      200 non-null    object
 10  postprocess_quintuplet                200 non-null    object
 11  p21_model_prediction            

In [9]:
# Lower-case the raw text with the vectorised string accessor. Unlike the
# per-row lambda it replaces, this propagates missing values as NaN instead of
# raising AttributeError on a non-string cell.
df['content'] = df['content'].str.lower()

In [10]:
# Post-processor used on every extracted quintuplet in the cells below;
# constructed with use_postprocess=True.
postprocessor = PostProcessor(use_postprocess=True)
# Evaluator configured for the 'quintuplet' task type; its evaluate() call
# produces the precision/recall/F1 scores further down.
evaluator = Evaluator(task_type='quintuplet')

In [11]:
exp_type = 'p21'  # experiment tag; selects the `<exp_type>_model_prediction` column

# Post-process every quintuplet extracted from the model prediction against the
# sentence in `content`, and store the result per row as a ';'-joined,
# lower-cased string in `postprocessed_<exp_type>_prediction`.
#
# Results are collected in a list and assigned in a single step. The previous
# version read rows by position (`iloc[i]`) but wrote them by label (`at[i]`),
# which only lines up while `df` carries a default RangeIndex; list assignment
# is positional and therefore safe for any index.
postprocessed_predictions = []
for orig_sent, raw_prediction in zip(df['content'], df[f'{exp_type}_model_prediction']):
    clean_quintuplet = []
    for quintuplet in extract_quintuplet(raw_prediction):
        entity, aspect_term, opinion_term, sentiment, aspect_category = quintuplet
        entity, aspect_term, opinion_term, sentiment, aspect_category = postprocessor.post_process(
            entity, aspect_term, opinion_term, sentiment, aspect_category, orig_sent
        )
        clean_quintuplet.append(
            f'({entity},{aspect_term},{opinion_term},{sentiment},{aspect_category})'
        )
    # Lower-casing happens here rather than in a second pass over the column.
    postprocessed_predictions.append(';'.join(clean_quintuplet).lower())
df[f'postprocessed_{exp_type}_prediction'] = postprocessed_predictions

In [14]:
# Run the same post-processing over the `postprocess_quintuplet` column and
# store the result per row as a ';'-joined, lower-cased string in
# `postprocessed_postprocess_quintuplet`.
#
# Results are collected in a list and assigned in a single step. The previous
# version read rows by position (`iloc[i]`) but wrote them by label (`at[i]`),
# which only lines up while `df` carries a default RangeIndex; list assignment
# is positional and therefore safe for any index.
# NOTE(review): the evaluation cell scores against `postprocess_quintuplet`,
# not this new column — confirm which one is meant to be the reference.
postprocessed_labels = []
for orig_sent, raw_label in zip(df['content'], df['postprocess_quintuplet']):
    clean_quintuplet = []
    for quintuplet in extract_quintuplet(raw_label):
        entity, aspect_term, opinion_term, sentiment, aspect_category = quintuplet
        entity, aspect_term, opinion_term, sentiment, aspect_category = postprocessor.post_process(
            entity, aspect_term, opinion_term, sentiment, aspect_category, orig_sent
        )
        clean_quintuplet.append(
            f'({entity},{aspect_term},{opinion_term},{sentiment},{aspect_category})'
        )
    # Lower-casing happens here rather than in a second pass over the column.
    postprocessed_labels.append(';'.join(clean_quintuplet).lower())
df['postprocessed_postprocess_quintuplet'] = postprocessed_labels

In [12]:
# Score the post-processed predictions against the `postprocess_quintuplet`
# column. The prediction column is looked up through `exp_type`, matching the
# cell that creates it, instead of an f-string with no placeholders that
# hard-coded the experiment tag.
# NOTE(review): the predictions were lower-cased but the gold column is passed
# as loaded — confirm that Evaluator normalises case before comparing.
raw_scores, all_labels, all_preds = evaluator.evaluate(
    pred_seqs=df[f'postprocessed_{exp_type}_prediction'],
    gold_seqs=df['postprocess_quintuplet']
)
raw_scores

  0%|          | 0/200 [00:00<?, ?it/s]

{'precision': 0.775, 'recall': 0.768, 'f1': 0.771}

In [14]:
# Persist the frame including the columns added above. index=False keeps the
# row index out of the file (it would otherwise come back as an extra unnamed
# column on the next read_csv) and matches the other export of `df` in this
# notebook.
# NOTE(review): `df` was loaded from test_data.csv but is written to
# val_data.csv — confirm the target file name is intended.
df.to_csv('../Data/val_data.csv', index=False)

In [29]:
# Select the rows whose raw model prediction is not string-identical to the
# `postprocess_quintuplet` column and write them (index included) to disk.
wrong_df = df.loc[df['p21_model_prediction'].ne(df['postprocess_quintuplet'])]
wrong_df.to_csv(path_or_buf='../Data/wrong_test.csv')

In [15]:
# Index labels of the rows where the post-processed (and lower-cased)
# prediction differs from the raw model prediction, printed one per line.
idx = df.index[df['postprocessed_p21_prediction'].ne(df['p21_model_prediction'])]
for row_label in idx:
    print(row_label)

3
17
20
31
36
38
42
43
52
58
59
63
82
84
91
100
104
106
107
111
120
121
124
130
136
138
142
147
160
164
165
176
182
184
192
194
198


In [17]:
# Write the frame, without its row index, to the post-processed CSV.
df.to_csv(
    path_or_buf='../Data/val_data_postprocessed.csv',
    index=False,
)

In [48]:
# Ad-hoc check of the Levenshtein distance between a truncated term and its
# full hyphenated form; the recorded output for this pair is 6.
distance('gonta', 'gonta-ganti')

6

# Analisis kesalahan prediksi (prediction error analysis)

In [6]:
from src.preprocessor import Preprocessor
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import pandas as pd

In [2]:
# Build the project Preprocessor with the 'p02' preprocessing type, reading
# text from `content` and labels from `postprocess_quintuplet`.
# NOTE(review): tokenizer='' and tokenizer_max_length=0 look like placeholders —
# only `clean_text` is called on this object in the cells below; confirm that
# no tokenisation is needed here.
preprocessor = Preprocessor(
    preprocess_type='p02',
    tokenizer='',
    tokenizer_max_length=0,
    text_col='content',
    label_col='postprocess_quintuplet'
)

In [7]:
# Reload the test data CSV from disk. This rebinds `df`, so any columns added
# to the previous frame in memory are not carried over.
df = pd.read_csv(filepath_or_buffer='../Data/test_data.csv')

In [8]:
# Create a Sastrawi stemmer through its factory; `stemmer.stem` is applied to
# the cleaned text in the next cell.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [9]:
# Rebuild `clean_tweet` from `content`: first the preprocessor's text cleaning,
# then stemming. Any `clean_tweet` values loaded from the CSV are overwritten.
df['clean_tweet'] = (
    df['content']
    .apply(preprocessor.clean_text)
    .apply(stemmer.stem)
)

In [10]:
# Write every row whose `postprocess_quintuplet` is not string-identical to the
# raw model prediction to wrong_test.csv (index included), now carrying the
# rebuilt `clean_tweet` column.
df.loc[df['postprocess_quintuplet'].ne(df['p21_model_prediction'])].to_csv(
    path_or_buf='../Data/wrong_test.csv'
)

In [13]:
# Probe how the stemmer treats a slash between words: the recorded output
# shows the '/' replaced by a space ('nasi/mie' -> 'nasi mie').
stemmer.stem('saya suka nasi/mie')

'saya suka nasi mie'

In [12]:
# Same probe through the preprocessor's cleaning step: the recorded output
# shows the '/' left in place, so the slash is removed by the stemmer and not
# by clean_text.
preprocessor.clean_text('saya suka nasi/mie')

'saya suka nasi/mie'