In [None]:
# Mount Google Drive at /content/drive; the dataset CSV read later lives
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install (or upgrade) language_tool_python, which this notebook imports and
# uses to run the grammar/spelling checks.
!pip install language_tool_python --upgrade



In [None]:
import language_tool_python
import pandas as pd
from io import StringIO

from tqdm import tqdm
# Register tqdm's progress_* methods on pandas objects; this notebook relies on
# Series.progress_map when running the grammar checker over each text column.
tqdm.pandas()

In [None]:
# NOTE(review): pandas has already been imported by an earlier cell, so an
# upgrade installed here presumably only takes effect after the runtime is
# restarted - confirm, and consider moving this above the imports.
!pip install --upgrade pandas




In [None]:
# Location of the case-study examples on the mounted Google Drive. The literal
# is split across lines for readability only; the resulting path is unchanged.
data_path = (
    '/content/drive/MyDrive/Project_AI/'
    'Replication-NLP-Adversarial-Examples-master/'
    'section_3_case_study/case_study_examples.csv'
)
# Load every example (original and perturbed text plus labels) into one frame.
df = pd.read_csv(filepath_or_buffer=data_path)


In [None]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,dataset,original_class,original_text,perturbed_class,perturbed_text,run_type
0,movie_review,1.0,[[lovingly]] photographed in the manner of a g...,0.0,[[clumsily]] photographed in the manner of a g...,jin
1,movie_review,1.0,[[consistently]] [[clever]] and [[suspenseful]] .,1.0,[[necessarily]] [[malin]] and [[suggestive]] .,jin
2,movie_review,1.0,"it's like a "" big chill "" reunion of the baade...",0.0,"it's like a "" big chill "" reunion of the baade...",jin
3,movie_review,1.0,the story gives ample opportunity for large-sc...,0.0,the story gives ample opportunity for large-sc...,jin
4,movie_review,1.0,"red dragon "" never cuts [[corners]] .",0.0,"red dragon "" never cuts [[angles]] .",jin


In [None]:
# Single LanguageTool checker for US English; get_errors() sends every text
# through this one shared instance.
lang_tool = language_tool_python.LanguageTool("en-US")

In [None]:
def get_errors(text):
    """Return the LanguageTool matches found in ``text``.

    Args:
        text: The text to check. Anything that is not a ``str`` (for example
            the float NaN that pandas uses for an empty CSV cell) is treated
            as "no text" instead of being forwarded to the checker.

    Returns:
        The result of ``lang_tool.check(text)`` for string input (callers in
        this notebook take its ``len`` and iterate over it); an empty list for
        non-string input.
    """
    # A missing value has no grammar to check; without this guard it would be
    # handed to LanguageTool as-is and could add spurious matches or fail.
    if not isinstance(text, str):
        return []
    return lang_tool.check(text)

def get_error_count(text):
    """Return how many issues ``get_errors`` reports for ``text``."""
    found = get_errors(text)
    return len(found)

In [None]:
# Run the grammar checker over both text columns (with a progress bar each),
# storing the per-row results in '<column>_errors'.
for text_col in ('original_text', 'perturbed_text'):
    df[text_col + '_errors'] = df[text_col].progress_map(get_errors)

100%|██████████| 3292/3292 [04:28<00:00, 12.26it/s]
100%|██████████| 3292/3292 [03:57<00:00, 13.88it/s]


In [None]:
# Per-row number of reported issues for the original and the perturbed text.
for side in ('original_text', 'perturbed_text'):
    df[side + '_error_count'] = df[side + '_errors'].map(len)

In [None]:
# Flatten the per-row error lists into one list per side, keeping row order.
original_text_errors = []
for row_errors in df['original_text_errors']:
    original_text_errors.extend(row_errors)

perturbed_text_errors = []
for row_errors in df['perturbed_text_errors']:
    perturbed_text_errors.extend(row_errors)

print('Original errors:', len(original_text_errors))
print('Perturbed errors:', len(perturbed_text_errors))

Original errors: 4494
Perturbed errors: 4570


In [None]:
# Count rows where the perturbation raised, kept, or lowered the error count.
orig_counts = df['original_text_error_count']
pert_counts = df['perturbed_text_error_count']

print('Added errors:', (pert_counts > orig_counts).sum())
print('Same # errors:', (pert_counts == orig_counts).sum())
print('Removed errors:', (pert_counts < orig_counts).sum())

Added errors: 66
Same # errors: 3226
Removed errors: 0


In [None]:
# Label each row as '<dataset>_<run_type>'. Rows missing either part end up
# with NaN here, because NaN propagates through the concatenation.
run_labels = df['dataset'] + '_' + df['run_type']
df['run'] = run_labels
print(run_labels.unique())

['movie_review_jin' nan]


In [None]:
# Rule id of every reported issue, one Series per side.
original_text_error_rule_ids = pd.Series([match.ruleId for match in original_text_errors])
perturbed_text_error_rule_ids = pd.Series([match.ruleId for match in perturbed_text_errors])

# Occurrences per rule id, most frequent first.
orig_rule_counts = original_text_error_rule_ids.value_counts()
pert_rule_counts = perturbed_text_error_rule_ids.value_counts()

# Plain {rule_id: count} dicts for lookups when comparing the two sides.
original_text_error_rule_freqs = orig_rule_counts.to_dict()
perturbed_text_error_rule_freqs = pert_rule_counts.to_dict()

print('Original errors:', orig_rule_counts)
print()
print('Perturbed errors:', pert_rule_counts)

Original errors: MORFOLOGIK_RULE_EN_US           3122
COMMA_PARENTHESIS_WHITESPACE     906
UPPERCASE_SENTENCE_START         408
EN_SPECIFIC_CASE                  18
I_LOWERCASE                       12
EN_COMPOUNDS                       4
EN_UNPAIRED_BRACKETS               4
VERB_APOSTROPHE_S                  2
EN_DIACRITICS_REPLACE              2
A_UNCOUNTABLE                      2
CONFUSION_OF_MARS_MARS             2
ADVERB_WORD_ORDER                  2
ARTICLE_ADJECTIVE_OF               2
MUCH_NEEDED_HYPHEN                 2
SOME_OF_THE                        2
WHETHER                            2
ENGLISH_WORD_REPEAT_RULE           2
dtype: int64

Perturbed errors: MORFOLOGIK_RULE_EN_US           3192
COMMA_PARENTHESIS_WHITESPACE     906
UPPERCASE_SENTENCE_START         408
EN_SPECIFIC_CASE                  18
I_LOWERCASE                       12
EN_COMPOUNDS                       4
EN_UNPAIRED_BRACKETS               4
ID_CASING                          2
WHETHER                   

In [None]:
# One record per rule id seen in the perturbed text, comparing its frequency
# against the original text. Rules that only occur in the original text are
# not visited by this loop.
err_rows = []
for rule_id, pert_freq in perturbed_text_error_rule_freqs.items():
    orig_freq = original_text_error_rule_freqs.get(rule_id, 0)
    err_rows.append(dict(
        freq_diff=pert_freq - orig_freq,
        orig_freq=orig_freq,
        pert_freq=pert_freq,
        # No ratio is defined for rules absent from the original text.
        freq_ratio=(pert_freq / orig_freq) if orig_freq else None,
        rule_id=rule_id,
    ))

err_data = pd.DataFrame(err_rows)
# Despite the name this is a fraction of len(df), not multiplied by 100.
err_data['freq_diff_perc'] = err_data['freq_diff'] / float(len(df))
err_data.head()

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
0,70,3122,3192,1.022422,MORFOLOGIK_RULE_EN_US,0.021264
1,0,906,906,1.0,COMMA_PARENTHESIS_WHITESPACE,0.0
2,0,408,408,1.0,UPPERCASE_SENTENCE_START,0.0
3,0,18,18,1.0,EN_SPECIFIC_CASE,0.0
4,0,12,12,1.0,I_LOWERCASE,0.0


In [None]:
# Number of rows in df; this is the denominator used for freq_diff_perc.
len(df)

3292

In [None]:
# Lift the row display limit so the full per-rule tables are shown.
pd.options.display.max_rows = None

# Rules ordered by how many more times they fire on the perturbed text.
err_data.sort_values('freq_diff', ascending=False)

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
0,70,3122,3192,1.022422,MORFOLOGIK_RULE_EN_US,0.021264
7,2,0,2,,ID_CASING,0.000608
18,2,0,2,,AI,0.000608
10,2,0,2,,BESTEST,0.000608
4,0,12,12,1.0,I_LOWERCASE,0.0
5,0,4,4,1.0,EN_COMPOUNDS,0.0
6,0,4,4,1.0,EN_UNPAIRED_BRACKETS,0.0
3,0,18,18,1.0,EN_SPECIFIC_CASE,0.0
8,0,2,2,1.0,WHETHER,0.0
9,0,2,2,1.0,SOME_OF_THE,0.0


In [None]:
# Rules with no defined ratio, i.e. never seen in the original text.
no_ratio = err_data['freq_ratio'].isna()
err_data[no_ratio].sort_values('freq_diff', ascending=False)

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
7,2,0,2,,ID_CASING,0.000608
10,2,0,2,,BESTEST,0.000608
18,2,0,2,,AI,0.000608


In [None]:
# Rules ordered by how often they fire on the perturbed text.
err_data.sort_values(by='pert_freq', ascending=False)

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
0,70,3122,3192,1.022422,MORFOLOGIK_RULE_EN_US,0.021264
1,0,906,906,1.0,COMMA_PARENTHESIS_WHITESPACE,0.0
2,0,408,408,1.0,UPPERCASE_SENTENCE_START,0.0
3,0,18,18,1.0,EN_SPECIFIC_CASE,0.0
4,0,12,12,1.0,I_LOWERCASE,0.0
5,0,4,4,1.0,EN_COMPOUNDS,0.0
6,0,4,4,1.0,EN_UNPAIRED_BRACKETS,0.0
13,0,2,2,1.0,ARTICLE_ADJECTIVE_OF,0.0
18,2,0,2,,AI,0.000608
17,0,2,2,1.0,VERB_APOSTROPHE_S,0.0


In [None]:
# Row count of df, shown again for reference next to the frequency tables.
len(df)

3292

In [None]:
# Rules ordered by perturbed/original ratio, ties broken by absolute increase
# (both descending).
err_data.sort_values(by=['freq_ratio', 'freq_diff'], ascending=[False, False])

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
0,70,3122,3192,1.022422,MORFOLOGIK_RULE_EN_US,0.021264
1,0,906,906,1.0,COMMA_PARENTHESIS_WHITESPACE,0.0
2,0,408,408,1.0,UPPERCASE_SENTENCE_START,0.0
3,0,18,18,1.0,EN_SPECIFIC_CASE,0.0
4,0,12,12,1.0,I_LOWERCASE,0.0
5,0,4,4,1.0,EN_COMPOUNDS,0.0
6,0,4,4,1.0,EN_UNPAIRED_BRACKETS,0.0
8,0,2,2,1.0,WHETHER,0.0
9,0,2,2,1.0,SOME_OF_THE,0.0
11,0,2,2,1.0,MUCH_NEEDED_HYPHEN,0.0


In [None]:
# Inspect one raw match object to see which attributes are available.
perturbed_text_errors[0]

Match({'ruleId': 'COMMA_PARENTHESIS_WHITESPACE', 'message': 'Put a space after the comma, but not before the comma.', 'replacements': [','], 'offsetInContext': 43, 'context': '...e manner of a golden book sprung to life , stuart little 2 manages sweetness large...', 'offset': 71, 'errorLength': 2, 'category': 'TYPOGRAPHY', 'ruleIssueType': 'whitespace', 'sentence': '[[clumsily]] photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .'})

In [None]:
# Print the first spelling error (MORFOLOGIK_RULE_EN_US) found in the
# perturbed text together with its suggested replacements, then stop.
for err in perturbed_text_errors:
    if err.ruleId == 'MORFOLOGIK_RULE_EN_US':
        # `context` is only a snippet around the error, so it must be indexed
        # with `offsetInContext`; `offset` is the position in the full text and
        # selects the wrong characters when applied to the snippet.
        start = err.offsetInContext
        badword = err.context[start:start + err.errorLength]
        goodwords = err.replacements
        print(err.message + ':', 'Replace',badword,'with one of',goodwords)
        print(err.category)
        print()
        break

Possible spelling mistake found.: Replace ss lar with one of ['Stuart', 'start', 'Stuarts', 'square', 'star', 'stars', 'starts', 'Stewart', 'smart', 'estuary', 'stunt', 'Stark', 'Starr', 'stark', 'strait', 'Surat', 'squat', 'stare', 'stat', 'spurt', 'tart', 'quart', 'squirt', 'stoat', 'Seurat', 'SART', 'SMART', 'STAR', 'STARS', 'STAAR']
TYPOS



In [None]:
def yield_errs(err_code, num_to_print=1):
    """Yield human-readable descriptions of perturbed-text errors for a rule.

    Walks the module-level ``perturbed_text_errors`` list in order and yields
    one formatted string for each match whose ``ruleId`` equals ``err_code``,
    stopping once ``num_to_print`` strings have been produced.

    Args:
        err_code: Rule id to look for.
        num_to_print: Maximum number of descriptions to yield.

    Yields:
        ``'<rule>: <message> || Replace <flagged text> with one of [<r1>,<r2>,...]'``
    """
    n = 0
    for err in perturbed_text_errors:
        if n >= num_to_print:
            break
        if err.ruleId != err_code:
            continue
        # `context` is a snippet around the error, so the flagged text starts
        # at `offsetInContext`; `offset` refers to the full text and would cut
        # the wrong characters out of the snippet.
        start = err.offsetInContext
        badword = err.context[start:start + err.errorLength]
        goodwords = err.replacements
        err_str = err_code + ': ' + err.message + ' || Replace '+badword+' with one of ' + '[' +','.join(goodwords) + ']'
        yield err_str
        n += 1

def get_err(err_code):
    """Return the first description for ``err_code``, or None if there is none."""
    first_only = list(yield_errs(err_code, num_to_print=1))
    if not first_only:
        return None
    return first_only[0]

def get_errs(*args):
    """Collect everything ``yield_errs(*args)`` produces into a list."""
    return [description for description in yield_errs(*args)]

def print_errs(*args):
    """Print the list returned by ``get_errs(*args)``."""
    found = get_errs(*args)
    print(found)

# Quick check of the helpers; prints an empty list when the rule never fired
# on the perturbed text.
print_errs('I_AM')

[]


In [None]:
def yield_errs_context(err_code, num_to_print=1):
    """Yield the context snippets of perturbed-text errors for a rule.

    Walks the module-level ``perturbed_text_errors`` list in order and yields
    ``err.context`` for each match whose ``ruleId`` equals ``err_code``,
    stopping once ``num_to_print`` snippets have been produced.

    Args:
        err_code: Rule id to look for.
        num_to_print: Maximum number of snippets to yield.

    Yields:
        The ``context`` attribute of each matching error.
    """
    n = 0
    for err in perturbed_text_errors:
        if n >= num_to_print:
            break
        if err.ruleId == err_code:
            # Only the snippet is needed here; the description string that the
            # earlier version also assembled was never used.
            yield err.context
            n += 1

def get_err_context(err_code):
    """Return the first context snippet for ``err_code``, or None if there is none."""
    first_only = list(yield_errs_context(err_code, num_to_print=1))
    if not first_only:
        return None
    return first_only[0]

def get_errs_context(*args):
    """Collect everything ``yield_errs_context(*args)`` produces into a list."""
    return [snippet for snippet in yield_errs_context(*args)]

def print_errs_context(*args):
    """Print the list returned by ``get_errs_context(*args)``."""
    found = get_errs_context(*args)
    print(found)

# Quick check of the context helpers defined in this cell (this previously
# called print_errs, which exercises the description helpers instead).
print_errs_context('I_AM')

[]


In [None]:
# Look up one example for this rule; get_err returns None (so nothing is
# displayed) when the rule never fired on the perturbed text.
get_err('DID_BASEFORM')

In [None]:
# Same rule, asking for up to 20 examples; an empty list means no matches.
get_errs('DID_BASEFORM', 20)

[]

In [None]:
# Widen columns so the long explanation strings are not truncated on display.
pd.set_option('display.max_colwidth', 500)

# Attach one example description and one context snippet to every rule.
rule_ids = err_data['rule_id']
err_data['Explanation'] = rule_ids.map(get_err)
err_data['Context'] = rule_ids.map(get_err_context)
err_data.head()

err: Offset 74, length 6, Rule ID: MORFOLOGIK_RULE_EN_US
Message: Possible spelling mistake found.
Suggestion: Stuart; start; Stuarts; square; star; stars; starts; Stewart; smart; estuary; stunt; Stark; Starr; stark; strait; Surat; squat; stare; stat; spurt; tart; quart; squirt; stoat; Seurat; SART; SMART; STAR; STARS; STAAR
...anner of a golden book sprung to life , stuart little 2 manages sweetness largely with...
                                           ^^^^^^
err: Offset 71, length 2, Rule ID: COMMA_PARENTHESIS_WHITESPACE
Message: Put a space after the comma, but not before the comma.
Suggestion: ,
...e manner of a golden book sprung to life , stuart little 2 manages sweetness large...
                                           ^^
err: Offset 0, length 2, Rule ID: UPPERCASE_SENTENCE_START
Message: This sentence does not start with an uppercase letter.
Suggestion: It
it's like a " big chill " reunion of the b...
^^
err: Offset 12, length 9, Rule ID: EN_SPECIFIC_CASE
Message: If th

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc,Explanation,Context
0,70,3122,3192,1.022422,MORFOLOGIK_RULE_EN_US,0.021264,"MORFOLOGIK_RULE_EN_US: Possible spelling mistake found. || Replace ss lar with one of [Stuart,start,Stuarts,square,star,stars,starts,Stewart,smart,estuary,stunt,Stark,Starr,stark,strait,Surat,squat,stare,stat,spurt,tart,quart,squirt,stoat,Seurat,SART,SMART,STAR,STARS,STAAR]","...anner of a golden book sprung to life , stuart little 2 manages sweetness largely with..."
1,0,906,906,1.0,COMMA_PARENTHESIS_WHITESPACE,0.0,"COMMA_PARENTHESIS_WHITESPACE: Put a space after the comma, but not before the comma. || Replace we with one of [,]","...e manner of a golden book sprung to life , stuart little 2 manages sweetness large..."
2,0,408,408,1.0,UPPERCASE_SENTENCE_START,0.0,UPPERCASE_SENTENCE_START: This sentence does not start with an uppercase letter. || Replace it with one of [It],"it's like a "" big chill "" reunion of the b..."
3,0,18,18,1.0,EN_SPECIFIC_CASE,0.0,"EN_SPECIFIC_CASE: If the term is a proper noun, use initial capitals. || Replace star trek with one of [Star Trek]",devotees of star trek ii : the wrath of khan will feel a nagg...
4,0,12,12,1.0,I_LOWERCASE,0.0,I_LOWERCASE: The personal pronoun “I” should be uppercase. || Replace i with one of [I],can i admit xxx is as deep as a petri dish an...


In [None]:
# Full table, now with explanations, ordered by ratio and then by absolute
# increase (both descending).
err_data.sort_values(by=['freq_ratio', 'freq_diff'], ascending=[False, False])

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc,Explanation,Context
0,70,3122,3192,1.022422,MORFOLOGIK_RULE_EN_US,0.021264,"MORFOLOGIK_RULE_EN_US: Possible spelling mistake found. || Replace ss lar with one of [Stuart,start,Stuarts,square,star,stars,starts,Stewart,smart,estuary,stunt,Stark,Starr,stark,strait,Surat,squat,stare,stat,spurt,tart,quart,squirt,stoat,Seurat,SART,SMART,STAR,STARS,STAAR]","...anner of a golden book sprung to life , stuart little 2 manages sweetness largely with..."
1,0,906,906,1.0,COMMA_PARENTHESIS_WHITESPACE,0.0,"COMMA_PARENTHESIS_WHITESPACE: Put a space after the comma, but not before the comma. || Replace we with one of [,]","...e manner of a golden book sprung to life , stuart little 2 manages sweetness large..."
2,0,408,408,1.0,UPPERCASE_SENTENCE_START,0.0,UPPERCASE_SENTENCE_START: This sentence does not start with an uppercase letter. || Replace it with one of [It],"it's like a "" big chill "" reunion of the b..."
3,0,18,18,1.0,EN_SPECIFIC_CASE,0.0,"EN_SPECIFIC_CASE: If the term is a proper noun, use initial capitals. || Replace star trek with one of [Star Trek]",devotees of star trek ii : the wrath of khan will feel a nagg...
4,0,12,12,1.0,I_LOWERCASE,0.0,I_LOWERCASE: The personal pronoun “I” should be uppercase. || Replace i with one of [I],can i admit xxx is as deep as a petri dish an...
5,0,4,4,1.0,EN_COMPOUNDS,0.0,EN_COMPOUNDS: This word is normally spelled with a hyphen. || Replace old fashioned with one of [old-fashioned],... . quite good at [[provided]] some good old fashioned spooks .
6,0,4,4,1.0,EN_UNPAIRED_BRACKETS,0.0,"EN_UNPAIRED_BRACKETS: Unpaired symbol: ‘""’ seems to be missing || Replace with one of []","...d dark , funny [[travesty]] [[offer]] "" "" the [[bestest]] [[nana]] "" a film [[cha..."
8,0,2,2,1.0,WHETHER,0.0,WHETHER: Consider shortening this phrase to just “whether”. It is correct though if you mean ‘regardless of whether’. || Replace whether or not with one of [whether],whether or not ram [[zum]] proves as clear and reliabl...
9,0,2,2,1.0,SOME_OF_THE,0.0,"SOME_OF_THE: If the text is a generality, ‘of the’ is not necessary. || Replace some of the with one of [some]",it uses some of the figures from the real-life story to por...
11,0,2,2,1.0,MUCH_NEEDED_HYPHEN,0.0,"MUCH_NEEDED_HYPHEN: When “much-needed” is used as a modifier, it is usually spelled with a hyphen. || Replace with one of [much-needed]",...nt past and south korea's future adds a much needed moral weight .


In [None]:
# Thresholds a rule must exceed to be listed in the explanatory table.
min_pert_freq = 25
min_pert_ratio = 5.0

is_frequent = err_data['pert_freq'] > min_pert_freq
# NOTE(review): rules that never occur in the original text have a NaN
# freq_ratio, so this comparison is False for them and they are always
# excluded - confirm that is intended.
is_amplified = err_data['freq_ratio'] > min_pert_ratio

explanatory_df = (
    err_data[is_frequent & is_amplified]
    .sort_values(by=['freq_ratio', 'freq_diff'], ascending=False)
    .reset_index(drop=True)
    .drop(columns=['freq_ratio', 'freq_diff', 'freq_diff_perc'])
)

# Fix the column order used for presentation.
presentation_cols = ['rule_id', 'orig_freq', 'pert_freq', 'Explanation', 'Context']
explanatory_df = explanatory_df[presentation_cols]
explanatory_df.head()

Unnamed: 0,rule_id,orig_freq,pert_freq,Explanation,Context


In [None]:
# Render the explanatory table as LaTeX source and print it.
# NOTE(review): the recorded run of this cell ended in ModuleNotFoundError;
# presumably to_latex needs an optional package that was not installed -
# confirm which one and install it next to the other dependencies.
latex = explanatory_df.to_latex()
# Printing the whole string emits the same text as printing it line by line.
print(latex)

  latex = explanatory_df.to_latex()


ModuleNotFoundError: ignored

In [None]:
# Distinct run labels; NaN marks rows where the label could not be built.
df['run'].unique()

array(['movie_review_jin', nan], dtype=object)

In [None]:
# Total number of rows in df, including any rows whose 'run' is NaN.
len(df)

3292

In [None]:
# Flag rows where the perturbed text has more reported issues than the original.
introduced = df['perturbed_text_error_count'] > df['original_text_error_count']
df['introduced_error'] = introduced

# Share of flagged rows per run. groupby leaves out rows whose 'run' is NaN by
# default, so this can differ from the mean over the whole frame.
df.groupby('run')['introduced_error'].mean()

run
movie_review_jin    0.165
Name: introduced_error, dtype: float64

In [None]:
# Share of flagged rows over every row of df, including rows whose 'run' is NaN.
df['introduced_error'].mean()

0.020048602673147023