In [12]:
import pandas as pd
from corruptions import TextAttackCorruption

from textattack.transformations import WordSwapRandomCharacterInsertion
from textattack.transformations import WordSwapQWERTY
from textattack.transformations import WordSwapRandomCharacterSubstitution
from textattack.transformations import WordSwapRandomCharacterDeletion

from textattack.constraints.pre_transformation import StopwordModification
from textattack.constraints.pre_transformation import RepeatModification

In [3]:
# Load the test split from a JSON Lines file (one record per line).
df = pd.read_json("data/test.json", lines=True)

# Word-swap transformations handed to the corruption, in the order listed.
swap_transformations = [
    WordSwapRandomCharacterDeletion(),
    WordSwapRandomCharacterInsertion(),
    WordSwapRandomCharacterSubstitution(),
    WordSwapQWERTY(),
]

# Pre-transformation constraints from textattack.constraints.pre_transformation.
swap_constraints = [RepeatModification(), StopwordModification()]

# NOTE(review): fraction=0.2 is presumably the share of rows to corrupt --
# confirm against TextAttackCorruption in the local corruptions module.
text_corruption = TextAttackCorruption(
    column="reviewText",
    fraction=0.2,
    transformations=swap_transformations,
    constraints=swap_constraints,
)

# Apply the configured corruption to the loaded frame.
corrupted_df = text_corruption.transform(df)

In [6]:
# Persist the corrupted frame as JSON Lines (one record per line) under the
# relative path data/corrupted_test.json.
corrupted_df.to_json(
    path_or_buf="data/corrupted_test.json",
    orient="records",
    lines=True,
)


In [29]:
# Reload the corrupted records from the JSON Lines file written earlier and
# display the resulting frame as the cell output.
corrupted_df = pd.read_json(
    path_or_buf="data/corrupted_test.json",
    lines=True,
)
corrupted_df

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1HK2FQW6KXQB2,097293751X,"Amanda Johnsen ""Amanda E. Johnsen""","[0, 0]",Perfect for new parents. We were able to keep ...,5,Awesine,1373932800,"07 16, 2013"
1,A19K65VY14D13R,097293751X,angela,"[0, 0]",This book is such a life saver. It has been s...,5,Should be required for all new parents!,1372464000,"06 29, 2013"
2,A2LL1TGG90977E,097293751X,Carter,"[0, 0]",Helps me know exactly how my babies day has go...,5,Grandmother watching baby,1395187200,"03 19, 2014"
3,A5G19RYX8599E,097293751X,cfpurplerose,"[0, 0]",I bought this a few times for my older son and...,5,repeat buyer,1376697600,"08 17, 2013"
4,A2496A4EWMLQ7,097293751X,C. Jeter,"[0, 0]",I wanted an alternative to printing out daily ...,4,Great,1396310400,"04 1, 2014"
...,...,...,...,...,...,...,...,...,...
995,AO5IS72AH2CEZ,B000056HM5,K. Jordan,"[0, 0]","When I bought this system, I was concerned if ...",4,Works with pump - sort of,1322697600,"12 1, 2011"
996,A2ROS5VVU6LIVY,B000056HM5,Lauren,"[1, 1]","I got these as a gift at my baby shower, so I ...",2,Doesn't seal very well...,1323561600,"12 11, 2011"
997,A3QO04PYHINBRZ,B000056HM5,L. Sito,"[0, 0]",My daughter wasn't drinking out of the bottle....,5,Only one that worked,1256601600,"10 27, 2009"
998,A3R7Q2RWQ8K2S7,B000056HM5,MamaCito,"[0, 0]",I knew from the beginning that I would be usin...,4,Love the convenience,1300147200,"03 15, 2011"


In [31]:
from cleaning import tb_correct_spelling


def _correct_text(value):
    """Return ``value`` as a single spell-corrected string.

    ``value`` is coerced with ``str`` before being passed to
    ``tb_correct_spelling``. The helper's return type is not visible from
    this notebook, so its result is joined with ``"".join`` exactly as the
    original cell did.
    """
    return "".join(tb_correct_spelling(str(value)))


# Spell-correct the free-text columns into a copy so that corrupted_df stays
# available, unchanged, for later comparison.
clean_df = corrupted_df.copy()

for text_column in ("reviewText", "summary"):
    # na_action="ignore" leaves missing values missing. The earlier per-row
    # loop ran str() on them first, which turned NaN/None into the literal
    # text "nan"/"None" and then spell-corrected that text as if it were a
    # real review.
    clean_df[text_column] = corrupted_df[text_column].map(
        _correct_text, na_action="ignore"
    )

In [32]:
# Write the spell-corrected frame as JSON Lines (one record per line).
clean_df.to_json(
    path_or_buf="data/clean_tb_test.json",
    orient="records",
    lines=True,
)