# Ekstrak & Pre-proc Dataset 

In [255]:

import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import re
from tqdm.auto import tqdm
tqdm.pandas()
     

In [256]:
df = pd.read_csv(r"D:\TEL-U\BANGKIT!\NARASPEAK\DATASET\C4_200M.tsv-00000-of-00010", delimiter='\t', nrows=100000)
df.columns = ["Incorrect", "Correct"]

In [257]:
df

Unnamed: 0,Incorrect,Correct
0,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ..."
1,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...
2,Much many brands and sellers still in the market.,Many brands and sellers still in the market.
3,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...
4,"Fairy Or Not, I'm the Godmother: no just look,...","Fairy Or Not, I'm the Godmother: Not just a lo..."
...,...,...
99995,The Toy’s slump seems to coincide with the dis...,The Toy’s slump seems to coincide with the dis...
99996,It took me 4 hours to find this stupid failure.,It took me 4 hours to find this stupid problem.
99997,"Made of high tech thermoplastic rubber, TPR (D...","Made of high-tech thermoplastic rubber, TPR (D..."
99998,According to a study written by the Annie E. C...,"According to a study by the Annie E. Casey, Fo..."


In [258]:
# Adding length features


df['correct_char_count'] = df['Correct'].astype('str').apply(lambda x:len(x))
df['incorrect_char_count'] = df['Incorrect'].astype('str').apply(lambda x:len(x))

df['correct_word_count'] = df['Correct'].astype('str').apply(lambda x:len(x.split()))
df['incorrect_word_count'] = df['Incorrect'].astype('str').apply(lambda x:len(x.split()))

In [259]:
df.head(5)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
0,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ...",92,87,16,15
1,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...,355,334,63,59
2,Much many brands and sellers still in the market.,Many brands and sellers still in the market.,44,49,8,9
3,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...,55,54,10,10
4,"Fairy Or Not, I'm the Godmother: no just look,...","Fairy Or Not, I'm the Godmother: Not just a lo...",100,94,20,18


# Preprocessing
Removing Missing/NA

In [260]:

pd.DataFrame(df.isna().sum(),columns=['missing_count'])

Unnamed: 0,missing_count
Incorrect,0
Correct,0
correct_char_count,0
incorrect_char_count,0
correct_word_count,0
incorrect_word_count,0


In [261]:

df[df.isna().any(axis=1)]

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count


In [262]:

df = df.dropna().reset_index(drop=True)

In [263]:

df.shape

(100000, 6)

Keep unique sentence pairs

In [264]:
print(f"total number of duplicate pairs: {len(df[df['Correct']==df['Incorrect']])}")

total number of duplicate pairs: 590


In [265]:

df[df['Correct']==df['Incorrect']].sample(5)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
16491,Let’s play “two truths and a lie” about brewin...,Let’s play “two truths and a lie” about brewin...,72,72,13,13
70215,are regularly evaluated and reviewed.,are regularly evaluated and reviewed.,37,37,5,5
57101,The Wise Men came from afar to seek Jesus. Are...,The Wise Men came from afar to seek Jesus. Are...,69,69,14,14
5154,Sketch effortlessly in Adobe Illustrator using...,Sketch effortlessly in Adobe Illustrator using...,64,64,8,8
55655,Ratio of instructor to student 1:1.,Ratio of instructor to student 1:1.,35,35,6,6


In [266]:
df = df[df['Correct']!=df['Incorrect']]

In [267]:
df.shape

(99410, 6)

In [268]:
df.sample(5)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
89088,Double Dragon (SEGA GENESIS) Is tested!,Double Dragon (SEGA GENESIS) Tested!,36,39,5,6
49346,2 game units can be to skills head-to-head com...,2 game units can be linked for head-to-head co...,57,56,9,9
11648,"No CPU cooled, as it would have raised the pri...","No CPU cooling, as it would have raised the pr...",121,118,25,24
44485,Have a question about our Recreation ministry. {,Have questions about our recreation ministry?,45,48,6,8
30496,The day in which I've eyewitness the anti-Trum...,The day I witnessed the anti-Trump protest was...,322,340,58,62


Remove Duplicates

In [269]:

print(f'total number of duplicates: {df.duplicated().sum()}')

total number of duplicates: 0


In [270]:

df[df.duplicated(keep=False)].sort_values('Correct')

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count


In [271]:

df = df.drop_duplicates().reset_index(drop=True)

In [272]:
df.shape

(99410, 6)

In [273]:
df.sample(5)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
11961,Logan: Excellent and we’ll of course have link...,Logan: Excellent and we’ll of course have link...,269,269,53,53
23664,"Naruto Shippuden. 2 eps back-to-back, Mon-Fri,...","Naruto Shippuden. 2 eps back-to-back, Mon-Fri,...",146,157,20,23
39344,To approve the responsibilities of kg on stude...,Approves the provisions on student self-govern...,58,72,7,10
85334,"However, the study does not explain this corre...","However, the study does not explain this corre...",212,218,37,38
34104,The company can assign trained combatant to pr...,The company can assign a certified trainer to ...,201,196,33,32


Remove Small sentences

In [274]:
df[df['incorrect_char_count']<2].shape

(0, 6)

In [275]:

df = df[df['incorrect_char_count']>2].reset_index(drop=True)

In [276]:
df.shape

(99409, 6)

In [277]:
df[df['correct_char_count']<2].shape

(0, 6)

In [278]:
df[df['correct_char_count']<2].sample

<bound method NDFrame.sample of Empty DataFrame
Columns: [Incorrect, Correct, correct_char_count, incorrect_char_count, correct_word_count, incorrect_word_count]
Index: []>

In [279]:
df = df[df['correct_char_count']>2].reset_index(drop=True)

In [280]:
df.shape

(99409, 6)

Cleaning Text

In [281]:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have","n\'t":" not","\'re":" are","\'s": " is","\'d":" would",
                     "\'ll": " will","\'t":" not","\'ve": " have","\'m":" am"}


# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

In [282]:
def clean(text):
    text = re.sub('\s*\<.*?\>\s', '', text)
    text = re.sub('\s*.*?\s', '', text)
    text = re.sub('\s*.*?\s', '', text)
    text = re.sub('\s*\{.*?\}\s', '', text)
    text = re.sub("[-+@#^/|*(){}$~<>=_%:;]","",text)
    text = text.replace("\\","")
    text = re.sub("\[","",text)
    text = re.sub("\]","",text)
    text = re.sub("\<","",text)
    text = re.sub("\>","",text)
    text = re.sub("\(","",text)
    text = re.sub("\)","",text)
    text = re.sub("[0-9]","",text)
    text = ' '.join(text.split())
    return text

In [283]:

# df['Correct'] = df['Correct'].progress_apply(clean)
df['Correct'] = df['Correct'].progress_apply(expand_contractions)

  0%|          | 0/99409 [00:00<?, ?it/s]

100%|██████████| 99409/99409 [00:04<00:00, 24134.48it/s]


In [284]:

# df['Incorrect'] = df['Incorrect'].progress_apply(clean)
df['Incorrect'] = df['Incorrect'].progress_apply(expand_contractions)

  0%|          | 0/99409 [00:00<?, ?it/s]

100%|██████████| 99409/99409 [00:03<00:00, 26653.22it/s]


In [285]:
df.sample(10)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
23843,The revocation of such agreement on the use of...,The revocation of such agreement on the use of...,159,154,28,28
3143,BORRENPOHL-Kelli and Steve Borrenpohl of Vened...,BORRENPOHL — Kelli and Steve Borrenpohl of Ven...,67,64,12,8
38668,Due to popular demand we will only be abccepti...,"Due to popular demand, we will only be accepti...",78,79,16,16
74215,Most of the stuff should be available with any...,Most of the stuff should be available at any l...,79,89,15,17
30082,If a future in sustainability farming is to be...,If a future in sustainable farming is to be ac...,228,234,38,42
64739,"Ogle, Andy. ""A treasure of plant life; Museum ...","Ogle, Andy. ""A treasure of plant life; Museum ...",140,129,20,20
47314,"In any event, there is no possibility l ate of...","In any event, there is no possibility of the o...",265,267,46,48
70787,03 Apr 2017 - Michael Sheppard is 8pm.,03 Apr 2017 - 6:14 PM Michael Sheppard Updated...,52,37,10,7
80186,Virus wont go away and the lad still feeling a...,The virus will not go away and the lad is stil...,65,57,14,12
8325,"I love the look of the vintage bath hardware, ...","I love the look of the vintage bath hardware, ...",88,91,14,16


In [286]:
def remove_spaces(text) : 
    text = re.sub(r" '(\w)",r"'\1",text)
    text = re.sub(r" \,",",",text)
    text = re.sub(r" \.+",".",text)
    text = re.sub(r" \!+","!",text)
    text = re.sub(r" \?+","?",text)
    text = re.sub(" n't","n't",text)
    text = re.sub("[\(\)\;\_\^\`\/]","",text)
    
    return text


def decontract(text) : 
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    
    return text



In [287]:
def WrongSentence_preprocessing(col) : 
    df[col] = df[col].astype(str) 
    df[col] = df[col].apply(lambda x:x.lower()) 
    df[col] = df[col].apply(lambda x : re.sub("\n","",x)) 
    df[col] = df[col].apply(lambda x : remove_spaces(x)) 
    df[col] = df[col].apply(lambda x : re.sub(r"\.+",".",x)) 
    df[col] = df[col].apply(lambda x : re.sub(r"\!+","!",x)) 
    df[col] = df[col].apply(lambda x : decontract(x)) 
    df[col] = df[col].apply(lambda x: re.sub("[^A-Za-z\s]","",x)) 
    df[col] = df[col].apply(lambda x: x.replace("\s+"," "))
    df[col] = df[col].apply(lambda x: " ".join([word for word in x.split()]))
    return df

def TrueSentence_preprocessing(col) : 
    df[col] = df[col].astype(str) 
    df[col] = df[col].apply(lambda x:x.lower()) 
    df[col] = df[col].apply(lambda x : re.sub("\n","",x)) 
    df[col] = df[col].apply(lambda x : remove_spaces(x)) 
    df[col] = df[col].apply(lambda x : re.sub(r"\.+",".",x)) 
    df[col] = df[col].apply(lambda x : re.sub(r"\!+","!",x)) 
    df[col] = df[col].apply(lambda x : decontract(x)) 
    df[col] = df[col].apply(lambda x: re.sub("[^A-Za-z\s]","",x)) 
    df[col] = df[col].apply(lambda x: re.sub(r'\d','',x))
    df[col] = df[col].apply(lambda x: re.sub(r'\s+',' ',x))
    df[col] = df[col].apply(lambda x: re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,।]", "", x))
    df[col] = df[col].apply(lambda x: x.strip())
    # df[col] = "<sos> " + df[col] + " <eos>"
    return df

In [288]:
df = WrongSentence_preprocessing('Incorrect') 
df = TrueSentence_preprocessing('Correct')

In [291]:
df.sample(10)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
84363,of hs codes that are eligible to us just aroun...,of hs codes that are eligible to the us only a...,93,89,19,18
89774,electric grill barbecue indoor camping portable,electric grill indoor barbecue camping portable,48,48,6,6
37324,she said most of the people who lived along th...,she said most of the people who live along the...,175,177,32,32
95360,the male pcw for brought dad out with to join ...,the male pcw then brought dad out to join us i...,58,59,13,13
9733,uziing the city anticipated so many site visit...,using the city anticipating so many site visit...,117,116,20,20
42391,if you can feel problems making the above mixc...,if you have problems using the above mixcloud ...,82,87,14,15
93807,summary this informational bulletin provides a...,summary this informational bulletin provides a...,214,212,26,26
63422,here is an example that i will be using throug...,here is an example that i will be using throug...,62,67,12,13
98430,what other issues can stand in your way,what other issues might stand in your way,42,40,8,8
47979,harry has avoided paying cgt at the rate of pe...,harry has avoided paying cgt at the rate of pe...,56,56,12,12


In [292]:
df.to_csv('cleaned_data.csv', index=False)