# Ekstrak & Pre-proc Dataset 

In [1]:

import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import re
from tqdm.auto import tqdm
tqdm.pandas()
     

In [2]:
df = pd.read_csv(r"D:\TEL-U\BANGKIT!\NARASPEAK\DATASET\C4_200M.tsv-00000-of-00010", delimiter='\t', nrows=100000)
df.columns = ["Incorrect", "Correct"]

In [3]:
df

Unnamed: 0,Incorrect,Correct
0,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ..."
1,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...
2,Much many brands and sellers still in the market.,Many brands and sellers still in the market.
3,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...
4,"Fairy Or Not, I'm the Godmother: no just look,...","Fairy Or Not, I'm the Godmother: Not just a lo..."
...,...,...
99995,The Toy’s slump seems to coincide with the dis...,The Toy’s slump seems to coincide with the dis...
99996,It took me 4 hours to find this stupid failure.,It took me 4 hours to find this stupid problem.
99997,"Made of high tech thermoplastic rubber, TPR (D...","Made of high-tech thermoplastic rubber, TPR (D..."
99998,According to a study written by the Annie E. C...,"According to a study by the Annie E. Casey, Fo..."


In [4]:
# Adding length features


df['correct_char_count'] = df['Correct'].astype('str').apply(lambda x:len(x))
df['incorrect_char_count'] = df['Incorrect'].astype('str').apply(lambda x:len(x))

df['correct_word_count'] = df['Correct'].astype('str').apply(lambda x:len(x.split()))
df['incorrect_word_count'] = df['Incorrect'].astype('str').apply(lambda x:len(x.split()))

In [5]:
df.head(5)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
0,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ...",92,87,16,15
1,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...,355,334,63,59
2,Much many brands and sellers still in the market.,Many brands and sellers still in the market.,44,49,8,9
3,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...,55,54,10,10
4,"Fairy Or Not, I'm the Godmother: no just look,...","Fairy Or Not, I'm the Godmother: Not just a lo...",100,94,20,18


# Preprocessing
Removing Missing/NA

In [6]:

pd.DataFrame(df.isna().sum(),columns=['missing_count'])

Unnamed: 0,missing_count
Incorrect,0
Correct,0
correct_char_count,0
incorrect_char_count,0
correct_word_count,0
incorrect_word_count,0


In [7]:

df[df.isna().any(axis=1)]

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count


In [8]:

df = df.dropna().reset_index(drop=True)

In [9]:

df.shape

(100000, 6)

Keep unique sentence pairs

In [10]:
print(f"total number of duplicate pairs: {len(df[df['Correct']==df['Incorrect']])}")

total number of duplicate pairs: 590


In [11]:

df[df['Correct']==df['Incorrect']].sample(5)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
8905,When is Chronic Ink Tattoo Vancouver Opening?,When is Chronic Ink Tattoo Vancouver Opening?,45,45,7,7
84608,Generating reports for traffic analysis.,Generating reports for traffic analysis.,40,40,5,5
28665,3. What is one of Tamars jobs?,3. What is one of Tamars jobs?,30,30,7,7
54106,Free telephone consultation - please call 0772...,Free telephone consultation - please call 0772...,55,55,8,8
91105,"Throne of Bhaal novelization, Minsc?","Throne of Bhaal novelization, Minsc?",36,36,5,5


In [12]:
df = df[df['Correct']!=df['Incorrect']]

In [13]:
df.shape

(99410, 6)

In [14]:
df.sample(5)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
90811,I have a very simple text sketch in Which I'm ...,I have a very simple test sketch in which I'm ...,134,135,28,28
41022,I'm happy that I played the game thanks for SI...,"I'm happy that I played the game, thanks to SI...",141,140,29,29
87784,Oil prices were also supported by OPEC's decis...,Oil prices were also supported by OPEC's decis...,142,134,23,22
93313,Are you want to save €17.15 per year?,Do you want to save €17.15 per year?,36,37,8,8
22091,4 Latins and No Latins express yes to them!,4. No means yes to them!,24,43,6,9


Remove Duplicates

In [15]:

print(f'total number of duplicates: {df.duplicated().sum()}')

total number of duplicates: 0


In [16]:

df[df.duplicated(keep=False)].sort_values('Correct')

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count


In [17]:

df = df.drop_duplicates().reset_index(drop=True)

In [18]:
df.shape

(99410, 6)

In [19]:
df.sample(5)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
94552,The first question many new comers ask is... w...,"The first question many new comers ask is, wha...",183,154,30,26
90835,Mar5: 26 5:40 PM PT6:40 PM MT7:40 PM Yukat CT8...,Mar. 26 5:40 PM PT6:40 PM MT7:40 PM CT8:40 PM ...,284,265,56,51
54761,"We love to take custom orders, If you have any...","We love to take custom orders, if you have any...",191,204,42,42
87095,The home must be on minimum one acre land that...,The home must be on at minimum one acre land t...,174,196,32,34
57785,And don't expect Deron Williams do much better...,And don't expect Deron Williams to do much bet...,173,162,33,29


Remove Small sentences

In [20]:
df[df['incorrect_char_count']<2].shape

(0, 6)

In [21]:

df = df[df['incorrect_char_count']>2].reset_index(drop=True)

In [22]:
df.shape

(99409, 6)

In [23]:
df[df['correct_char_count']<2].shape

(0, 6)

In [24]:
df[df['correct_char_count']<2].sample

<bound method NDFrame.sample of Empty DataFrame
Columns: [Incorrect, Correct, correct_char_count, incorrect_char_count, correct_word_count, incorrect_word_count]
Index: []>

In [25]:
df = df[df['correct_char_count']>2].reset_index(drop=True)

In [26]:
df.shape

(99409, 6)

Cleaning Text

In [27]:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have","n\'t":" not","\'re":" are","\'s": " is","\'d":" would",
                     "\'ll": " will","\'t":" not","\'ve": " have","\'m":" am"}


# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

In [28]:
def clean(text):
    text = re.sub('\s*\<.*?\>\s', '', text)
    text = re.sub('\s*.*?\s', '', text)
    text = re.sub('\s*.*?\s', '', text)
    text = re.sub('\s*\{.*?\}\s', '', text)
    text = re.sub("[-+@#^/|*(){}$~<>=_%:;]","",text)
    text = text.replace("\\","")
    text = re.sub("\[","",text)
    text = re.sub("\]","",text)
    text = re.sub("\<","",text)
    text = re.sub("\>","",text)
    text = re.sub("\(","",text)
    text = re.sub("\)","",text)
    text = re.sub("[0-9]","",text)
    text = ' '.join(text.split())
    return text

In [29]:

# df['Correct'] = df['Correct'].progress_apply(clean)
df['Correct'] = df['Correct'].progress_apply(expand_contractions)

100%|██████████| 99409/99409 [00:04<00:00, 22774.89it/s]


In [30]:

# df['Incorrect'] = df['Incorrect'].progress_apply(clean)
df['Incorrect'] = df['Incorrect'].progress_apply(expand_contractions)

100%|██████████| 99409/99409 [00:04<00:00, 24750.67it/s]


In [31]:
df.sample(10)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
46408,Erection of a detached two storey dwellnbryhou...,Erection of a detached two storey dwellinghous...,129,130,17,17
31916,"Russia: According to SKRIN Newswire, the presi...","Russia: According to SKRIN Newswire, the presi...",179,190,25,27
90461,"63:36 Shot by SUNY Purchase Iurillo), Anthony ...","63:36 Shot by SUNY Purchase Iurillo, Anthony, ...",73,82,12,14
94348,Patients need nurses more than ever in their f...,Patients need nurses more than ever in their f...,56,55,10,10
47522,Contact vtg to drop your comumnication infrast...,Contact VTG to drop your communication infrast...,80,80,11,11
89154,Click on anyone is name in the Onlin tab to ch...,Click on anyone is name in the Online tab to j...,70,80,13,15
59709,The hotel also serves meals with indoor restra...,The hotel also serves meals with an indoor res...,74,72,12,11
34159,"A future or a ca,",A Future or a Funeral?,22,17,5,5
33094,"We’ll file, shape, tidy cuticles, exfoliate an...","We’ll file, shape, tidy cuticles, exfoliate an...",109,117,18,19
47369,Do facial expressions can make in any measuran...,Do facial expressions differ in any measurable...,117,119,20,21


In [32]:
def remove_spaces(text) : 
    text = re.sub(r" '(\w)",r"'\1",text)
    text = re.sub(r" \,",",",text)
    text = re.sub(r" \.+",".",text)
    text = re.sub(r" \!+","!",text)
    text = re.sub(r" \?+","?",text)
    text = re.sub(" n't","n't",text)
    text = re.sub("[\(\)\;\_\^\`\/]","",text)
    
    return text


def decontract(text) : 
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    
    return text



In [33]:
def WrongSentence_preprocessing(col) : 
    df[col] = df[col].astype(str) 
    df[col] = df[col].apply(lambda x:x.lower()) 
    df[col] = df[col].apply(lambda x : re.sub("\n","",x)) 
    df[col] = df[col].apply(lambda x : remove_spaces(x)) 
    df[col] = df[col].apply(lambda x : re.sub(r"\.+",".",x)) 
    df[col] = df[col].apply(lambda x : re.sub(r"\!+","!",x)) 
    df[col] = df[col].apply(lambda x : decontract(x)) 
    df[col] = df[col].apply(lambda x: re.sub("[^A-Za-z\s]","",x)) 
    df[col] = df[col].apply(lambda x: x.replace("\s+"," "))
    df[col] = df[col].apply(lambda x: " ".join([word for word in x.split()]))
    return df

def TrueSentence_preprocessing(col) : 
    df[col] = df[col].astype(str) 
    df[col] = df[col].apply(lambda x:x.lower()) 
    df[col] = df[col].apply(lambda x : re.sub("\n","",x)) 
    df[col] = df[col].apply(lambda x : remove_spaces(x)) 
    df[col] = df[col].apply(lambda x : re.sub(r"\.+",".",x)) 
    df[col] = df[col].apply(lambda x : re.sub(r"\!+","!",x)) 
    df[col] = df[col].apply(lambda x : decontract(x)) 
    df[col] = df[col].apply(lambda x: re.sub("[^A-Za-z\s]","",x)) 
    df[col] = df[col].apply(lambda x: re.sub(r'\d','',x))
    df[col] = df[col].apply(lambda x: re.sub(r'\s+',' ',x))
    df[col] = df[col].apply(lambda x: re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,।]", "", x))
    df[col] = df[col].apply(lambda x: x.strip())
    # df[col] = "<sos> " + df[col] + " <eos>"
    return df

In [34]:
df = WrongSentence_preprocessing('Incorrect') 
df = TrueSentence_preprocessing('Correct')

In [35]:
df.sample(10)

Unnamed: 0,Incorrect,Correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
61568,can it be feasible need to come up with the po...,where feasible you need to have the possibilit...,156,169,26,30
24144,a popuraled breakfast payday in expat at chian...,a popular breakfast spot for expats in chiang mai,50,52,9,9
41258,d tokyo has been released,delphi tokyo has been released,36,31,6,6
9091,incorporate shelves etertain areas such as din...,incorporate shelves in entertainment areas suc...,150,137,22,20
88929,how my garden is grew,how did my garden grow,23,22,5,5
1700,zach plays football and wrestlers for the west...,zach plays football and wrestlers for the west...,71,72,11,11
18800,they are estimated that the money is breaker,they give the rider an estimated total cost,44,45,8,8
1304,join us in elgin cool wine country food festiv...,join us during the elgin cool wine country foo...,213,201,37,35
12738,omg i want big wheel that so bad i still had p...,omg i wanted a big wheel so bad i still have t...,89,93,19,19
12534,whne should i use it email address,when should i use a custom email address,41,36,8,7


In [36]:
df.to_csv('cleaned_data.csv', index=False)