# Ekstrak & Pre-proc Dataset

In [None]:
import tensorflow as tf

# Report the TensorFlow build and how many TPU devices the runtime exposes.
print("TensorFlow version:", tf.__version__)
# The query below asks for 'TPU' devices, so the label says TPUs
# (it previously read "GPUs", which misdescribed the number printed).
print("Num TPUs Available: ", len(tf.config.list_physical_devices('TPU')))

TensorFlow version: 2.15.0
Num GPUs Available:  8


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import warnings
# warnings.filterwarnings("ignore")
import pandas as pd
import re
from tqdm.auto import tqdm #
tqdm.pandas()

In [None]:
# Load the first 500,000 data rows of one tab-separated C4_200M shard from the mounted Drive.
# NOTE(review): read_csv defaults to header=0, so the first line of the file is
# consumed as column names and then discarded by the rename below. If the shard
# has no header row, that line is a real sentence pair -- confirm, and if so pass
# header=None, names=["incorrect", "correct"] instead of renaming afterwards.
df = pd.read_csv(r"/content/drive/MyDrive/Dataset/C4_200M.tsv-00000-of-00010", delimiter='\t', nrows=500000)
# Overwrite whatever column labels were parsed with the two names used by the rest of the notebook.
df.columns = ["incorrect", "correct"]

In [None]:
df

Unnamed: 0,incorrect,correct
0,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ..."
1,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...
2,Much many brands and sellers still in the market.,Many brands and sellers still in the market.
3,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...
4,"Fairy Or Not, I'm the Godmother: no just look,...","Fairy Or Not, I'm the Godmother: Not just a lo..."
...,...,...
499995,If you were lucky enough to grab PAX it here i...,If you were lucky enough to grab a PAX pack he...
499996,"This page has last edited at December 20, 2018...","This page was last edited on December 20, 2018..."
499997,suction cup shower shelves bathroom corner sto...,suction cup shower shelves bathroom corner sto...
499998,W work over pass 2012 can be seen in the photo...,Woman working past 2012 can be seen in the pho...


In [None]:
# Length features: character counts first, then whitespace-delimited word
# counts, for both text columns (column creation order is preserved).
# Cells are stringified before measuring, so a missing value is measured as
# the text "nan".
text_by_side = {side: df[side].astype('str') for side in ('correct', 'incorrect')}

for side, as_text in text_by_side.items():
    df[f'{side}_char_count'] = as_text.map(len)

for side, as_text in text_by_side.items():
    df[f'{side}_word_count'] = as_text.map(lambda sentence: len(sentence.split()))

In [None]:
df.head(5)

Unnamed: 0,incorrect,correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
0,The effect of widespread dud targets two face ...,"1. The effect of ""widespread dud"" targets two ...",92,87,16,15
1,tax on sales of stores for non residents are s...,Capital Gains tax on the sale of properties fo...,355,334,63,59
2,Much many brands and sellers still in the market.,Many brands and sellers still in the market.,44,49,8,9
3,this is is the latest Maintenance release of S...,This is is the latest maintenance release of S...,55,54,10,10
4,"Fairy Or Not, I'm the Godmother: no just look,...","Fairy Or Not, I'm the Godmother: Not just a lo...",100,94,20,18


# Preprocessing
Removing Missing/NA

In [None]:
pd.DataFrame(df.isna().sum(),columns=['missing_count'])

Unnamed: 0,missing_count
incorrect,0
correct,2
correct_char_count,0
incorrect_char_count,0
correct_word_count,0
incorrect_word_count,0


In [None]:
df[df.isna().any(axis=1)]

Unnamed: 0,incorrect,correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
152174,Some information and sources for the Fourier c...,,3,234,1,38
487904,Unpacking the Effects of Repression: the Evolu...,,3,375,1,55


In [None]:

df = df.dropna().reset_index(drop=True)

In [None]:

df.shape

(499998, 6)

Drop pairs whose incorrect and correct sentences are identical

In [None]:
print(f"total number of duplicate pairs: {len(df[df['correct']==df['incorrect']])}")

total number of duplicate pairs: 3072


In [None]:

df[df['correct']==df['incorrect']].sample(5)

Unnamed: 0,incorrect,correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
301248,washer hose home depot home depot washing mach...,washer hose home depot home depot washing mach...,199,199,31,31
142629,We support equity planning and meaningful comm...,We support equity planning and meaningful comm...,64,64,8,8
318672,"Use jigdo to download your image(s) now, and s...","Use jigdo to download your image(s) now, and s...",94,94,16,16
486985,black iron twin bed full size of bed metal twi...,black iron twin bed full size of bed metal twi...,123,123,24,24
203422,For associated major establishment of HM Docky...,For associated major establishment of HM Docky...,138,138,23,23


In [None]:
df = df[df['correct']!=df['incorrect']]

In [None]:
df.shape

(496926, 6)

In [None]:
df.sample(5)

Unnamed: 0,incorrect,correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
132202,a) The scholar have already forgotten the prin...,a) The scholar has already forgotten the princ...,181,172,28,27
129719,that county commissioners that.,county commissioners that emergency exists.,43,31,5,4
10285,Iconoclast... libera commuter which makes a gu...,"Iconoclast... liberal Utahn who likes guns, li...",121,116,21,20
466289,One of the workshops that I attended Men in Th...,One of the workshops that I attended at the Me...,216,209,37,35
6544,"The petition, initiated from some fasceless pe...","The petition, initiated by some faceless perso...",298,297,47,46


Remove Duplicates

In [None]:

print(f'total number of duplicates: {df.duplicated().sum()}')

total number of duplicates: 0


In [None]:
df[df.duplicated(keep=False)].sort_values('correct')

Unnamed: 0,incorrect,correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count


In [None]:

df = df.drop_duplicates().reset_index(drop=True)

In [None]:
df.shape

(496926, 6)

In [None]:
df.sample(5)

Unnamed: 0,incorrect,correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
361426,table track nickyp wood shop enhancements t tr...,table track nickyp wood shop enhancements t tr...,124,126,19,19
438726,GHS’ center for pediatric sleep deslorders is ...,GHS’ Center for Pediatric Sleep Disorders has ...,193,183,31,29
459335,Admiited admission is $10 and Include comprime...,General admission is $10 and includes complime...,61,60,8,8
114936,Next articleWhat’s My TWIST? Unexpected expert...,Next articleWhat’s My TWIST? Unexpected expert...,68,73,10,11
429860,to investigate this incident is a inside job a...,"to investigate this incident, another inside job.",49,51,7,9


Remove very short sentences (2 characters or fewer)

In [None]:
# Count the rows the next cell removes: it keeps only incorrect_char_count > 2,
# so the rows at risk are those with a count <= 2 (the old `< 2` check missed
# the rows with exactly 2 characters).
df[df['incorrect_char_count']<=2].shape

(0, 6)

In [None]:

df = df[df['incorrect_char_count']>2].reset_index(drop=True)

In [None]:
df.shape

(496920, 6)

In [None]:
# Count the rows the filter below removes: it keeps only correct_char_count > 2,
# so the rows at risk are those with a count <= 2 (the old `< 2` check missed
# the rows with exactly 2 characters).
df[df['correct_char_count']<=2].shape

(0, 6)

In [None]:
# Show the short 'correct' rows that the next cell drops (it keeps count > 2).
# `.head()` replaces the original `.sample`, which was never called (the cell
# only displayed the bound method) and whose call form raises on an empty
# selection.
df[df['correct_char_count']<=2].head()

In [None]:
df = df[df['correct_char_count']>2].reset_index(drop=True)

In [None]:
df.shape

(496920, 6)

Cleaning Text

In [None]:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not",
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have","n\'t":" not","\'re":" are","\'s": " is","\'d":" would",
                     "\'ll": " will","\'t":" not","\'ve": " have","\'m":" am"}


# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

In [None]:
def clean(text):
    """Strip bracketed spans, symbols and digits from `text`.

    Steps, in order:
      1. Remove whole `<...>`, `(...)`, `[...]` and `{...}` spans that are
         followed by whitespace, leaving a single space so the neighbouring
         words are not fused together.
      2. Delete the remaining symbol characters, backslashes, brackets and
         digits one character at a time (their surrounding text is kept).
      3. Collapse every run of whitespace to one space and trim the ends.

    The second and third span patterns previously had no bracket characters
    at all (`\\s*.*?\\s`), which deleted everything up to each whitespace and
    left only the last word of the sentence; they now target `(...)` and
    `[...]`, matching the `<...>` / `{...}` patterns around them.

    Parameters
    ----------
    text : str
        Sentence to clean.

    Returns
    -------
    str
        The cleaned sentence.
    """
    # 1. Bracketed spans (lazy match, same line only since `.` skips newlines).
    for opening, closing in (("<", ">"), ("(", ")"), ("[", "]"), ("{", "}")):
        span = r"\s*" + re.escape(opening) + r".*?" + re.escape(closing) + r"\s"
        text = re.sub(span, " ", text)

    # 2. Stray symbols, backslashes, brackets and digits, in a single pass.
    text = re.sub(r"[-+@#^/|*(){}$~<>=_%:;\\\[\]0-9]", "", text)

    # 3. Whitespace normalisation.
    return ' '.join(text.split())

In [None]:
# df['Correct'] = df['Correct'].progress_apply(clean)
df['correct'] = df['correct'].progress_apply(expand_contractions)

  0%|          | 0/496920 [00:00<?, ?it/s]

In [None]:
# df['Incorrect'] = df['Incorrect'].progress_apply(clean)
df['incorrect'] = df['incorrect'].progress_apply(expand_contractions)

  0%|          | 0/496920 [00:00<?, ?it/s]

In [None]:
df.sample(10)

Unnamed: 0,incorrect,correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
39962,The pewter bilby will be a nice addition to yo...,The pewter bilby would be a nice addition to y...,86,86,14,14
153813,"Basically, you will need to configure the West...","Basically, you will need to configure the West...",69,69,11,11
444360,Closed now ! Aiken & Hightower P.A.,Closed now Aiken & Hightower P.A.,33,35,6,7
356090,sneha’ first solo trip ever with WanderTrust !,"Sneha’ first solo trip ever, with WanderTrust !",47,46,8,8
136595,What is the weather like in Iowa City.,What is the weather like in Iowa City?,38,38,8,8
121057,Another sample animation i did for Heartstring...,Another sample animation I did for Heartstring...,87,86,13,13
230926,But that is no kind of reson to quit,But that is no reason to quit.,29,35,6,8
253079,Nothing to do with engine oil Transmission Fluid.,Nothing to do with Engine Oil. Transmission Fl...,50,49,8,8
454679,Click HERE to find out how I got brand new wed...,Click HERE to find out how I got a brand new w...,73,70,16,15
408047,pioneer also participated in goods such as Win...,Pioneer also worked on pieces of Windows Phone...,166,180,32,34


In [None]:
def remove_spaces(text):
    """Re-attach detached punctuation/clitics and drop a few stray symbols.

    The rewrites run in order:
      * a space before an apostrophe that starts a word character is removed
        (``it 's`` -> ``it's``);
      * a single space before ``,`` is removed;
      * a single space before a run of ``.``, ``!`` or ``?`` is removed and
        the run is reduced to one character;
      * ``" n't"`` is joined to the preceding word;
      * the characters ``( ) ; _ ^ ` /`` are deleted.

    Parameters
    ----------
    text : str
        Sentence to tidy.

    Returns
    -------
    str
        The rewritten sentence.
    """
    rewrites = (
        (r" '(\w)", r"'\1"),
        (r" \,", ","),
        (r" \.+", "."),
        (r" \!+", "!"),
        (r" \?+", "?"),
        (" n't", "n't"),
        (r"[\(\)\;\_\^\`\/]", ""),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text


def decontract(text):
    """Expand English contractions in `text` with ordered regex rewrites.

    The two irregular forms (``won't``, ``can't``) are handled first; the
    generic suffix rules follow, with ``n't`` applied before the bare ``'t``
    rule. Matching is case-sensitive.

    Parameters
    ----------
    text : str
        Sentence to expand.

    Returns
    -------
    str
        The sentence with contractions expanded.
    """
    expansions = [
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    ]
    expanded = text
    for contraction, full_form in expansions:
        expanded = re.sub(contraction, full_form, expanded)
    return expanded



In [None]:
def _normalize_sentence(text):
    """Lower-case `text` and reduce it to single-space-separated ASCII-letter words.

    Shared by both preprocessing entry points. The two original pipelines
    differed only in steps that could not change the result: a literal
    ``str.replace("\\s+", " ")`` that never matched, and digit/punctuation
    removals that ran after every non-letter had already been stripped.
    """
    text = text.lower()
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # words on either side of a line break are joined -- kept as in the original.
    text = text.replace("\n", "")
    text = remove_spaces(text)
    text = re.sub(r"\.+", ".", text)
    text = re.sub(r"\!+", "!", text)
    text = decontract(text)
    # Keep only ASCII letters and whitespace.
    text = re.sub(r"[^A-Za-z\s]", "", text)
    # Collapse whitespace runs and trim both ends.
    return " ".join(text.split())


def WrongSentence_preprocessing(col) :
    """Normalize column `col` of the notebook-level `df` in place.

    Parameters
    ----------
    col : str
        Name of the column holding the sentences to normalize.

    Returns
    -------
    pandas.DataFrame
        The same global `df`, with `col` replaced by its normalized text.
    """
    df[col] = df[col].astype(str).apply(_normalize_sentence)
    return df

def TrueSentence_preprocessing(col) :
    """Normalize column `col` of the notebook-level `df` in place.

    Applies the same normalization as `WrongSentence_preprocessing`, so both
    sides of a sentence pair are cleaned identically.

    Parameters
    ----------
    col : str
        Name of the column holding the sentences to normalize.

    Returns
    -------
    pandas.DataFrame
        The same global `df`, with `col` replaced by its normalized text.
    """
    df[col] = df[col].astype(str).apply(_normalize_sentence)
    # df[col] = "<sos> " + df[col] + " <eos>"
    return df

In [None]:
df = WrongSentence_preprocessing('incorrect')
df = TrueSentence_preprocessing('correct')

In [None]:
df.sample(10)

Unnamed: 0,incorrect,correct,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
227873,arts culture and data art culture and humaniti...,arts culture and data arts culture and humanit...,202,209,40,42
111709,this map was also a participfnt of cncmapscom ...,this map was also a participant of cncmapscom ...,81,81,13,13
114740,teachers and facilitators to develop and susta...,teachers and facilitators to develop and susta...,123,113,16,16
346359,now just have to recover and drool some more p...,now just sit back and drool some more p you ca...,152,159,29,31
32993,the tie up responds to the increasing demand f...,the tieup responds to the increasing demand fr...,190,190,32,33
48659,drink a large glass of water minites prior to ...,drink a large glass of water minutes prior to ...,58,58,12,12
178548,in the s it was known for hijacking grocery tr...,in the s it was known for hijacking grocery tr...,90,93,16,17
247695,oh my my i am drooling now looking just how de...,oh my i am drooling now just looking at how de...,71,74,13,14
120844,castle view wants to use the digital signage t...,castle view wanted to use the digital signage ...,269,275,39,40
424888,preclipping before bth does make it easier to ...,pre clipping before th bath does make it easie...,438,425,86,84


In [None]:
# Write the cleaned sentence pairs to cleaned_data.csv in the working directory, without the index.
# NOTE(review): the *_char_count / *_word_count columns were computed before
# contraction expansion and normalisation and are never recomputed, so the
# saved values describe the raw text, not the cleaned text -- recompute or drop
# them if downstream code relies on them.
df.to_csv('cleaned_data.csv', index=False)