# Natural Language Processing

# This notebook is for Text prepocessing using NLTK library
# Here we follow the processes to clean the textual data for our final output database
1. Import needed libraries
2. Open text file
3. remove newlines, join the text based of advertise & normalize (lowercase) the text
4. Contractions Replacement & Replace Important text for variable.
5. Punctuations Removal
6. tokenize the word, stop words & custom words removal
7. Repeating characters removal (Typeing errors Ex: Bedrooom - Bedroom)
8. Check the spelling of words and correct them
9. Merge the tokenize text to list
10. Replace word numbers to integer numbers
11. Lemmitizations

# Libararies

In [1]:
from nltk.tokenize import word_tokenize
import pandas as pd
import re
from nltk.corpus import stopwords
from itertools import groupby

In [2]:
# Open text file
with open(r"D:\R & D PROJECT\Test_text\Test_Text.txt", "r") as filename:
    advertise_lines = filename.readlines()

# Text Preprocessing

In [3]:
advertise_lines[:18]

['new line\n',
 'TWO STOREY HOUSE FOR SALE - NEGONBO\n',
 '\n',
 'Two storey air conditioned house for sale\n',
 '\n',
 'at Thushara Mawatha. 2 mins to Negombo\n',
 '\n',
 'Colombo Road. 5 mins to Negombo Town.\n',
 'Lobby, Living, Dinning,-Ki Kitchen Pantry, Balcony\n',
 '\n',
 '4 bed rooms, 3 bathrooms; 3000 sq feet, solar\n',
 'power hot water’ ‘system nmatap water with upper\n',
 '& down tanks, Garbage service 3 times a week\n',
 'Land area 12 perches\n',
 'PRICE - 28000000/= Negotiable\n',
 '\n',
 'TEL : 076 53312545\n',
 '\n']

# Remove new lines and join the text

In [4]:
# Join the lines and Remove punctuation
newline = "\n".join ([line.strip() for line in advertise_lines])
combine_ad_lst = []
for line in newline.split("new line"):
    if not line.strip():
            continue
    combine_ad_lst.append(line.lstrip().lower())
ad_list = [new.replace("\n", " ") for new in combine_ad_lst]

In [5]:
ad_list[:5]

['two storey house for sale - negonbo  two storey air conditioned house for sale  at thushara mawatha. 2 mins to negombo  colombo road. 5 mins to negombo town. lobby, living, dinning,-ki kitchen pantry, balcony  4 bed rooms, 3 bathrooms; 3000 sq feet, solar power hot water’ ‘system nmatap water with upper & down tanks, garbage service 3 times a week land area 12 perches price - 28000000/= negotiable  tel : 076 53312545   ',
 'ratmalana  house for sale  ratmalana - borupana road 11.8 perches land and house for sale. 3 bedrooms, 2 toilets (one attached) hall, dining hal!, pantry and kitchen.  wall right round with front garden, rear garden and with two court yards walking distance to cargills super and kdu. rs. 32 million, negotiable. no brokers, only genuine buyers.  077°38 90 545   ',
 'katubedda  pbalwis perera mawatha, 300 m from galle road, 4 spacious bedrooms, 2 attached bathrooms, large living and dining area, spacious pantry, outside extra  kitchen, toilet, garage, parapet wall a

In [6]:
len(ad_list)

178

# Punctuations

In [7]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
def clean(punlist):
    punc_clean_lst = []
    punc = '''!"#$%&\'()*+,-‘./:;<=>?@[\\]^_`¢{|}~«»—©®°'''
    for text in punlist:
        text_punc = "".join([word for word in text if word not in punc])
        singl_char = ' '.join( [re.sub(r"\b[a-zA-Z]\b", "", w) for w in text_punc.split()]) # remove singel string character only 
        num_char = re.sub(r'(?<=\d) (?=\d)', '', singl_char)
        punc_clean_lst.append(num_char)
    return punc_clean_lst
punc_clean_lst=clean(ad_list)

In [9]:
punc_clean_lst[:5]

['two storey house for sale negonbo two storey air conditioned house for sale at thushara mawatha 2 mins to negombo colombo road 5 mins to negombo town lobby living dinningki kitchen pantry balcony 4 bed rooms 3 bathrooms 3000 sq feet solar power hot water’ system nmatap water with upper down tanks garbage service 3 times  week land area 12 perches price 28000000 negotiable tel 07653312545',
 'ratmalana house for sale ratmalana borupana road 118 perches land and house for sale 3 bedrooms 2 toilets one attached hall dining hal pantry and kitchen wall right round with front garden rear garden and with two court yards walking distance to cargills super and kdu rs 32 million negotiable no brokers only genuine buyers 0773890545',
 'katubedda pbalwis perera mawatha 300  from galle road 4 spacious bedrooms 2 attached bathrooms large living and dining area spacious pantry outside extra kitchen toilet garage parapet wall around 19 perches asking price 360 lakhs inspection by appointment only oy

# Contractions replacement 

In [10]:
replacement_patterns = [
                (r'won\'t', 'will not'),
                (r'can\'t', 'cannot'),
                (r'i\'m', 'i am'),
                (r'ain\'t', 'is not'),
                (r'(\w+)\'ll', '\g<1> will'),
                (r'(\w+)n\'t', '\g<1> not'),
                (r'(\w+)\'ve', '\g<1> have'),
                (r'(\w+)\'s', '\g<1> is'),
                (r'(\w+)\'re', '\g<1> are'),
                (r'(\w+)\'d', '\g<1> would')
                ]

In [11]:
class RegexpReplacer(object):
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
    
    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = re.sub(pattern, repl, s)
        return s

replacer = RegexpReplacer()
cont_rep_lst = [replacer.replace(contraction) for contraction in punc_clean_lst]

In [12]:
cont_rep_lst[1]

'ratmalana house for sale ratmalana borupana road 118 perches land and house for sale 3 bedrooms 2 toilets one attached hall dining hal pantry and kitchen wall right round with front garden rear garden and with two court yards walking distance to cargills super and kdu rs 32 million negotiable no brokers only genuine buyers 0773890545'

# Word tokenize and Stop word removal

In [13]:
custom_stop = ['sale','rent','land','phone','date','email','please','call','details','sri' ,'lanka','style','beautiful','modern','contact','attached','new','brand','negotiable']

In [14]:
def preprocessing(textlist):

    word_token =[]
    for lists in textlist:
        word_token.append(word_tokenize(lists))

    
    tokenized_word = [[] for i in range(len(word_token))]
    stop = stopwords.words('english')
    stop.extend(custom_stop)
    for i in range(len(word_token)):
        for lst in word_token[i]: 
            if lst not in stop:
                tokenized_word[i].append(lst)
                
    return tokenized_word
tokenize_list=preprocessing(cont_rep_lst)
col_cleaned_list = [lst for lst in tokenize_list if len(lst)>15] # 2 pages pdf splited as 2 images so remove non content set
cont_num_trim_list = [[s[:10] if s.isdigit() else s for s in sub] for sub in col_cleaned_list]

In [15]:
print(cont_num_trim_list[1])

['ratmalana', 'house', 'ratmalana', 'borupana', 'road', '118', 'perches', 'land', 'house', '3', 'bedrooms', '2', 'toilets', 'one', 'hall', 'dining', 'hal', 'pantry', 'kitchen', 'wall', 'right', 'round', 'front', 'garden', 'rear', 'garden', 'two', 'court', 'yards', 'walking', 'distance', 'cargills', 'super', 'kdu', 'rs', '32', 'million', 'brokers', 'genuine', 'buyers', '0773890545']


# Separate the numbers and sting from the list

In [16]:
sep_num_char_lst = []
for lsts in cont_num_trim_list:
    charnum_lst = [[''.join(g) for _, g in groupby(num_char, str.isalpha)] for num_char in lsts]
    comp_lst = [char for sep_char in charnum_lst for char in sep_char]
    sep_num_char_lst.append(comp_lst)
print(sep_num_char_lst[1])

['ratmalana', 'house', 'ratmalana', 'borupana', 'road', '118', 'perches', 'land', 'house', '3', 'bedrooms', '2', 'toilets', 'one', 'hall', 'dining', 'hal', 'pantry', 'kitchen', 'wall', 'right', 'round', 'front', 'garden', 'rear', 'garden', 'two', 'court', 'yards', 'walking', 'distance', 'cargills', 'super', 'kdu', 'rs', '32', 'million', 'brokers', 'genuine', 'buyers', '0773890545']


In [17]:
#uniq_val_lst = [list(set(lst)) for lst in prepros_list]    #unique characters

# Repeating Charactors

In [18]:
import re
from nltk.corpus import wordnet
class RepeatReplacer(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\[a-zA-Z]*)(\[a-zA-Z])\2(\[a-zA-Z]*)')    # 0 or more repeat char in first, middle and last
        self.repl = r'\1\2\3'
    def replace(self, word):
        if wordnet.synsets(word):
            return word
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.replace(repl_word)
        else:
            return repl_word

In [19]:
rep_replacer = RepeatReplacer()
rep_char_cleaned_lst = [[rep_replacer.replace(txt) for txt in lst] for lst in sep_num_char_lst]
print(rep_char_cleaned_lst[1])

['ratmalana', 'house', 'ratmalana', 'borupana', 'road', '118', 'perches', 'land', 'house', '3', 'bedrooms', '2', 'toilets', 'one', 'hall', 'dining', 'hal', 'pantry', 'kitchen', 'wall', 'right', 'round', 'front', 'garden', 'rear', 'garden', 'two', 'court', 'yards', 'walking', 'distance', 'cargills', 'super', 'kdu', 'rs', '32', 'million', 'brokers', 'genuine', 'buyers', '0773890545']


# Spelling checking

In [20]:
#!pip install pyenchant

In [21]:
import enchant
from nltk.metrics import edit_distance
class SpellingReplacer(object):
    def __init__(self, dict_name='en_GB', max_dist=2):
        self.spell_dict = enchant.Dict(dict_name)
        self.max_dist = max_dist
    def replace(self, word):
        if self.spell_dict.check(word):
            return word
        suggestions = self.spell_dict.suggest(word)
        if suggestions and edit_distance(word, suggestions[0]) <= self.max_dist:
            return suggestions[0]
        else:
            return word

In [22]:
spel_replacer = SpellingReplacer()
spell_corr_lst = [[spel_replacer .replace(txt) for txt in lst] for lst in rep_char_cleaned_lst]
print(spell_corr_lst[1])

['ratmalana', 'house', 'ratmalana', 'borupana', 'road', '118', 'perches', 'land', 'house', '3', 'bedrooms', '2', 'toilets', 'one', 'hall', 'dining', 'Hal', 'pantry', 'kitchen', 'wall', 'right', 'round', 'front', 'garden', 'rear', 'garden', 'two', 'court', 'yards', 'walking', 'distance', 'car gills', 'super', 'kudu', 'rs', '32', 'million', 'brokers', 'genuine', 'buyers', '0773890545']


# Lemmatizing

In [23]:
from textblob import Word
lem_text = [[] for i in range(len(spell_corr_lst))]
for i in range(len(spell_corr_lst)):
    for word in spell_corr_lst[i]:
        txt = Word(word).lemmatize()
        lem_text[i].append(txt)

In [24]:
print(lem_text[1])

['ratmalana', 'house', 'ratmalana', 'borupana', 'road', '118', 'perch', 'land', 'house', '3', 'bedroom', '2', 'toilet', 'one', 'hall', 'dining', 'Hal', 'pantry', 'kitchen', 'wall', 'right', 'round', 'front', 'garden', 'rear', 'garden', 'two', 'court', 'yard', 'walking', 'distance', 'car gills', 'super', 'kudu', 'r', '32', 'million', 'broker', 'genuine', 'buyer', '0773890545']


# Merge Tokenize words

In [25]:
def untokenize(ad_lsts):
    untokized_lst = []
    for lst in ad_lsts:
        text = ' '.join(lst)
        step1 = text.replace("`` ", '"').replace(" ''", '"')
        step2 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step1)
        untokized_lst.append(step2.strip())
    return untokized_lst

In [26]:
untokenized_lst=untokenize(lem_text)
untokenized_lst[:5]

['two storey house negonbo two storey air conditioned house thushara Hiawatha 2 mind negombo Colombo road 5 mind negombo town lobby living dinning ki kitchen pantry balcony 4 bed room 3 bathroom 3000 sq foot solar power hot water e system nmatap water upper tank garbage service 3 time week land area 12 perch price 28000000 tel 0765331254',
 'ratmalana house ratmalana borupana road 118 perch land house 3 bedroom 2 toilet one hall dining Hal pantry kitchen wall right round front garden rear garden two court yard walking distance car gills super kudu r 32 million broker genuine buyer 0773890545',
 'katubedda pbalwis per era Hiawatha 300 Galle road 4 spacious bedroom 2 bathroom large living dining area spacious pantry outside extra kitchen toilet garage parapet wall around 19 perch asking price 360 lakh inspection appointment ovary wry aye',
 'luxury house Colombo 6 2500 fast 4 bedroom ac 3 bathroom hot water parking 2 rolled shutter gate oz unfurnished upstairs house 225 k pm garden backy

# Custom words replacement

In [27]:
custom_word = [ (r'\smn', 'million'),
                (r'\d\smillion','000000'),
                 (r'sq\sft', 'sqft'),
                 (r'fast', 'sqft'),
                 (r'sift', 'sqft'),
                (r'lkh', 'lakh'),
                (r'lakh', '00000'),
                (r'bn', 'billion'),
                (r'barth', 'bath'),
                (r'bath\sroom', 'bathroom'),
                (r'bed\sroom', 'bedroom'),
                (r'toilel', 'bathroom'),
                (r'wash\sroom', 'bathroom')
          ]

In [28]:
class CustomReplacer(object):
    def __init__(self, patterns=custom_word):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]
    
    def replace(self, text):
        s = text
        for (pattern, repl) in self.patterns:
            s = re.sub(pattern, repl, s)
        return s

replacer = CustomReplacer()
costum_word_rep_lst = [replacer.replace(contraction) for contraction in untokenized_lst]

In [29]:
costum_word_rep_lst[:10]

['two storey house negonbo two storey air conditioned house thushara Hiawatha 2 mind negombo Colombo road 5 mind negombo town lobby living dinning ki kitchen pantry balcony 4 bedroom 3 bathroom 3000 sq foot solar power hot water e system nmatap water upper tank garbage service 3 time week land area 12 perch price 28000000 tel 0765331254',
 'ratmalana house ratmalana borupana road 118 perch land house 3 bedroom 2 toilet one hall dining Hal pantry kitchen wall right round front garden rear garden two court yard walking distance car gills super kudu r 3000000 broker genuine buyer 0773890545',
 'katubedda pbalwis per era Hiawatha 300 Galle road 4 spacious bedroom 2 bathroom large living dining area spacious pantry outside extra kitchen toilet garage parapet wall around 19 perch asking price 360 00000 inspection appointment ovary wry aye',
 'luxury house Colombo 6 2500 sqft 4 bedroom ac 3 bathroom hot water parking 2 rolled shutter gate oz unfurnished upstairs house 225 k pm garden backyard

# Replace word numbers as integer numbers

In [30]:
def text2int(textnum, numwords={}):
    if not numwords:
        units = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):  numwords[word] = (1, idx)
        for idx, word in enumerate(tens):       numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')

    current = result = 0
    curstring = ""
    onnumber = False
    for word in textnum.split():
        if word in ordinal_words:
            scale, increment = (1, ordinal_words[word])
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending):
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if word not in numwords:
                if onnumber:
                    curstring += repr(result + current) + " "
                curstring += word + " "
                result = current = 0
                onnumber = False
            else:
                scale, increment = numwords[word]

                current = current * scale + increment
                if scale > 100:
                    result += current
                    current = 0
                onnumber = True

    if onnumber:
        curstring += repr(result + current)

    return curstring
final_preprocessed_lst = [text2int(text) for text in costum_word_rep_lst] 

In [31]:
final_preprocessed_lst[:10]

['2 storey house negonbo 2 storey air conditioned house thushara Hiawatha 2 mind negombo Colombo road 5 mind negombo town lobby living dinning ki kitchen pantry balcony 4 bedroom 3 bathroom 3000 sq foot solar power hot water e system nmatap water upper tank garbage service 3 time week land area 12 perch price 28000000 tel 0765331254 ',
 'ratmalana house ratmalana borupana road 118 perch land house 3 bedroom 2 toilet 1 hall dining Hal pantry kitchen wall right round front garden rear garden 2 court yard walking distance car gills super kudu r 3000000 broker genuine buyer 0773890545 ',
 'katubedda pbalwis per era Hiawatha 300 Galle road 4 spacious bedroom 2 bathroom large living dining area spacious pantry outside extra kitchen toilet garage parapet wall around 19 perch asking price 360 00000 inspection appointment ovary wry aye ',
 'luxury house Colombo 6 2500 sqft 4 bedroom ac 3 bathroom hot water parking 2 rolled shutter gate oz unfurnished upstairs house 225 k pm garden backyard avai

In [32]:
len(final_preprocessed_lst)

157

# Save as Text file separately train and test

In [33]:
output_file = r"D:\R & D PROJECT\Test_text\test_data.txt"
f = open(output_file, "w")
for data in final_preprocessed_lst:
    f.write("%s,\n"% data)
f.close()

In [34]:
with open(r"D:\R & D PROJECT\Test_text\test_data.txt", "r") as filename:
    train_file = filename.read()

In [35]:
cleaned_text = []
for line in train_file.split(","):
    if not line.strip():
            continue
    cleaned_text.append(line.lstrip())

In [36]:
cleaned_text[1]

'ratmalana house ratmalana borupana road 118 perch land house 3 bedroom 2 toilet 1 hall dining Hal pantry kitchen wall right round front garden rear garden 2 court yard walking distance car gills super kudu r 3000000 broker genuine buyer 0773890545 '