#Information Extraction

Importing important libraries

In [None]:
# Using several libraries

import pandas as pd # to import the dataset
import spacy # for name entity recognition

import re # for regular expression


In [None]:
# Load the address dataset into a DataFrame.
# NOTE(review): the path looks like a Colab Google Drive mount - confirm the drive is mounted before running.
Address_Data = pd.read_csv("/content/drive/MyDrive/addresses.csv")

**The dataset contains the following** <br>
The return to address that you usually find on the packaging of a product. <br>


# Data pre-processing

In [None]:
# For each column, report whether it contains at least one missing value.
Missing_Values = Address_Data.isna().any()
# Bare expression so the notebook displays the result.
Missing_Values

returnTo     False
frequency    False
dtype: bool

In [None]:
# Dimensions of the dataset as (rows, columns).
Address_Data.shape

(977, 2)

In [None]:
# New_DataFrame = Address_Data.drop_duplicates()


Extract the following entities from the column: <br>
•	companyName <br>
•	companyAddress <br>
•	webAddress <br>
•	emailAddress <br>
•	telephoneNo



In [None]:
# Number of rows in the dataset.
Address_Data.shape[0]

977

In [None]:
# Look at one sample value of the 'returnTo' column (the row labelled 1).
Address_Data['returnTo'][1]

'Coty UK & I Ltd, Rimmel London, SW19 4DR.'

#Task: 1

Use a NON machine learning approach to achieve the goal. You can use a pretrained NLP model if you think it might be helpful

#Approach 1


In [None]:
# Run spaCy named entity recognition over every 'returnTo' address and print
# each detected entity together with its label.
# NOTE: this cell uses `nlp_obj`, which is created in a later cell; that cell
# must be run first.
empty_list = []  # collects the raw address strings, in row order

# Iterate over the column's values instead of looking rows up with
# Address_Data['returnTo'][i]: that form is a lookup by index *label* and
# raises KeyError as soon as the index is not exactly 0..n-1 (for example
# after dropping duplicate rows).
for address in Address_Data['returnTo']:
  empty_list.append(address)
  doc = nlp_obj(address)  # spaCy document for this address

  for ent in doc.ents:
    # ent.label_ is the entity type as a string (ent.label is its hash value).
    print(ent.text, ent.label_)

# My Approach
The initial approach extracts the data with spaCy's built-in named entity recognition (NER). Extracting relevant and usable information from unstructured raw text sources is known as information extraction. NER locates the named entities in unstructured text and classifies them into standard categories such as person names, locations, organisations, time expressions, amounts, monetary values, percentages, codes, and so on. However, only the organisation name can be extracted this way.

In [None]:
nlp_obj = spacy.load('en_core_web_sm')  # Load spaCy's 'en_core_web_sm' pipeline; nlp_obj(text) is used elsewhere in this notebook to get a document with entities


In [None]:
# Run the spaCy pipeline on one sample address (the row labelled 1).
document = nlp_obj(Address_Data['returnTo'][1])

# Render the entities that spaCy found inline in the notebook output.
from spacy import displacy
displacy.render(document, style = 'ent', jupyter = True)

The output received using this technique is not what I expected, as it can only extract the organisation name.

A class is created to extract all of the company's information. The data is unstructured, so the question is how to find the relevant information in the text. The organisation name is extracted using spaCy, while regular expressions (regex) are used to extract the email address, postal address, phone number and web address. An object of the class is then created, and each piece of information is retrieved simply by calling the corresponding method.

The advantage of this approach is that each extraction method can be unit-tested on its own, which makes testing easier and more efficient.

In [None]:
class InfoExtractor:
  """Pull company contact details out of one free-text address string.

  The company name comes from the named entities produced by the
  module-level spaCy pipeline ``nlp_obj``; the postcode, telephone number,
  e-mail address and web address are found with regular expressions.

  Every getter returns a ``(label, value)`` tuple. ``value`` is the empty
  string when nothing is found, and a ``', '``-separated string when several
  matches are found.
  """

  # The patterns are compiled once, when the class is created, instead of on
  # every call. They use non-capturing groups ``(?:...)`` on purpose: with
  # capturing groups ``findall`` returns tuples of the groups rather than the
  # matched text, which is what used to leak into the results.

  # UK-style postcode: the special code 'GIR 0AA', or outward code, a single
  # space, then inward code. ``\b`` stops a match from starting or ending in
  # the middle of a longer alphanumeric token.
  _POSTCODE_RE = re.compile(
      r'\b(?:GIR 0AA|[A-PR-UWYZ](?:[0-9][0-9A-HJKPS-UW]?|[A-HK-Y][0-9][0-9ABEHMNPRV-Y]?) [0-9][ABD-HJLNP-UW-Z]{2})\b')

  # Digit groups shaped 3-3-4, (3) 3-4 or 3-4, optionally separated by '-',
  # '.' or whitespace.
  # NOTE(review): numbers grouped differently (e.g. 4-3-4) may be matched only
  # partially - confirm against the data before relying on this field.
  _PHONE_RE = re.compile(
      r'(?:\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})')

  _EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+')

  # Accepts 'http://', 'https://' (each optionally followed by 'www.') and
  # also bare 'www.' addresses that have no scheme.
  _WEB_RE = re.compile(
      r'(?:https?://(?:www\.)?|www\.)[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&/=]*)')

  def __init__(self, string):
    """Store the text that the getters will search.

    Args:
      string: The free-text address to extract information from.
    """
    self.string = string

  @staticmethod
  def _join(values):
    """Return the found values as one string, separated by ', '."""
    return ', '.join(values)

  def getName(self):
    """Return ``('companyName:', name)`` using spaCy's named entities.

    The first entity labelled ``ORG`` is preferred. If there is none, the
    first entity of any kind is used; if there are no entities at all the
    name is the empty string.
    """
    entities = list(nlp_obj(self.string).ents)
    organisations = [ent for ent in entities if ent.label_ == 'ORG']
    candidates = organisations or entities
    name = candidates[0].text if candidates else ''
    return ('companyName:', name)

  def getAddress(self):
    """Return ``('companyAddress:', postcodes)``.

    Only the postcode part of the address is extracted.
    """
    postcodes = self._POSTCODE_RE.findall(self.string)
    return ('companyAddress:', self._join(postcodes))

  def getPhoneNo(self):
    """Return ``('telephoneNo:', numbers)`` with each number reduced to digits."""
    digits_only = [re.sub(r'\D', '', number)
                   for number in self._PHONE_RE.findall(self.string)]
    return ('telephoneNo:', self._join(digits_only))

  def getEmail(self):
    """Return ``('emailAddress:', emails)``."""
    # The pattern allows '.', so a full stop that merely ends the sentence
    # would otherwise be glued onto the address.
    emails = [email.rstrip('.') for email in self._EMAIL_RE.findall(self.string)]
    return ('emailAddress:', self._join(emails))

  def getWeb(self):
    """Return ``('webAddress:', urls)``."""
    # Same sentence-final full stop issue as for e-mail addresses.
    urls = [url.rstrip('.') for url in self._WEB_RE.findall(self.string)]
    return ('webAddress:', self._join(urls))


In [None]:
# Creating an object of InfoExtractor class
# Build an extractor for one sample address (the row labelled 99) and print
# the result of each getter.
info = InfoExtractor(Address_Data['returnTo'][99])
print(info.getName())
print(info.getAddress())
print(info.getEmail())
print(info.getPhoneNo())
print(info.getWeb())


Munchkin Inc
('companyName:', 'Munchkin Inc')
('companyAddress:', "('WF10 5HX', 'F10')")
('emailAddress:', 'careline@munchkin.com')
('telephoneNo:', '')
('webAddress:', '')


In [None]:
# Show the raw text of another sample address (the row labelled 100).
print(Address_Data['returnTo'][100])

E.T. Browne (U.K.) Ltd., Loughton, IG10 3FL, U.K. www.palmers.com
