# Information extraction from Cheshire Fire & Rescue reports using Deep Learning

## Set Up the Environment

In [None]:
import re
import os
import random
import spacy
from arcgis.learn import prepare_data, EntityRecognizer
import pandas as pd
from arcgis.gis import GIS
from arcgis.raster.functions import colormap
from arcgis.geocoding import batch_geocode
from word2number import w2n
from datetime import *

In [None]:
# Connect to the GIS through a saved profile ('your_online_profile' is a
# placeholder name - substitute your own).
gis = GIS(profile='your_online_profile')

## Data Preparation

Data preparation involves splitting the data into training and validation sets, creating the necessary data structures for loading data into the model and so on. The prepare_data() method can directly read the training samples in one of the above specified formats and automate the entire process.

In [None]:
# Load the labelled samples from "file.json" (NER JSON dataset type).
# class_mapping tells prepare_data that the 'Address' label fills the
# 'address_tag' role.
data = prepare_data("file.json", dataset_type='ner_json',class_mapping={'address_tag':'Address'})

The show_batch() method can be used to visualize the training samples along with their labels.

In [None]:
# Preview a batch of training samples together with their labels.
data.show_batch()

## Model Training
First, we create the model using the `EntityRecognizer` class, passing it the data object.
Training the model is an iterative process. We can keep training the model with its fit() method as long as the validation loss (or error rate) continues to go down with each training pass, also known as an epoch. This is indicative of the model learning the task.

In [None]:
# Build the entity-recognition model from the prepared data object.
ner = EntityRecognizer(data)

### Train the Model

In [None]:
# Train for 30 epochs (training passes).
ner.fit(30)

## Validate results 
Now that we have a trained model, let's look at how it performs.

In [None]:
# Display sample results to judge how the trained model performs.
ner.show_results()

## Save and load trained models 
Once you are satisfied with the model, you can save it using the save() method. This creates an Esri Model Definition (EMD) file that can be used for inferencing on new data. Saved models can also be loaded back using the load() method. The load() method takes the path to the EMD file as a required argument.

In [None]:
# Persist the trained model (and its EMD file) under the name 'fire_30epoch'.
ner.save('fire_30epoch')

In [None]:
# Reload the saved model from its EMD file. The file name must match the name
# that was passed to save() ('fire_30epoch').
ner.load(r'./models/fire_30epoch/fire_30epoch.emd')

## Model inference 
Now we can use the trained model to extract entities from new text documents using the extract_entities() method. We just need to pass the path of the folder where the new text documents are located.

##### Colorize the Reports

In [None]:
def color_gen():
    """Return a random colour as a '#'-prefixed lowercase hex string.

    The value is drawn uniformly from the upper half of the 24-bit range
    (16777215 // 2 up to 16777215 inclusive), so the hex part always has
    six digits.
    """
    max_rgb = 16777215  # 0xFFFFFF
    value = random.randint(max_rgb // 2, max_rgb)
    return f'#{value:x}'

# Assign one random colour to every (upper-cased) entity label of the model.
colors = {ent.upper():color_gen() for ent in ner.entities}
# Rendering options: which labels to highlight and the colour used for each.
options = {"ents":[ent.upper() for ent in ner.entities], "colors":colors}

In [None]:
# List the entries of the 'reports' folder.
filenames=os.listdir('reports')

#### Display a Report

In [None]:
# Pick one report at random and read its full text.
filename=f'reports/{random.choice(filenames)}'
with open(filename,'r') as file:
    txt=file.read()

# Run the underlying model on the text (newlines flattened to spaces) and
# render the detected entities inline with the colour options built earlier.
doc1 = ner.model(txt.replace('\n',' '))
spacy.displacy.render(doc1,jupyter=True, style='ent',options=options)

## Extract the Entities from Other Reports

In [None]:
# Run entity extraction over the documents in the 'reports' folder.
results = ner.extract_entities('reports')

## Post process results

#### Data Clean-up

In [None]:
def convert_number(x):
    """Convert a number written in words to a numeric value.

    Args:
        x: Text to parse with ``w2n.word_to_num``. A value that is already
            an ``int`` is returned unchanged, so the clean-up cell can be
            re-run without zeroing an already converted column.

    Returns:
        The parsed number, or 0 when the value cannot be parsed.
    """
    # bool is excluded so True/False keep falling back to 0 as before.
    if isinstance(x, int) and not isinstance(x, bool):
        return x
    try:
        return w2n.word_to_num(x)
    except Exception:
        # Best effort: any parsing failure maps to 0. Catching Exception
        # instead of using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return 0

In [None]:
def convert_date(x):
    """Parse a 'dd/mm/YYYY - HH:MM' string into a ``datetime``.

    Args:
        x: Text to parse. A value that is already a ``datetime`` is returned
            unchanged, so the clean-up cell can be re-run without resetting
            an already converted column.

    Returns:
        The parsed ``datetime``, or 1970-01-01 00:00 when ``x`` is not a
        string or does not match the expected format.
    """
    if isinstance(x, datetime):
        return x
    try:
        return datetime.strptime(x, '%d/%m/%Y - %H:%M')
    except (TypeError, ValueError):
        # TypeError: x is not a string; ValueError: x does not match the
        # format. Both fall back to the epoch placeholder.
        return datetime(1970, 1, 1, 0, 0)

In [None]:
# Turn the spelled-out engine counts into numbers (0 when unparseable).
results['Number_of_Engines'] = results['Number_of_Engines'].apply(convert_number)

# Parse the report timestamps into datetime objects (1970-01-01 when unparseable).
results['Date_and_Time'] = results['Date_and_Time'].apply(convert_date)

#### Display Results

In [None]:
# Show the last rows of the cleaned results.
results.tail()

## Create Feature Layer

#### Geocode and Prepare Data

In [None]:
def geocode_locations(df,  address_col, Region='',Country='',prob=0.8):
    """Geocode the addresses in ``df`` and attach the locations as a column.

    Args:
        df: DataFrame holding the addresses. It is deep-copied, not modified.
        address_col: Name of the column containing the address strings.
        Region: Appended to every address before geocoding.
        Country: Appended to every address after ``Region`` and also passed
            to ``batch_geocode`` as ``source_country``.
        prob: Minimum match confidence as a fraction between 0 and 1. It is
            scaled to the geocoder's 0-100 match score before comparing.

    Returns:
        A copy of ``df`` with a ``geo_codes`` column that holds the matched
        location, or '' where no acceptable match was found.
    """
    processed_df = df.copy(deep=True)
    # Qualify every address with the region and country.
    suffix = f', {Region}, {Country}'
    addresses = [address + suffix for address in processed_df[address_col]]

    chunk_size = 200  # addresses sent per batch_geocode request
    batch = []
    # Stepping through the list never produces an empty trailing chunk, so no
    # request is sent for an empty frame or when the row count is an exact
    # multiple of chunk_size.
    for start in range(0, len(addresses), chunk_size):
        batch.extend(batch_geocode(addresses[start:start + chunk_size],
                                   source_country=Country))

    # The geocoder reports scores on a 0-100 scale while prob is a fraction.
    min_score = prob * 100
    # A match that resolves to just "Region, Country" carries no real address.
    region_only = f'{Region}, {Country}'
    batch_geo_codes = []
    for item in batch:
        if (isinstance(item, dict)
                and item['score'] > min_score
                and item['address'] != region_only):
            batch_geo_codes.append(item['location'])
        else:
            batch_geo_codes.append('')
    processed_df['geo_codes'] = batch_geo_codes
    return processed_df

def prepare_sdf(processed_df):
    """Build a spatially enabled DataFrame from geocoded rows.

    Args:
        processed_df: DataFrame with a ``geo_codes`` column whose values are
            either '' (no location) or a mapping with 'x' and 'y' entries.
            The frame is not modified.

    Returns:
        A new frame containing only the rows that have a location, with
        geometry added by ``pd.DataFrame.spatial.from_df`` and the helper
        columns removed.
    """
    # Keep only rows with a location. Filtering into a new frame replaces the
    # row-by-row in-place drop and the chained .loc assignment, and avoids
    # Series.iteritems(), which newer pandas releases no longer provide.
    has_location = processed_df['geo_codes'].map(
        lambda geo_code: geo_code != '').astype(bool)
    sdf = processed_df.loc[has_location].reset_index(drop=True)

    # "x,y" text that from_df turns into geometry.
    # NOTE(review): from_df(address_column=...) presumably resolves these
    # strings through the geocoder; spatial.from_xy might avoid that extra
    # round trip - confirm before switching.
    sdf['geo_x_y'] = [f"{geo_code.get('x')},{geo_code.get('y')}"
                      for geo_code in sdf['geo_codes']]
    sdf = pd.DataFrame.spatial.from_df(sdf, address_column='geo_x_y')  # adds geometry
    sdf.drop(['geo_x_y', 'geo_codes'], axis=1, inplace=True)  # helper columns
    return sdf
def publish_to_feature(df, gis, layer_title:str, tags:str,  
                       Region:str,Country:str, address_col:str,prob:float=0.8):
    """Geocode the addresses in ``df`` and publish the rows as a feature layer.

    Args:
        df: DataFrame with the extracted entities.
        gis: GIS connection the layer is published to.
        layer_title: Title of the new feature layer.
        tags: Tags attached to the published layer.
        Region: Region appended to each address before geocoding.
        Country: Country appended to each address before geocoding.
        address_col: Name of the column holding the addresses.
        prob: Match-confidence threshold forwarded to ``geocode_locations``.

    Returns:
        The object returned by ``sdf.spatial.to_featurelayer``.
    """
    processed_df = geocode_locations(df, address_col, Region, Country, prob)
    sdf = prepare_sdf(processed_df)
    try:
        layer = sdf.spatial.to_featurelayer(layer_title, gis, tags)
    except Exception:
        # The identical call is retried once, presumably to ride out a
        # transient publishing failure; a second failure propagates to the
        # caller. Catching Exception instead of using a bare except keeps
        # KeyboardInterrupt from triggering the retry.
        layer = sdf.spatial.to_featurelayer(layer_title, gis, tags)
    return layer

#### Publish Data

In [None]:
# This will take a few minutes to run.
# Geocode the extracted addresses and publish the results as a feature layer;
# only matches that clear the prob=0.9 confidence threshold are kept.
fire_report = publish_to_feature(results, gis,  layer_title='Cheshire Fire & Rescue Service Incident Reports Test', 
                                tags='nlp,fire',
                                Region='Cheshire',Country='England',
                                prob=0.9,address_col='Address')

In [None]:
# Show the object returned by the publish step as the cell output.
fire_report

#### Display the Results

In [None]:
# Reconnect and fetch a content item by id so it is shown as the cell output.
# NOTE(review): the item id is hard-coded; presumably it refers to a
# previously published layer - replace it with the id of your own item.
from arcgis.gis import GIS
gis = GIS(profile='your_online_profile')
gis.content.get("40d1ec92432d4828a345000c2641e52c")