# Example proof of concept for a recommender system based on text similarity between incidents

Adapted after (main credit goes to): https://www.datacamp.com/community/tutorials/recommender-systems-python

## 1. Imports

In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

# progress bar
from tqdm.auto import tqdm  # for notebooks

### necessary downloads before using nltk locally

In [None]:
nltk.download('punkt')

In [None]:
nltk.download('stopwords')

## 2. Functions

In [4]:
def cleaner(text):
    
    """ lowercase and tokenize text, keep only alphabetical cahrs 

    Args: 
        text            (string):      text to clean

    Returns:
        text            (string):      cleaned text

    """
    
    
    text = str(text).lower()
    # adding numbers to regex, possibly information like "BKR01" as location is helpful
    text = re.sub('[^A-Za-z0-9]', ' ', text)
    text = word_tokenize(text)
    text = [token for token in text if token not in stopwords.words('english')]
    
    
    return " ".join(text)

In [5]:
def check_mention_in_document(stri, text):
    
    """Return true if (sub-)string in text

    Args: 
        stri          (string):   string to search for
        text          (string):   text in which is searched

    Returns:
        bool

    """
    
    # check inside list
    if stri in text:
        return True
    else:
        return False

In [6]:
def dataframe_column_to_cosine_sim(df, column):

    """Creates cosine similarity for each document in a dataframe. 

    Args: 
        df              (obj):      DataFrame
        column          (string):   Name of the column

    Returns:
        corpus          (list):     list of documents as strings
        tf_idf_matrix   (obj):      scipy sparse tf-idf matrix
        cosine_sim      (array):    cosine similarity matrix

    """
    start = time.time()
    
    # convert dataframe strings into list of strings
    corpus = df[column].tolist()
    
    # use english stop words
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    
    # generate tf-idf vectors for corpus
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
    
    # compute similarity matrix with pairwise scores, faster version
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

    end = time.time()

    # get total time
    print('Time taken is {} seconds'.format(round(end-start,4)))
    
    
    return corpus, tfidf_matrix, cosine_sim

In [7]:
def get_recommendations(title, cosine_sim, df, column, top_n):

    """Create item recommendations based on cosine similarity

    Args: 
        title           (string):   Item name
        cosine_sim      (array):    cosine similarity matrix
        df              (obj):      DataFrame
        column          (string):   Column name
        top_n           (int):      amount of similar observations

    Returns:
        df              (obj):      DataFrame with top_n recommendations

    """
    
    # check input value for top_n to be smaller than 20
    try:
        assert top_n < 20
    except:
        raise ValueError('Too high value for top_n. Enter less than 20.')
    

    # adjust index for searching
    df.reset_index(inplace=True)

    # Generate mapping between titles and index
    indices = pd.Series(df.index, index=df[column])

    # Get index of item that matches title
    idx = indices[title]

    # Sort the items based on the similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores for n most similar items
    sim_scores = sim_scores[1:top_n]

    # Get the item indices
    item_indices = [i[0] for i in sim_scores]

    # Return the top n most similar items
    return df[column].iloc[item_indices]

## 3. Read in data and check values

In [10]:
df.head(1)

Unnamed: 0,index,case_no,trans,calenday_day,title,description,observer,shift,comments,len_descr,...,actual_hazard,process,product,control_doc,causes_descr,suggestion,audit_type,diagnosis,location_description,product_description
0,0,93038,539254,2015-05-04 09:30:00,BKR01 - Bullbay fender damaged during push on ...,Whilst the bullbay was pushed on to Q05 push o...,0,0,,732,...,0.0,0.0,0.0,0.0,Vessel fender had been altered to suit require...,PC - The section of the fender which was damag...,0.0,2.0,Germany - _Projects/Construction sites - Proje...,-- Not selected --


In [12]:
cols_to_drop = ['trans','index','contact_person','comments','loss_potential_comments','observer','shift','risk_area','control_doc', 'start_date', 'end_date']

In [13]:
df.drop(labels=cols_to_drop, axis = 1, inplace = True)

In [14]:
df.dropna(inplace = True)

In [15]:
df.shape

(40654, 26)

### 3.1 Clean title text column

In [17]:
start = time.time()


df['title_cleaned'] = df['title'].progress_apply(lambda x: cleaner(str(x)))

end = time.time()

# get total time
print('Time taken is {} seconds'.format(round(end-start,4)))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40654.0), HTML(value='')))


Time taken is 80.4146 seconds


### 3.2 Investigate amount of mentions

In [18]:
df['title_push'] = df['title_cleaned'].apply(lambda x: check_mention_in_document('push', x))

In [19]:
print('amount of mentions is {} which is {} per cent'.format(df['title_push'].sum(), round(df['title_push'].sum()/len(df),4)))

amount of mentions is 149 which is 0.0037 per cent


In [20]:
title_values = ['vessel', 'water','wave']

In [21]:
for value in title_values:
    df['title_'+value] = df['title_cleaned'].apply(lambda x: check_mention_in_document(value, x))

In [22]:
for value in title_values:
    print(value+':', df['title_'+value].sum())

vessel: 1623
water: 897
wave: 87


## Example recommendations based on title

In [23]:
# choose column to index
column = 'title_cleaned'

In [24]:
# prepare text, store returned variables
title_corpus, title_tfidf_matrix, title_cosine_sim = dataframe_column_to_cosine_sim(df, column)

Time taken is 5.1998 seconds


In [25]:
# print cleaned title for first case
test_title = df.iloc[0,-5]

print(test_title)

bkr01 bullbay fender damaged push operations location q05


In [26]:
print(get_recommendations(test_title, title_cosine_sim,
                          df, column, 10))

21314                                          push system
15302                               seized door handle q05
15589                                   damaged push tubes
35324                             crane damaged operations
26395    bkr01 crew transfer vessel largo push fender s...
171        damage vessel fender boat landing push tube a01
13744            bkr01 beaumaris bay fender poor condition
14162                        c wind artimus damaged fender
5677                                           push lights
Name: title_cleaned, dtype: object
