##  Imports

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

In [3]:
# setting display options to maximum in order to display all columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 0)
# disable chained assignment warnings
pd.options.mode.chained_assignment = None 

In [4]:
# maximize display width
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Functions

In [5]:
def generate_word_metrics(df, column):
    
    """
    Calculate amount of words, word length and .
    
    Arguments:
    df_column      (obj): Pandas dataframe column as Series
    
    Returns:
    tuple
    
    """
    # Replace all non letter characters with a whitespace
    df['text_clean_'+column] = df[column].str.replace('[^a-zA-Z]', ' ')
    # Change to lower case
    df['text_clean_'+column] = df['text_clean_'+column].str.lower()
    # Print the first 5 rows of the text_clean column
    print(df['text_clean_'+column].head())

    # Find the length of each text
    df['char_cnt_'+column] = df['text_clean_'+column].str.len()
    # Count the number of words in each text
    df['word_cnt_'+column] = df['text_clean_'+column].str.split().str.len()
    # Find the average length of word
    df['avg_word_length_'+column] = df['char_cnt_'+column] / df['word_cnt_'+column]
    # Print the first 5 rows of these columns
    print(df[['text_clean_'+column, 'char_cnt_'+column, 'word_cnt_'+column, 'avg_word_length_'+column]])
    
    return df
    
    

In [None]:
# Assume DataFrame exists as variable df, removal due to privacy

In [8]:
# remove columns with few variables
cols_to_drop = ['index','observer','shift','risk_area','len_descr','control_doc','loss_potential_comments','suggestion','contact_person','comments']

In [9]:
# lets get rid of the cols
df.drop(labels=cols_to_drop, axis = 1, inplace = True)

In [10]:
df.shape

(58836, 27)

## Add word metrics

In [None]:
# perform for description field
df_description_metrics = generate_word_metrics(df, 'description')

In [15]:
df.shape

(58836, 31)

### Adding clustered LDA topics

In [19]:
# dirpath_nlp contains the created LDA categorical features (e.g. 3 for an incident being assigned to topic 3)

In [21]:
nlp_dominant_topics = pd.read_csv(dirpath_nlp+'/topics_sentences_descr_four.csv')

In [22]:
nlp_dominant_topics.shape

(110764, 5)

In [23]:
nlp_dominant_topics.head(1)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,2.0,0.8831,"vessel, lift, oper, crew, team, crane, deck, drill, transfer, technician","['whilst', 'bullbai', 'push', 'push', 'tube', 'central', 'fender', 'loos', 'free', 'main', 'fender', 'hous', 'deckhand', 'inform', 'master', 'push', 'seiz', 'immedi', 'bullbai', 'report', 'damag', 'site', 'manag', 'agre', 'bullbai', 'head', 'ashor', 'damag', 'assess', 'drop', 'equip', 'regina', 'baltica', 'transfer', 'equip', 'regina', 'baltica', 'readi', 'damag', 'fender', 'came', 'complet', 'free', 'hous', 'drop', 'water', 'equip', 'transfer', 'safe', 'bullbai', 'proceed', 'shore', 'fender', 'retriev', 'vessel', 'oshor', 'bullbai']"


In [29]:
# assume another dataframe with more observations, df_110, has been added.
merged_dominant_topics = pd.merge(df_110, nlp_dominant_topics, left_index=True, right_index=True)

In [30]:
merged_dominant_topics.shape

(110764, 41)

In [32]:
# subsetting to only incidents with these three application numbers
incidents = merged_dominant_topics[merged_dominant_topics['application'].isin([20, 21, 22])] 

In [33]:
incidents.shape

(58836, 41)

In [37]:
incidents_word_metrics['text_preprocessed_word_cnt'] = incidents_word_metrics['Text'].str.split().str.len()

In [38]:
cols_to_drop

['index',
 'observer',
 'shift',
 'risk_area',
 'len_descr',
 'control_doc',
 'loss_potential_comments',
 'suggestion',
 'contact_person',
 'comments']

In [41]:
cols_to_drop.pop(0)

'index'

In [42]:
incidents_word_metrics.drop(labels=cols_to_drop, axis =1, inplace=True)

In [43]:
incidents_word_metrics.shape

(58836, 37)

In [45]:
incidents_word_metrics.to_csv('incidents_with_word_metrics.csv', index=False)