# Import Libraries

In [90]:
import ast

import matplotlib.pyplot as plt
import pandas as pd
import spacy


In [91]:
# Read the two input CSV files; both paths are relative to the notebook's working directory.
company_csv_path = '../data/input/ml_insurance_challenge.csv'
taxonomy_csv_path = '../data/input/insurance_taxonomy.csv'

company_data = pd.read_csv(company_csv_path)
taxonomy_data = pd.read_csv(taxonomy_csv_path)

In [92]:
# Number of missing values in each column.
company_data.isna().sum()

description      12
business_tags     0
sector           27
category         27
niche             0
dtype: int64

The dataset contains missing values. There are two ways to handle them:

(a) Remove rows with missing values

(b) Fill missing data with a default value

I drop the rows with missing values because I don't want to fill them in with incorrect information. Only a small number of rows are affected, so we don't lose a large amount of information.

In [93]:
# Drop every row that has at least one missing value.
# Reassigning the result (instead of inplace=True) follows pandas' recommended
# style: the data lineage stays explicit and the call remains chainable.
company_data = company_data.dropna()

# Confirm that no missing values remain in any column.
company_data.isna().sum()

description      0
business_tags    0
sector           0
category         0
niche            0
dtype: int64

Remove noise that does not carry significant information for classification: strip special characters such as punctuation, drop stopwords, and lemmatize the remaining words (e.g. "running", "ran", "runs" -> "run").

In [96]:
# One-time setup, only needed when the model is not installed yet:
# !python -m spacy download en_core_web_sm

# Load the spaCy pipeline that lemmatize_text_spacy uses for tokenization and lemmas.
spacy_model_name = "en_core_web_sm"
nlp = spacy.load(spacy_model_name)

def lemmatize_text_spacy(text: str) -> str:
    """Return `text` as a space-separated string of lemmas with noise tokens removed.

    The text is processed by the module-level spaCy pipeline `nlp`. Stop words,
    punctuation and whitespace-only tokens are discarded; the lemma of every
    remaining token is kept in its original order.

    Args:
        text: The raw text to clean.

    Returns:
        The kept lemmas joined by single spaces ("" when no token is kept).
    """
    doc = nlp(text)
    lemmatized_words = [
        token.lemma_
        for token in doc
        # Whitespace runs (newlines, tabs, repeated spaces) come back as their own
        # tokens and are neither stop words nor punctuation, so they have to be
        # filtered explicitly or they end up inside the cleaned text.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return " ".join(lemmatized_words)


# Store the lemmatized, noise-free version of each description in a new column.
company_data['clean_description'] = company_data['description'].map(lemmatize_text_spacy)

In [103]:
# Characters removed by the fallback path: square brackets and single quotes.
_TAG_NOISE_TABLE = str.maketrans("", "", "[]'")


def _normalize_business_tags(raw_tags):
    """Turn a stringified list of tags into one lowercase, comma-separated string.

    The value is parsed as a Python list literal, so tags that were written with
    double quotes (because they contain an apostrophe) are handled correctly:
    the apostrophe is kept and no stray quote characters remain. If the value
    cannot be parsed as a list/tuple literal, brackets and single quotes are
    stripped character-wise instead.

    Args:
        raw_tags: One cell of the `business_tags` column.

    Returns:
        The tags joined by ", ", lowercased and stripped. Non-string input
        (e.g. NaN) is returned unchanged.
    """
    if not isinstance(raw_tags, str):
        return raw_tags

    try:
        parsed = ast.literal_eval(raw_tags)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a valid Python literal; fall through to character stripping.
        parsed = None

    if isinstance(parsed, (list, tuple)):
        return ", ".join(str(tag) for tag in parsed).lower().strip()

    return raw_tags.translate(_TAG_NOISE_TABLE).lower().strip()


company_data['clean_business_tags'] = company_data['business_tags'].map(_normalize_business_tags)


company_data[['business_tags', 'clean_business_tags']].head()


Unnamed: 0,business_tags,clean_business_tags
0,"['Construction Services', 'Multi-utilities', '...","construction services, multi-utilities, utilit..."
1,"['Wholesale', 'Dual-task Movement Products', '...","wholesale, dual-task movement products, cast i..."
2,"['Living Forms', 'Farm Cafe', 'Fresh Coffee', ...","living forms, farm cafe, fresh coffee, communi..."
3,"['Automotive Body Repair Services', 'Interior ...","automotive body repair services, interior repa..."
4,"['Cultural Activities', 'Accommodation Service...","cultural activities, accommodation services, k..."
