# Import Libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import spacy
import re


# Load the Data

In [15]:
# Read the two input CSV files into DataFrames.
taxonomy_data = pd.read_csv(filepath_or_buffer='../data/input/insurance_taxonomy.csv')
company_data = pd.read_csv(filepath_or_buffer='../data/input/ml_insurance_challenge.csv')

# Clean Data

In [16]:
# Count the missing values in each column.
company_data.isna().sum()

description      12
business_tags     0
sector           27
category         27
niche             0
dtype: int64

The data contains missing values, so there are two options:

(a) Remove rows with missing values

(b) Fill missing data with a default value

I remove the rows with missing values because I don't want to fill them in with wrong information (only a small number of rows are affected, so we don't lose a large amount of information).

In [4]:
# Keep only the rows where every column is populated, then confirm that
# no missing values remain.
company_data = company_data.dropna()
company_data.isna().sum()

description      0
business_tags    0
sector           0
category         0
niche            0
dtype: int64

Eliminate noise that does not bring significant information for classification: remove special characters and stopwords, and apply lemmatization (e.g. "running", "ran", "runs" -> "run").

In [5]:
# One-time setup: install the English model if it is not available yet.
# !python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

def lemmatize_text_spacy(text):
    """Return `text` as a space-separated string of lemmas.

    Stop words, punctuation and whitespace-only tokens are dropped.

    Parameters
    ----------
    text : str
        Raw text to process with the loaded spaCy pipeline.

    Returns
    -------
    str
        The lemmas of the remaining tokens joined by single spaces.
    """
    doc = nlp(text)
    lemmatized_words = [
        token.lemma_
        for token in doc
        # spaCy emits separate tokens for runs of extra whitespace (double
        # spaces, newlines); they are neither stop words nor punctuation, so
        # they must be filtered explicitly or they leak into the output.
        if not (token.is_stop or token.is_punct or token.is_space)
    ]

    return " ".join(lemmatized_words)


company_data['description'] = company_data['description'].apply(lemmatize_text_spacy)

In [6]:
# Clean the tags: remove square brackets and single quotes, lower-case,
# trim outer whitespace, and finally remove the commas.
tags = company_data['business_tags'].str.replace(r"[\[\]']", "", regex=True)
tags = tags.str.lower().str.strip()
company_data['business_tags'] = tags.str.replace(",", "")

# The label columns only need lower-casing and trimming.
for label_column in ('sector', 'category', 'niche'):
    company_data[label_column] = company_data[label_column].str.lower().str.strip()

In [7]:
# Preview the first five rows after cleaning.
company_data.head(n=5)

Unnamed: 0,description,business_tags,sector,category,niche
0,welchcivil civil engineering construction comp...,construction services multi-utilities utility ...,services,civil engineering services,other heavy and civil engineering construction
1,Kyoto Vegetable Specialists Uekamo know Iwa ma...,wholesale dual-task movement products cast iro...,manufacturing,fruit & vegetable - markets & stores,"frozen fruit, juice, and vegetable manufacturing"
2,Loidholdhof Integrative Hofgemeinschaft compan...,living forms farm cafe fresh coffee community ...,manufacturing,farms & agriculture production,all other miscellaneous crop farming
3,PATAGONIA Chapa Y Pintura auto body shop locat...,automotive body repair services interior repai...,services,auto body shops,"automotive body, paint, and interior repair an..."
4,Stanica WODNA PTTK Swornegacie cultural establ...,cultural activities accommodation services kay...,services,boat tours & cruises,"scenic and sightseeing transportation, water"


In [8]:
# Join sector, category and niche into one helper column, then build the
# combined text field from the description, the tags and that helper.
company_data['extra_info'] = company_data['sector'].str.cat(
    [company_data['category'], company_data['niche']], sep=" "
)
company_data['full_text'] = company_data['description'].str.cat(
    [company_data['business_tags'], company_data['extra_info']], sep=" "
)

# Additional preprocessing:

In [9]:
# Strip every character that is neither a word character nor whitespace.
company_data['full_text'] = company_data['full_text'].map(
    lambda text: re.sub(pattern=r'[^\w\s]', repl='', string=text)
)

In [10]:
# Inspect the combined text of the first row.
company_data['full_text'].iloc[0]

'welchcivil civil engineering construction company specialize design build utility network connection UK offer multi utility solution combine electricity gas water fibre optic installation single contract design engineer team capable design electricity water gas network exist network connection point meter location development project management reinforcement diversion provide custom connection solution account exist asset maximize usage trench meet project deadline welchcivil considerable expertise instal gas electricity connection variety market category include residential commercial industrial project construction services multiutilities utility network connections design and construction water connection installation multiutility connections fiber optic installation services civil engineering services other heavy and civil engineering construction'

# Final Data:

In [11]:
# Discard the source columns and the helper column now that their content
# has been merged into `full_text`.
company_data = company_data.drop(
    columns=['description', 'business_tags', 'sector', 'category', 'niche', 'extra_info']
)

In [14]:
# Write the cleaned frame to disk without the row index.
company_data.to_csv("../data/output/data_clean.csv", index=False)