## Feature engineering notebook for sector classification
- Loads ICMS data
- EDA and feature engineering
- Train/Val/Test split
- Datasets stored in S3

#### Load dataset

In [1]:
# input data is simply csv of below query:
    # SELECT * 
    # FROM datalake-curated-production.icms_issuer 
    # where 1=1`

import pandas as pd

bucket='sector-classification-aiml'
data_key = 'input/icms_issuers_data.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

data = pd.read_csv(data_location)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



In [2]:
data.head()

Unnamed: 0,id,subsector_id,crunchbase_url,description,override_valuation_dollars,year_founded,lifecycle_status,exclude_from_data_products,archived_at,total_funding_dollars,...,name,cap_iq_id,updated_at,sharex_id,sector,sub_sector,legal_address,phone_number,domicile_code,distribute_forge_price
0,4993,,https://www.crunchbase.com/organization/accomp...,Accompany Health is a company aiming to combin...,,2022.0,,True,,,...,Accompany Health,,2024-03-26 16:46:18.321,100005186.0,Healthcare,Digital Health,,,,False
1,3158,,https://www.crunchbase.com/organization/tether...,Developer of drugs for the treatment of inflam...,,2002.0,,False,,,...,Tetherex Pharmaceuticals,5292859.0,2024-02-02 22:13:09.925,100001336.0,Healthcare,BioTech & Pharma,,,,False
2,4491,,https://www.crunchbase.com/organization/laser-...,Laser Light Communications is a telecommunicat...,,2012.0,,False,,,...,Laser Light Communications,0.0,2024-02-02 22:12:57.243,100004694.0,Enterprise Software,Cloud/Networking Infrastructure,,,,False
3,1199,,https://www.crunchbase.com/organization/finastra,Finastra is a fintech company focused on build...,,2017.0,,True,,,...,Finastra,,2024-03-07 23:01:22.334,100001924.0,FinTech,Other Fintech,,,,False
4,3209,,https://www.crunchbase.com/organization/tigerg...,"TigerGraph, founded in 2012, is the developer ...",,2012.0,,False,,,...,TigerGraph,0.0,2024-01-18 00:00:00.000,100001713.0,Enterprise Software,Data Intelligence,,,,True


In [3]:
# list of priority 1 issuers
bucket='sector-classification-aiml'
data_key = 'input/top_tier_issuer_list.csv'
data_location = 's3://{}/{}'.format(bucket, data_key)

top_issuers = pd.read_csv(data_location)
top_issuers.head()

Unnamed: 0,issuerKey
0,6sense
1,abra
2,acorns
3,addepar
4,adroll


In [4]:
# merge data with priority 1 issuers
data = data.merge(top_issuers, how='left', left_on=['slug'], right_on=['issuerKey'])
data['top_issuer'] = data['issuerKey'].apply(lambda x: 0 if pd.isnull(x) else 1)

#### Dropping instances where description == NULL

In [5]:
# identify issuers with null description
data_b = data[['name', 'slug', 'description', 'sector', 'sub_sector', 'top_issuer']].copy()
print(data_b.shape)
pd.isnull(data_b).sum()


(4886, 6)


name            1
slug            0
description    96
sector          0
sub_sector      0
top_issuer      0
dtype: int64

In [6]:
# remove instances without description and sector/sub sector information missing
df = data_b[ ~(
            (pd.isnull(data["description"]))
            | (data["sector"] == "Missing")
            | (data["sub_sector"] == "Missing")
        )].reset_index(drop=True).copy()

print(df.shape)
pd.isnull(df).sum()


(4486, 6)


name           0
slug           0
description    0
sector         0
sub_sector     0
top_issuer     0
dtype: int64

#### Dropping instances where duplicate descriptions

In [7]:
desc_group = df.groupby('description')['slug'].count().reset_index().rename(columns={'slug':'descr_count'})\
.sort_values('descr_count', ascending=False)

desc_group.head()

Unnamed: 0,description,descr_count
2512,"Moon Active, is a growing mobile game companie...",4
933,Contineum is a clinical stage biopharmaceutica...,2
1455,Figure AI is a humanoid robotics developer wit...,2
2960,Permutive offers a privacy-centric audience pl...,2
3845,Teamshares is an employee ownership platform t...,2


In [8]:
dfm = df.merge(desc_group, how="left", on="description")

dff = dfm[dfm.descr_count == 1].reset_index(drop=True).copy()
dff.shape

(4454, 7)

#### Relabeling subsector to OTHER if proportion <= 1% to reduce class label size

In [9]:
# # distribution of sector label
dff['sector_label'] = dff.apply(lambda row: f"{row['sector']} | {row['sub_sector']}", axis=1)

sector_value_counts_dict = dff['sector_label'].value_counts(normalize=True).to_dict()

def relabel(sector_label, min_proportion)->str:
    """Relabel based on a minimum proportion. If less than min proportion relabel as [Sector] | Other [Sector]"""
    if sector_value_counts_dict[sector_label] <= min_proportion:
        return f"{sector_label.split('|')[0]} | Other {sector_label.split('|')[0]}"
    else:
        return sector_label
    

dff['sector_relabel'] = dff['sector_label'].apply(lambda x: relabel(x,.01))

#### Encoding categorical labels

In [10]:
# set dataframe for prediction at subsector level
dfs = dff[['description', 'sector_relabel', 'top_issuer']].copy()

dfs['encoded_labels'] = dfs['sector_relabel'].astype('category').cat.codes

data_texts = dfs['description'].to_list()
data_labels = dfs['encoded_labels'].to_list()

dfs.head()

Unnamed: 0,description,sector_relabel,top_issuer,encoded_labels
0,Accompany Health is a company aiming to combin...,Healthcare | Digital Health,0,32
1,Developer of drugs for the treatment of inflam...,Healthcare | BioTech & Pharma,0,31
2,Laser Light Communications is a telecommunicat...,Enterprise Software | Cloud/Networking Infrast...,0,12
3,Finastra is a fintech company focused on build...,FinTech | Other Fintech,0,27
4,"TigerGraph, founded in 2012, is the developer ...",Enterprise Software | Data Intelligence,1,15


#### Splitting and stratifying dataset into train, validation, and test sets

In [11]:
# Stratify dataset when splitting
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(data_texts, data_labels, test_size = 0.2, random_state=23, stratify=data_labels)
train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size = 0.1, random_state=23, stratify=train_labels)

In [12]:
dfi = dfs[dfs['top_issuer']==1].reset_index(drop=True).copy()
top_issuers_texts = dfi['description'].to_list()
top_issuers_labels = dfi['encoded_labels'].to_list()

#### Generating Dataset

In [13]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

d = {'train':Dataset.from_dict({'label':train_labels,'text':train_texts}),
     'val':Dataset.from_dict({'label':val_labels,'text':val_texts}),
     'test':Dataset.from_dict({'label':test_labels,'text':test_texts}),
     'top_issuers':Dataset.from_dict({'label':top_issuers_labels,'text':top_issuers_texts})
     }

sectors = DatasetDict(d)

  from .autonotebook import tqdm as notebook_tqdm


#### Saving Dataset on S3

In [14]:
# saving sector data to S3
import os

s3_root_folder = f"s3://team-orange-datasets"
data_path = os.path.join(s3_root_folder, "subsector-classification")
sectors.save_to_disk(data_path)

                                                                                             