# 01 Import and cleanse firms data

## Import modules

In [1]:
import pandas as pd
import numpy as np

In [2]:
# List the raw-data directory to confirm the name of the input workbook
! ls ../../data/raw/

Datasheet_Cleansed.xlsx Datasheet_Raw.xlsx


## Import raw data

In [3]:
path = '../../data/raw/Datasheet_Raw.xlsx'
# Read the first sheet of the workbook, skipping its first two rows
df = pd.read_excel(io=path, sheet_name=0, skiprows=2)

## Quick peek at the raw file

In [4]:
# Show the first two rows of the freshly loaded sheet
df.head(n=2)

Unnamed: 0.1,Unnamed: 0,Firm ID,Firm,CodeX Category,Country,Head office,Founded,Market,Primary Problem,Secondary Problem,Secondary Problem.1,Analytics,AI,Point vs Integrated,PvI Rating (if needed),Company URL,Other reference link (if applicable),Comments,Unnamed: 18
0,,FIRM0002,LexisNexis,Legal Research,USA,New York,1973,Corporate,Legal Research,Other Practice Tech,,Yes,No,Integrated Solution,,https://www.lexisnexis.com/en-us/gateway.page,,LexisNexis started out as a legal research com...,
1,,FIRM0003,Korbitec,Legal Document Automation,South Africa,Cape Town,1976,Corporate,Contracts,Other Practice Tech,,Yes,No,Integrated Solution,3.0,https://www.lexisnexis.co.za/,https://www.linkedin.com/company/korbitec/?ori...,Korbitec was acquired by LexisNexis.,


## Drop trash columns

In [5]:
# Remove the empty spreadsheet-artifact columns and the unused PvI rating
df = df.drop(
    columns=[
        'Unnamed: 0',
        'PvI Rating (if needed)',
        'Unnamed: 18',
    ]
)

## Rename columns

In [6]:
# New snake_case labels, assigned positionally: the order must match the
# remaining spreadsheet columns exactly.
cols = [
    'firm_id',
    'firm_name',
    'codex_category',
    'country',
    'city',
    'year_founded',
    'market',
    'problem_1',
    'problem_2',
    'problem_3',
    'analytics',
    'ai',
    'point_integrated',
    'company_url',
    'other_link',
    # NOTE(review): this label carries a trailing space; a later cell selects
    # the column as 'comments ', so renaming it must be done in both places.
    'comments ',
]
df.columns = cols

### Convert Analytics and AI to boolean, 1 and 0

In [7]:
# 1 where the cell is exactly 'Yes', 0 for anything else (including missing)
df['analytics'] = df['analytics'].eq('Yes').astype('int64')

In [8]:
# 1 where the cell is exactly 'Yes', 0 for anything else (including missing)
df['ai'] = df['ai'].eq('Yes').astype('int64')

## Some rows have the placeholder text ``Unclear - SEE NOTES`` in the problem columns. Inspect them, then drop those rows

In [9]:
# Rows whose primary problem is the 'Unclear - SEE NOTES' placeholder
df.loc[df['problem_1'].eq('Unclear - SEE NOTES')]

Unnamed: 0,firm_id,firm_name,codex_category,country,city,year_founded,market,problem_1,problem_2,problem_3,analytics,ai,point_integrated,company_url,other_link,comments
24,FIRM0061,IntellinX,Legal Practice Management,Israel,Tel Aviv,2005,Corporate,Unclear - SEE NOTES,,,1,0,Point Solution,https://www.bottomline.com/us/solutions/cyber-...,,Track end-user activity in internal business a...
34,FIRM0074,CellBreaker,E-Discovery,USA,North Carolina,2007,Consumer,Unclear - SEE NOTES,,,0,0,Point Solution,https://www.cellbreaker.com/,,Switch cell phone carriers without paying the ...
58,FIRM0148,Virtual Viewbox,Legal Practice Management,USA,San Antonio,2010,Corporate,Unclear - SEE NOTES,,,0,0,,https://www.virtualviewbox.com/home.html,,Virtual Viewbox looks more meditech than legal...
189,FIRM0511,Reduse,Legal Practice Management,UK,Cambridge,2014,Corporate,Unclear - SEE NOTES,,,0,0,,https://www.linkedin.com/company/reduse/,,"Reduse remove print from paper using laser, re..."


In [10]:
# Company websites of the firms whose primary problem is unclear
df.loc[df['problem_1'].eq('Unclear - SEE NOTES'), 'company_url'].tolist()

['https://www.bottomline.com/us/solutions/cyber-fraud-and-risk-management/',
 'https://www.cellbreaker.com/',
 'https://www.virtualviewbox.com/home.html',
 'https://www.linkedin.com/company/reduse/']

In [11]:
# Free-text comments for the same firms (the column label has a trailing space)
df.loc[df['problem_1'].eq('Unclear - SEE NOTES'), 'comments '].tolist()

['Track end-user activity in internal business applications for forensic audit of autorhised user activities',
 'Switch cell phone carriers without paying the early termination fee',
 'Virtual Viewbox looks more meditech than legaltech ',
 'Reduse remove print from paper using laser, recyclying paper for multiple use']

**Note:** After looking at the websites of these 4 businesses, I think it's safe to just drop them because they aren't really related to legal tech.

In [12]:
# (rows, columns) before the unclear rows are dropped
df.shape

(339, 16)

In [13]:
# Drop rows whose primary problem is the "unclear" placeholder.
# .copy() makes the result an independent frame, so later assignments into it
# do not raise SettingWithCopyWarning.
df = df[df.problem_1 != 'Unclear - SEE NOTES'].copy()
df.shape

(335, 16)

In [14]:
# Look for the same placeholder in the secondary problem column
df.loc[df['problem_2'].eq('Unclear - SEE NOTES')]

Unnamed: 0,firm_id,firm_name,codex_category,country,city,year_founded,market,problem_1,problem_2,problem_3,analytics,ai,point_integrated,company_url,other_link,comments
18,FIRM0053,Gust,Legal Marketplace,USA,New York,2004,Corporate,DealTech,Unclear - SEE NOTES,,0,0,Integrated Solution,https://gust.com/,,Gust connects startups to angel investors thro...
138,FIRM0365,Loudr,Legal Document Automation,USA,San Francisco,2013,Corporate,Contracts,Unclear - SEE NOTES,,1,0,Integrated Solution,https://loudr.fm/,,Acquired by Spotify. Loudr builds products and...


In [15]:
# problem_1 is populated for these rows, so keep them and only blank out
# problem_2. np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df.loc[df.problem_2 == 'Unclear - SEE NOTES', 'problem_2'] = np.nan

## Create map for problems - This will be used to shorten the problem text

In [16]:
# Pool the three problem columns into one Series (problem_1 values first, then
# problem_2, then problem_3), discard missing entries, and show the distinct labels
problems = pd.concat(
    [df.problem_1, df.problem_2, df.problem_3],
    ignore_index=True,
).dropna()
set(problems)

{'Contracts',
 'DealTech',
 'KM',
 'Legal Education',
 'Legal Ops',
 'Legal Research',
 'LitTech ',
 'Other Practice Tech',
 'Peak Load / Staffing',
 'RegTech',
 'Workflow & PM',
 'eDiscovery'}

In [17]:
# Abbreviation for each problem label found in the data.
# Keys must match the raw labels exactly, including the trailing space in 'LitTech '.
problem_map = {
    'Contracts':            'C',
    'DealTech':             'D',
    'KM':                   'K',
    'Legal Education':      'LE',
    'Legal Ops':            'LO',
    'Legal Research':       'LR',
    'LitTech ':             'LT',
    'Other Practice Tech':  'OPT',
    'Peak Load / Staffing': 'PLS',
    'RegTech':              'RT',
    'Workflow & PM':        'W',
    'eDiscovery':           'E',
}

## Apply map to shorten problems (commented this out for now)

In [18]:
# df.problem_1 = df.problem_1.map(problem_map)
# df.problem_2 = df.problem_2.map(problem_map)
# df.problem_3 = df.problem_3.map(problem_map)

## Replace NaN values with "NONE" string

In [19]:
# Fill missing problems with the literal string 'NONE'.
# The frame is rebound rather than calling fillna(inplace=True) on each column:
# that chained in-place form warns in recent pandas and leaves the frame
# unchanged when Copy-on-Write is enabled.
df = df.fillna({
    'problem_1': 'NONE',
    'problem_2': 'NONE',
    'problem_3': 'NONE',
})

## One row spells the category "e-Discovery" instead of "E-Discovery".  Fix it here

In [20]:
# Frequency of each CodeX category, most common first
df['codex_category'].value_counts()

Legal Document Automation    85
Legal Practice Management    65
Legal Compliance             53
Legal Marketplace            46
Legal Analytics              30
Legal Research               27
E-Discovery                  19
Online Dispute Resolution     8
e-Discovery                   1
Legal Education               1
Name: codex_category, dtype: int64

In [21]:
# Fold the lone lower-case 'e-Discovery' spelling into 'E-Discovery'
df.loc[df['codex_category'].eq('e-Discovery'), 'codex_category'] = 'E-Discovery'

## Export the cleansed file

In [22]:
# List the interim-data directory where the cleansed file will be written
! ls ../../data/interim

datasheet_1.csv      firms.csv            investors.csv
deals.csv            funding_investor.csv


In [23]:
file_path = '../../data/interim/firms.csv'
# index=True is the to_csv default, spelled out here: the row index is written
# as the first, unnamed column of the CSV.
df.to_csv(file_path, index=True)

In [24]:
# Preview the cleansed frame (head() shows five rows by default)
df.head()

Unnamed: 0,firm_id,firm_name,codex_category,country,city,year_founded,market,problem_1,problem_2,problem_3,analytics,ai,point_integrated,company_url,other_link,comments
0,FIRM0002,LexisNexis,Legal Research,USA,New York,1973,Corporate,Legal Research,Other Practice Tech,NONE,1,0,Integrated Solution,https://www.lexisnexis.com/en-us/gateway.page,,LexisNexis started out as a legal research com...
1,FIRM0003,Korbitec,Legal Document Automation,South Africa,Cape Town,1976,Corporate,Contracts,Other Practice Tech,NONE,1,0,Integrated Solution,https://www.lexisnexis.co.za/,https://www.linkedin.com/company/korbitec/?ori...,Korbitec was acquired by LexisNexis.
2,FIRM0014,Microsystems,Legal Document Automation,USA,Illinois,1995,Corporate,Contracts,KM,LitTech,1,0,Integrated Solution,https://www.litera.com/products/legal/,https://www.linkedin.com/company/microsystems/,Microsoftsystems became Litera Microsystems in...
3,FIRM0015,WorkProducts,Legal Practice Management,USA,Virginia,1995,Corporate,eDiscovery,Other Practice Tech,NONE,0,0,Point Solution,http://www.workproducts.com/#,https://www.businesswire.com/news/home/2009072...,
4,FIRM0021,LiveOffice,Legal Practice Management,USA,California,1998,Corporate,KM,eDiscovery,NONE,0,0,Point Solution,https://www.crunchbase.com/organization/liveof...,https://www.computerworld.com/article/2501427/...,Cloud-based data archiving and storage
