# 02 Import and cleanse deals (funding) data

## Import modules & settings

In [17]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline

## Import raw data

In [18]:
# List the raw-data directory to see which workbook files are available to load
!ls ../../data/raw/

Datasheet_Cleansed.xlsx Datasheet_Raw.xlsx


In [19]:
# Read the 'Funding_Startup' sheet of the cleansed workbook into a DataFrame.
file_path = '../../data/raw/Datasheet_Cleansed.xlsx'
df = pd.read_excel(io=file_path, sheet_name='Funding_Startup')

# Preview the first five rows to sanity-check the load.
df.head(5)

Unnamed: 0,Deal ID,Deal Date,Deal Y,Firm,Unique ID Lookup,Round Total ($),Deal Type,Country,Head office,Year founded,CodeX Category
0,DEAL0001,2000-03-22,2000,LexisNexis,FIRM0002,30000000.0,Venture Capital,USA,New York,1973,Legal Research
1,DEAL0002,2002-10-03,2002,Workshare,FIRM0027,4500000.0,Series A,UK,London,1999,Legal Analytics
2,DEAL0003,2003-01-01,2003,DocuSign,FIRM0045,,Seed Fund,USA,San Francisco,2003,Legal Practice Management
3,DEAL0004,2003-10-01,2003,XMLAW,FIRM0051,150000.0,Seed Fund,USA,Boston,2003,Legal Practice Management
4,DEAL0005,2004-01-01,2004,Gust,FIRM0053,,Series A,USA,New York,2004,Legal Marketplace


## Clean up the column headers so they are easier to access

In [20]:
# Show the current headers in lower case as a starting point for renaming.
list(map(str.lower, df.columns))

['deal id',
 'deal date',
 'deal y',
 'firm',
 'unique id lookup',
 'round total ($)',
 'deal type',
 'country',
 'head office',
 'year founded',
 'codex category']

In [21]:
# Snake_case replacements for the sheet headers. Assignment below is positional,
# so the order here must match the order of the original columns (shown beside
# each new name).
cols = [
    'deal_id',         # Deal ID  (was 'deal_ id' -- stray space removed)
    'deal_date',       # Deal Date
    'deal_year',       # Deal Y
    'firm_name',       # Firm
    'firm_id',         # Unique ID Lookup
    'round_total',     # Round Total ($)
    'deal_type',       # Deal Type
    'country',         # Country
    'city',            # Head office
    'year_founded',    # Year founded
    'codex_category',  # CodeX Category
]
df.columns = cols

In [22]:
# Dimensions of the frame as (rows, columns); missing values are counted separately below.
df.shape

(541, 11)

**Note:** We have 541 rows.  What does each row represent?  An instance of a legal tech firm receiving funding.  <br>
<br>
How much missing data do we have?

In [23]:
# Count the deals with no funding amount and relate that to the total row count.
total_rows = len(df)
missing_rows = df['round_total'].isna().sum()
# Note: despite its name, `percent` holds a fraction between 0 and 1.
percent = missing_rows / total_rows

print(f"{missing_rows} of the {total_rows} rows are missing funding data. This represents {percent:.2} of the data.")


122 of the 541 rows are missing funding data. This represents 0.23 of the data.


## Drop the rows with nulls for initial analysis

In [24]:
# Record the shape, drop the null rows once, then record the shape again.
# dropna() with no `subset` removes every row that has a null in ANY column,
# not only the rows missing `round_total`.
before_drop = df.shape
df = df.dropna()
after_drop = df.shape
print(f"There are {before_drop[0]} rows before dropping, and {after_drop[0]} after.")

There are 541 rows before dropping, and 419 after.


## Quick feature engineering: firm age at the time of funding (`firm_age`)

In [25]:
# Firm age at the time of the deal, in years. Some deals are dated before the
# recorded founding year, which gives a negative age; floor those at 0.
df['firm_age'] = (df['deal_year'] - df['year_founded']).clip(lower=0)

## Export the cleansed file

In [26]:
# List the interim-data directory to see which files already exist there
! ls ../../data/interim

datasheet_1.csv      firms.csv            investors.csv
deals.csv            funding_investor.csv


In [27]:
# Write the cleansed deals table to the interim folder.
# The DataFrame index is written as the first (unnamed) column of the CSV.
file_path = '../../data/interim/deals.csv'
df.to_csv(path_or_buf=file_path)