# Clean data and match SBIR recipients to i3 dataset

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load both input tables from disk
sbir_df = pd.read_csv('inputs/new_sbir_awards.csv')
i3_comps = pd.read_csv('outputs/companies_from_merged_data.csv')

# Report how many rows each table holds
print(f'Length of SBIR data: {len(sbir_df)}')
print(f'Length of i3 data: {len(i3_comps)}')

Length of SBIR data: 5877
Length of i3 data: 3102


In [3]:
# Clean startup names
# Remove the columns that the matching below never reads. Both a snake_case
# and a dotted/capitalised spelling of several fields are listed.
sbir_df = sbir_df.drop(columns=[
    # snake_case spellings
    'phase', 'agency_tracking_number', 'contract', 'duns',
    'hubzone_owned', 'women_owned', 'socially_economically_disadvantaged',
    'award_link', 'solicitation_number', 'solicitation_year',
    # dotted / capitalised spellings
    'Solicitation.Topic.Code', 'Solicitation.Number', 'DUNS',
    'HUBZone.Owned', 'Woman.Owned', 'Socially.and.Economically.Disadvantaged',
    'Contract', 'Agency.Tracking.Number', 'Agency', 'Branch',
    'Award.Year', 'Award.Start.Date..Proposal.Award.Date.', 'Solicitation.Year',
    'Award.End.Date..Contract.End.Date.', 'Program', 'Phase', 'Amount',
])

# Inner-join on the firm names as they appear in the file (this runs before
# any name cleaning), keeping awards whose 'firm' equals an i3 'Company'.
sbir_check = pd.merge(sbir_df, i3_comps, how='inner',
                      left_on='firm', right_on='Company')

def removesuffix(s, suf):
    """Return ``s`` with a trailing ``suf`` cut off.

    ``s`` is returned unchanged when ``suf`` is empty or when ``s`` does not
    end with ``suf``. The comparison is a plain ``str.endswith`` check, so it
    is case-sensitive and does not look at word boundaries.
    """
    if not suf or not s.endswith(suf):
        return s
    keep = len(s) - len(suf)
    return s[:keep]

# Suffixes stripped from the end of each firm name. They are tried in list
# order, one pass each, so the repeated entries ('nv', 'plc') get a second
# chance after other suffixes have been removed.
stop_list = [
    'ltd', 'plc', 's a', 'nv', 'inc', 'llc', ' co ', 's e', 'holdings', 'group',
    'corporation', 'gmbh', 'sa', 'management',
    'holding', 'se', 'nv', 'n v', 'ag', 'investors', 'limited', 'il', 'plc', 'spa',
]


def _normalise_firm(raw_name):
    """Lower-case a firm name, trim edge punctuation, then strip stop-list suffixes in order."""
    name = raw_name.lower().strip('. ,:;?!~*-/&')
    # NOTE(review): removesuffix only checks str.endswith, so a name that merely
    # ends in these letters (e.g. '...oil' ends with 'il') is truncated as well.
    # Confirm this matches how the i3 'Company' names were cleaned before changing it.
    for suffix in stop_list:
        name = removesuffix(name, suffix).strip(',. ')
    return name


sbir_df['firm'] = sbir_df['firm'].map(_normalise_firm)

In [4]:
# Match startups and grants
# Left-join the cleaned firm names against the i3 companies; the `_merge`
# indicator column records whether each award found a company.
sbir_df = sbir_df.merge(i3_comps, left_on = 'firm', right_on = 'Company',
                        how = 'left', indicator = True)

# Awards without an i3 match are kept separately (written out for manual checking).
sbir_manual = sbir_df[sbir_df['_merge'] == 'left_only']
sbir_matches = sbir_df[sbir_df['_merge'] == 'both']
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat with
# default arguments stacks the rows the same way (original index labels kept).
# NOTE(review): sbir_check comes from the merge on the un-cleaned names; a name
# that the cleaning leaves unchanged would presumably match in both merges and
# show up twice here - confirm whether de-duplication is wanted.
sbir_matches = pd.concat([sbir_matches, sbir_check])

# Agency code -> investor name used in the output.
agency_dict = {'DOE':'us department of energy', 'DOD':'us department of defense',
               'HHS':'us department of health and human services', 'DHS':'us department of homeland security',
               'NASA':'nasa', 'DOT':'us department of transportation',
               'DOC':'us department of commerce', 'ED':'us department of education', 
               'EPA':'us environmental protection agency', 'NSF':'national science foundation (nsf)',
               'USDA':'us department of agriculture'}
# Agency code -> investor id used in the output ('ED' is mapped to an empty string).
id_dict = {'DOE':8583, 'DOD':8581, 'HHS':8584, 'DHS':8585, 'NASA':5685,
           'DOT':8587,'DOC':8580, 'EPA':8590, 'NSF':5706, 'USDA':8579,'ED':''}

# Rename the SBIR columns to the investment-table names and keep only those.
sbir_matches = sbir_matches.rename(columns={'firm':'company','agency':'investor.firm',
                                           'award_year':'investment.year','award_amount':'investment.amount',
                                           'proposal_award_date':'investment.date'})
# .copy() gives an independent frame so the column assignments below do not
# trigger SettingWithCopyWarning.
sbir_matches = sbir_matches[['company','investor.firm','investment.year',
                             'investment.date','investment.amount']].copy()

# Fail loudly (KeyError, as the per-row dict lookup did) when an agency code
# has no entry in the lookup tables, instead of letting .map() emit NaN.
known_agencies = agency_dict.keys() & id_dict.keys()
unknown_agencies = set(sbir_matches['investor.firm']) - known_agencies
if unknown_agencies:
    raise KeyError('No investor mapping for agency code(s): '
                   + ', '.join(sorted(map(str, unknown_agencies))))

sbir_matches['investor.type.edited'] = 'public or quasi public'
sbir_matches['investment.type'] = 'grant'
sbir_matches['investor.firm.renamed'] = sbir_matches['investor.firm'].map(agency_dict)
sbir_matches['investor.id'] = sbir_matches['investor.firm'].map(id_dict)
sbir_matches['investor.country'] = 'united states'
sbir_matches['investor.country.edited'] = 'united states'

sbir_matches.head()

  sbir_matches = sbir_matches.append(sbir_check)


Unnamed: 0,company,investor.firm,investment.year,investment.date,investment.amount,investor.type.edited,investment.type,investor.firm.renamed,investor.id,investor.country,investor.country.edited
5,motiv power systems,DOE,2013,2013-02-19,149702.0,public or quasi public,grant,us department of energy,8583,united states,united states
36,spectrum magnetics,DOD,2013,2013-02-21,100000.0,public or quasi public,grant,us department of defense,8581,united states,united states
84,kuehnle agrosystems,DOD,2010,2010-09-29,493000.0,public or quasi public,grant,us department of defense,8581,united states,united states
87,mc10,DOD,2013,2013-07-01,79968.0,public or quasi public,grant,us department of defense,8581,united states,united states
89,groundmetrics,DOE,2013,2013-02-19,149995.0,public or quasi public,grant,us department of energy,8583,united states,united states


In [97]:
# Write the unmatched awards (for manual checking) and the matched grants to CSV,
# without the index column.
sbir_manual.to_csv(path_or_buf="outputs/sbir_to_check_manually.csv",
                   encoding="utf-8", index=False)
sbir_matches.to_csv(path_or_buf="outputs/sbir_matches.csv",
                    encoding="utf-8", index=False)