# Crunchbase Raw Data Cleaning Pipeline (US, EU, China)

In [1]:
import pandas as pd

In [29]:
# manual data input
country = ['US', 'EU', 'Germany', 'China']  # 'France' and 'Latvia' are currently left out

# inclusive row ranges used in the batch file names: 1-50, 51-100, ..., 951-1000
last_n = [50 * batch for batch in range(1, 21)]
first_n = [end - 49 for end in last_n]

## step 1: converting txt to csv

In [38]:
def cb_txt_to_csv(filename, i, raw_dir='../data/crunchbase-raw',
                  csv_dir='../data/crunchbase-csv'):
    """Convert one raw Crunchbase text dump into a comma-separated file.

    Reads ``{raw_dir}/{filename}.txt`` line by line and writes
    ``{csv_dir}/{filename}.csv``:

    * a line that is exactly ``"<count>."`` (``count`` starts at ``i`` and
      grows by one on every match) starts a new output row;
    * lines containing ``'Logo'`` and blank lines are dropped;
    * every other line becomes one cell followed by ``','``; commas inside
      the cell are replaced with ``';'`` so they do not split the cell.

    Every output row therefore ends with a trailing comma, and whatever
    precedes the first index line ends up on the first output line
    (presumably the column header -- verify against the raw files).

    Parameters
    ----------
    filename : str
        Path of the file relative to ``raw_dir`` / ``csv_dir``, without
        extension (may contain a sub-directory, e.g. ``'US/US_1-50'``).
    i : int
        Index of the first row expected in this file.
    raw_dir : str, optional
        Directory holding the raw ``.txt`` files.
    csv_dir : str, optional
        Directory receiving the ``.csv`` files; missing sub-directories
        are created.

    Raises
    ------
    FileNotFoundError
        If the raw text file does not exist.
    """
    # local import so the function is self-contained within its notebook cell
    from pathlib import Path

    # collect output fragments and join once instead of repeated `text +=`
    pieces = []

    # index of the next row marker we expect to see
    count = i

    with open(f'{raw_dir}/{filename}.txt', 'r') as f:
        for line in f:

            # row marker reached (the last line of a file may lack '\n'):
            # switch to the next output line
            if line in (f'{count}.\n', f'{count}.'):
                count += 1
                pieces.append('\n')

            # logo placeholders (company, investors, ...) and empty lines
            # carry no data
            elif 'Logo' in line or line == '\n':
                continue

            # data cell on the current row; commas are replaced so that the
            # cell is not split (this also affects sentences, dates and
            # money values)
            else:
                pieces.append(line.strip('\n').replace(',', ';'))
                pieces.append(',')

    out_path = Path(f'{csv_dir}/{filename}.csv')
    # make sure e.g. the per-region sub-directory exists before writing
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, 'w') as f:
        f.write(''.join(pieces))

In [4]:
# note
# replacing every comma with ';' is not the best method:
# it worked well for cells that hold a list of values in a single column,
# but sentences, dates, and money values also had their commas replaced,
# so the data might need to be cleaned again before it is used

### run script

In [43]:
# convert every raw batch file of every region; the batch's first row
# number doubles as the starting index for the converter
for region in country:
    for start, end in zip(first_n, last_n):
        filename = f'{region}_{start}-{end}'
        cb_txt_to_csv(f'{region}/{filename}', start)

In [44]:
# check: no output means every batch file could be read and has the expected shape
for region in country:
    for start, end in zip(first_n, last_n):
        filename = f'{region}_{start}-{end}'
        try:
            df = pd.read_csv(f'../data/crunchbase-csv/{region}/{filename}.csv')
        except Exception as err:
            # report missing / unparsable files and keep checking the rest;
            # catching Exception (not a bare except) keeps Ctrl-C working
            print(f'{filename}: {type(err).__name__}: {err}')
            continue

        # expected format: 50 rows and either 108 or 106 columns;
        # the row count is verified for both accepted column counts
        n_rows, n_cols = df.shape
        if n_rows != 50 or n_cols not in (106, 108):
            print(f'{filename}: {df.shape}')

## step 2: aggregate data

In [7]:
# #seed --> series A, B, C, D --> exit (either IPO or acquisition)
# df['Industry Groups']

In [52]:
# stack the per-batch CSVs of each region into a single file per region
for region in country:
    # read every batch first and concatenate once: calling pd.concat inside
    # the loop would re-copy the accumulated frame on every iteration
    batches = [
        pd.read_csv(f'../data/crunchbase-csv/{region}/{region}_{start}-{end}.csv')
        for start, end in zip(first_n, last_n)
    ]
    # ignore_index gives one continuous row index; columns missing from
    # some batches are filled with NaN (outer join is the concat default)
    df_1000 = pd.concat(batches, ignore_index=True)
    df_1000.to_csv(f'../data/crunchbase-aggregated/{region}1000.csv', index=False)

In [54]:
# sanity check: print (rows, columns) of each aggregated regional file
for region in country:
    aggregated_path = f'../data/crunchbase-aggregated/{region}1000.csv'
    df = pd.read_csv(aggregated_path)
    print(f'{region}: {df.shape}')

US: (1000, 108)
EU: (1000, 108)
Germany: (1000, 109)
China: (1000, 108)
