# Import relevant libraries

In [1]:
import os
import pandas as pd
import numpy as np

# Setup

In [2]:
# Turn off pandas' chained-assignment warning; this notebook treats the
# warnings raised by its column assignments on filtered frames as false positives
pd.set_option('mode.chained_assignment', None)

In [3]:
# All dataset paths are resolved relative to a folder two levels above
# this notebook
base_path = '../..'
file_path = os.path.join(base_path, 'datasets', 'final_cleaned_data.csv')

# Read the cleaned company data into the working frame
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Remove trailing whitespace from the sector codes; the cells below compare
# them to the code tables by exact string equality
df = df.assign(sector_code_1=df['sector_code_1'].str.rstrip())

# Preprocessing

In [5]:
# Load the three industry code tables; every column is read as text so
# the codes stay strings
fin, info, tech = (
    pd.read_csv(os.path.join(base_path, 'datasets', code_file), dtype='str')
    for code_file in (
        'Industry_codes - Finance.csv',
        'Industry_codes - Info - Communcation Tech.csv',
        'Industry_codes - Professional, scientific, technical.csv',
    )
)

In [6]:
# Convert each code table into a NumPy array holding all of its codes
fin, info, tech = map(np.array, (fin, info, tech))

In [7]:
# Label every company with a broad industry derived from its primary sector
# code. The lookups are vectorised (`isin` + `np.select`) instead of looping
# over rows and running `value in <ndarray>`, which rescans each code table
# for every single row.
#
# The order of `code_tables` is the matching priority: a code present in
# several tables gets the first label, exactly like the former if/elif chain.
code_tables = [
    ('Finance', fin),
    ('Sci & Tech Rsrch.', tech),
    # NOTE(review): 'informatin Tech' looks like a typo for 'Information Tech';
    # it is kept unchanged because the label is written to the saved CSV and
    # other consumers may match on it - confirm before renaming.
    ('informatin Tech', info),
]

sector_codes = df['sector_code_1']

# Flatten each table to a 1-D list of codes. Missing cells are dropped so a
# missing sector code never counts as a match (a NaN never compared equal in
# the element-wise `in` test either).
conditions = [
    sector_codes.isin(pd.Series(np.ravel(table), dtype='object').dropna()).to_numpy()
    for _, table in code_tables
]
labels = [label for label, _ in code_tables]

# Rows that match none of the tables get the sentinel 'NA'
result = np.select(conditions, labels, default='NA')

df['Industry'] = result

In [8]:
# Keep only rows that were matched to an industry (label other than 'NA')
# and whose company status is 'Active'
has_industry = df['Industry'] != 'NA'
is_active = df['CompanyStatus'] == 'Active'
df = df[has_industry & is_active]

In [9]:
# Constant 1 per row, so that summing 'count' within a group gives the
# number of companies in that group
df = df.assign(count=1)

In [10]:
# Only keep companies whose primary sector code is in this hand-picked list
k = [
    '58210', '58290', '61200', '61300', '61900', '62011', '62012', '62020',
    '62030', '62090', '63110', '63120', '64303', '64304', '66300', '71200',
    '72110', '72190', '72200', '68320'
]

# Select the sectors and discard the stray sector name ' Ready' with a single
# boolean mask. Filtering by mask (rather than dropping by index label) removes
# exactly the offending rows even if index labels were ever duplicated, and the
# explicit .copy() makes df1 an independent frame so the column assignments in
# the following cells never write into a slice of `df`.
in_scope = df['sector_code_1'].isin(k) & (df['sector_name_1'] != ' Ready')
df1 = df.loc[in_scope].copy()

In [11]:
# Replace the verbose sector names with shorter display labels.
# The mapping is applied in insertion order as literal (non-regex) substring
# replacements; `.str.replace` leaves missing values untouched instead of
# raising the way a per-row `x.replace(...)` would.
sector_name_replacements = {
    ' Activities of open': ' Open-ended Investors',
    ' Information technology consultancy activities': ' Info Tech Consulting',
    ' Data processing, hosting and related activities': ' Data Companies',
    ' Other software publishing': ' Software Dev.',
    ' Business and domestic software development': ' Business Software Dev.',
    ' Other information technology service activities': ' Info Tech services',
    ' Computer facilities management activities': ' Computer facilities mgmt.',
    ' Research and experimental development on social sciences and humanities': ' Social Sciences Rsrch.',
    ' Activities of venture and development capital companies': ' Venture Capital',
    ' Other research and experimental development on natural sciences and engineering': ' Natural Sciences Rsrch.',
    ' Wireless telecommunications activities': ' Wireless Telecommuncations',
    # NOTE(review): 'Other telecommunications' is folded into the 'Wireless'
    # label, and 'Telecommuncations' is misspelled; both are kept as-is because
    # the labels end up in the saved CSV - confirm whether this is intended.
    ' Other telecommunications activities': ' Wireless Telecommuncations',
    ' Research and experimental development on biotechnology': ' Biotech Rsrch.',
    ' Satellite telecommunications activities': ' Satellite Telecommuncations',
    ' Publishing of computer games': ' Computer Game Companies',
}

for long_name, short_name in sector_name_replacements.items():
    df1['sector_name_1'] = df1['sector_name_1'].str.replace(
        long_name, short_name, regex=False)


In [12]:
# Show the column dtypes, non-null counts and memory usage of the filtered frame
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10729 entries, 19 to 94974
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   IncorporationDate  10729 non-null  object
 1   year               10729 non-null  int64 
 2   month              10729 non-null  int64 
 3   CompanyName        10729 non-null  object
 4   CompanyNumber      10729 non-null  object
 5   CompanyStatus      10729 non-null  object
 6   sector_code_1      10729 non-null  object
 7   sector_name_1      10729 non-null  object
 8   Industry           10729 non-null  object
 9   count              10729 non-null  int64 
dtypes: int64(3), object(7)
memory usage: 922.0+ KB


In [13]:
# function takes cumsum up to a specific set year and month


def year_month_counts(year, month, data=None):
    """Sum 'count' per sector over all rows dated on or before a given month.

    Parameters
    ----------
    year : int
        Cut-off year (inclusive).
    month : int
        Cut-off month within ``year`` (inclusive).
    data : pandas.DataFrame, optional
        Frame with 'year', 'month', 'sector_name_1' and 'count' columns.
        Defaults to the notebook-level ``df1``.

    Returns
    -------
    pandas.DataFrame
        One row per sector with the columns 'sector_name_1' and 'count',
        ordered by sector name.
    """
    if data is None:
        data = df1

    # Select rows by comparing (year, month) against the cut-off directly.
    # This yields the same rows as sorting and slicing up to the last row of
    # the cut-off month, but also works when that month has no rows at all
    # (the positional lookup would raise IndexError) and leaves `data` unmodified.
    on_or_before = (data['year'] < year) | (
        (data['year'] == year) & (data['month'] <= month))

    return (
        data.loc[on_or_before]
        .groupby('sector_name_1')['count']
        .sum()
        .reset_index()
    )

In [14]:
# Per-sector totals at two cut-off dates (through Jan 1998 and through
# Dec 2001); can be used for diverging graphs
before = year_month_counts(year=1998, month=1)
after = year_month_counts(year=2001, month=12)
before

Unnamed: 0,sector_name_1,count
0,Biotech Rsrch.,1
1,Business Software Dev.,17
2,Computer facilities mgmt.,3
3,Data Companies,4
4,Fund management activities,8
5,Info Tech Consulting,41
6,Info Tech services,34
7,Natural Sciences Rsrch.,3
8,Social Sciences Rsrch.,1
9,Software Dev.,7


In [15]:
# Order both snapshots from the largest to the smallest count
before = before.sort_values(by='count', ascending=False)
after = after.sort_values(by='count', ascending=False)

In [16]:
# Total 'count' for each year, returned as a flat two-column frame
year_grp = df1.groupby(['year'])
count_df = year_grp['count'].sum().reset_index()
count_df.head()

Unnamed: 0,year,count
0,1991,8
1,1992,14
2,1993,12
3,1994,16
4,1995,23


In [17]:
# Total 'count' for each sector, largest first
sect_grp = df1.groupby(['sector_name_1'])
sect_df = (
    sect_grp['count'].sum()
    .reset_index()
    .sort_values(by='count', ascending=False)
)

# Keep only the sectors whose total is above 100
sect_df = sect_df.loc[sect_df['count'] > 100]

# Widen the column display limit so long sector names are not truncated
pd.set_option('display.max_colwidth', 100)
print(sect_df.head(20))

                  sector_name_1  count
6          Info Tech Consulting   3251
1        Business Software Dev.   2456
7            Info Tech services   1903
12                Software Dev.    489
15                  Web portals    470
16   Wireless Telecommuncations    415
4                Data Companies    396
5    Fund management activities    381
14              Venture Capital    230
9          Open-ended Investors    159
2       Computer Game Companies    135
8       Natural Sciences Rsrch.    117
0                Biotech Rsrch.    110


In [18]:
# Write the filtered frame to the datasets folder, without the index column
save_path = os.path.join(
    base_path, 'datasets', 'industry_added_cleaned_data.csv')
df1.to_csv(path_or_buf=save_path, index=False)