In [130]:
import pandas as pd

# Extraction

### Go to the files path

In [131]:
# The path to our CSV files to extract all Inc 5000 company list

# 2019 Inc 5000 company list
Inc_2019_csv = "Resources/inc5000-2019.csv"
# 2018 Inc 5000 company list
Inc_2018_csv = "Resources/inc5000-2018.csv"
# 2007-2017 Inc 5000 company list
Inc_10year_csv ="Resources/inc5000_all10years.csv"


### Read the files

In [132]:
# Read csv file
# Read 2019 Inc 5000 company list
Inc_2019_df = pd.read_csv(Inc_2019_csv)

# Read 2018 Inc 5000 company list
Inc_2018_df = pd.read_csv(Inc_2018_csv)


# Read 2007-2017 Inc 5000 company list
Inc_10year_df = pd.read_csv(Inc_10year_csv,encoding='cp1252')

# Transforming

### Cleaning the data 

#### Cleaning 2019 Inc 5000 company list

In [133]:
# Drop column we don't need
drop_column_2019=Inc_2019_df.drop(['Profile','_ - previous_workers','_ - founded','_ - metro','url','_ - yrs_on_list'], axis=1)

# Add the year column 
drop_column_2019['Year']= '2019'

# Rename columns 
rename_df_2019= drop_column_2019.rename(columns={"_ - rank":"Rank","name":"Company_Name","state":"State","_ - revenue":"Revenue $","_ - growth":"Growth","_ - industry":"Industry","_ - workers":"Number_of_Employees","city":"City"})


# re-Order the data to suitable format

cleaned_2019_df =rename_df_2019[['Rank','Year','Company_Name','Industry','Number_of_Employees','Revenue $','Growth','City','State']]
cleaned_2019_df.head()


Unnamed: 0,Rank,Year,Company_Name,Industry,Number_of_Employees,Revenue $,Growth,City,State
0,1,2019,Freestar,Advertising & Marketing,40.0,36.9 Million,36680.3882,Phoenix,AZ
1,2,2019,FreightWise,Logistics & Transportation,39.0,33.6 Million,30547.9317,Brentwood,TN
2,3,2019,Cece's Veggie Co.,Food & Beverage,190.0,24.9 Million,23880.4852,Austin,TX
3,4,2019,LadyBoss,Consumer Products & Services,57.0,32.4 Million,21849.8925,Albuquerque,NM
4,5,2019,Perpay,Retail,25.0,22.5 Million,18166.407,Philadelphia,PA


#### Cleaning 2018 Inc 5000 company list

In [134]:
# Drop column we don't need

drop_column_2018=Inc_2018_df.drop(['_ - id','_ - zipcode','_ - ifmid','_ - latitude','_ - longitude','_ - website','_ - state_l','_ - metrocode','_ - ifiid','_ - previous_workers','_ - metro','_ - founded','_ - url','_ - partner_lists - partner_lists','_ - yrs_on_list'], axis=1)

# Add the year column 
drop_column_2018['Year']= '2018'

# Rename columns 
rename_df_2018=drop_column_2018.rename(columns={"_ - rank":"Rank","_ - company":"Company_Name","_ - state_s":"State","_ - revenue":"Revenue $","_ - growth":"Growth","_ - industry":"Industry","_ - workers":"Number_of_Employees","_ - state_l":"State","_ - city":"City"})
rename_df_2018.head()

# Normalizing revenue in to mellions 
rename_df_2018['Revenue $']= (rename_df_2018['Revenue $']/1000000).apply(lambda x: '{:,.1f} Million'.format(x))


cleaned_2018_df =rename_df_2018[['Rank','Year','Company_Name','Industry','Number_of_Employees','Revenue $','Growth','City','State']]
cleaned_2018_df.head()

Unnamed: 0,Rank,Year,Company_Name,Industry,Number_of_Employees,Revenue $,Growth,City,State
0,1.0,2018,SwanLeap,Logistics & Transportation,49.0,99.0 Million,75660.8425,Madison,WI
1,2.0,2018,PopSockets,Consumer Products & Services,118.0,168.8 Million,71423.762,Boulder,CO
2,3.0,2018,Home Chef,Food & Beverage,865.0,255.0 Million,60165.5058,Chicago,IL
3,4.0,2018,Velocity Global,Business Products & Services,55.0,49.2 Million,39816.5093,Denver,CO
4,5.0,2018,DEPCOM Power,Energy,104.0,219.6 Million,38962.9022,Scottsdale,AZ


#### Cleaning 10 year (2007-2017) Inc 5000 company list

In [135]:
# Rename columns 

drop_column_10year_df=Inc_10year_df.drop(['_ - state_l','_ - metro','_ - yrs_on_list'],axis=1)

# Add the year column 
rename_10year_df=drop_column_10year_df.rename(columns={"year":"Year","_ - rank":"Rank","_ - company":"Company_Name","_ - website":"Company_Website","_ - state_s":"State","_ - revenue":"Revenue $","_ - growth":"Growth","_ - industry":"Industry","_ - workers":"Number_of_Employees","_ - founded":"Founded_Year","_ - city":"City"})


# Normalizing revenue in to mellions 
rename_10year_df['Revenue $']= (rename_10year_df['Revenue $']/1000000).apply(lambda x: '{:,.1f} Million'.format(x))

cleaned_10year_df =rename_10year_df[['Rank','Year','Company_Name','Industry','Number_of_Employees','Revenue $','Growth','City','State']]
cleaned_10year_df.head()


Unnamed: 0,Rank,Year,Company_Name,Industry,Number_of_Employees,Revenue $,Growth,City,State
0,1,2016,Loot Crate,Consumer Products & Services,218.0,116.2 Million,66788.5962,Los Angeles,CA
1,2,2016,Paint Nite,Consumer Products & Services,100.0,55.0 Million,36555.2472,Somerville,MA
2,3,2016,CalCom Solar,Energy,47.0,33.5 Million,31633.5448,Visalia,CA
3,4,2016,eLuxurySupply.com,Retail,82.0,30.7 Million,23619.7198,Evansville,IN
4,5,2016,Company.com,Business Products & Services,48.0,33.4 Million,23486.8894,Atlanta,GA


#### Combine all data in to a single dataframe

In [136]:
# combine all data in to a single dataframe

combine_data =[cleaned_2018_df,cleaned_2019_df,cleaned_10year_df]

Inc_5000_df= pd.concat(combine_data)
Inc_5000_df

Unnamed: 0,Rank,Year,Company_Name,Industry,Number_of_Employees,Revenue $,Growth,City,State
0,1.0,2018,SwanLeap,Logistics & Transportation,49.0,99.0 Million,75660.8425,Madison,WI
1,2.0,2018,PopSockets,Consumer Products & Services,118.0,168.8 Million,71423.7620,Boulder,CO
2,3.0,2018,Home Chef,Food & Beverage,865.0,255.0 Million,60165.5058,Chicago,IL
3,4.0,2018,Velocity Global,Business Products & Services,55.0,49.2 Million,39816.5093,Denver,CO
4,5.0,2018,DEPCOM Power,Energy,104.0,219.6 Million,38962.9022,Scottsdale,AZ
...,...,...,...,...,...,...,...,...,...
50192,4996.0,2017,Sonic Boom Wellness,Health,41.0,4.9 Million,40.2025,Carlsbad,CA
50193,4997.0,2017,Planet DDS,Software,28.0,6.2 Million,40.1933,Costa Mesa,CA
50194,4998.0,2017,J. Jill,Retail,3801.0,639.1 Million,40.1359,Quincy,MA
50195,4999.0,2017,STS Aviation Group,Logistics & Transportation,1704.0,232.1 Million,40.1260,Jensen Beach,FL


## Analysis