# Part 1 - Data Cleaning

In [1]:
# Loading libraries:

import chardet
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the two raw inputs. Both are decoded as ISO-8859-1, which is used here
# to get past the pandas encoding error on read.
# companies.txt is read as a tab-delimited file.

rounds = pd.read_csv("rounds2.csv", encoding="ISO-8859-1")
companies = pd.read_csv("companies.txt", sep="\t", encoding="ISO-8859-1")

In [3]:
# Previewing the first five rows of the rounds data:

rounds.head()

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0
1,/ORGANIZATION/-QOUNTER,/funding-round/22dacff496eb7acb2b901dec1dfe5633,venture,A,14-10-2014,
2,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0
3,/ORGANIZATION/-THE-ONE-OF-THEM-INC-,/funding-round/650b8f704416801069bb178a1418776b,venture,B,30-01-2014,3406878.0
4,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0


In [4]:
# Inspecting the structure of the rounds DataFrame
# (column names, non-null counts and dtypes):

rounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114949 entries, 0 to 114948
Data columns (total 6 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   company_permalink        114949 non-null  object 
 1   funding_round_permalink  114949 non-null  object 
 2   funding_round_type       114949 non-null  object 
 3   funding_round_code       31140 non-null   object 
 4   funded_at                114949 non-null  object 
 5   raised_amount_usd        94959 non-null   float64
dtypes: float64(1), object(5)
memory usage: 5.3+ MB


In [5]:
# (number of rows, number of columns) of the rounds DataFrame:
rounds.shape

(114949, 6)

In [6]:
# Previewing the first five rows of the companies DataFrame:

companies.head()

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
0,/Organization/-Fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,
1,/Organization/-Qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
2,/Organization/-The-One-Of-Them-Inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,
3,/Organization/0-6-Com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007
4,/Organization/004-Technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,operating,USA,IL,"Springfield, Illinois",Champaign,01-01-2010


In [7]:
# Normalise the case of the key column: every permalink becomes lowercase.

companies = companies.assign(permalink=companies["permalink"].str.lower())
companies.head()

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
0,/organization/-fame,#fame,http://livfame.com,Media,operating,IND,16,Mumbai,Mumbai,
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,operating,USA,DE,DE - Other,Delaware City,04-09-2014
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,operating,,,,,
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,operating,CHN,22,Beijing,Beijing,01-01-2007
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,operating,USA,IL,"Springfield, Illinois",Champaign,01-01-2010


In [8]:
# Inspecting the structure of the companies DataFrame
# (column names, non-null counts and dtypes):
companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66368 entries, 0 to 66367
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   permalink      66368 non-null  object
 1   name           66367 non-null  object
 2   homepage_url   61310 non-null  object
 3   category_list  63220 non-null  object
 4   status         66368 non-null  object
 5   country_code   59410 non-null  object
 6   state_code     57821 non-null  object
 7   region         58338 non-null  object
 8   city           58340 non-null  object
 9   founded_at     51147 non-null  object
dtypes: object(10)
memory usage: 5.1+ MB


In [9]:
# (number of rows, number of columns) of the companies DataFrame:
companies.shape

(66368, 10)

Ideally, the permalink column in the companies dataframe should be the unique_key of the table, having 66368 unique company names (links, or permalinks). Also, these 66368 companies should be present in the rounds file.

Let's first confirm that these 66368 permalinks (which are the URL paths of companies' websites) are not repeating in the column, i.e. they are unique.



In [10]:
# Count the distinct permalinks in the companies table
# (dropna=False so a missing value would be counted too):

companies["permalink"].nunique(dropna=False)

66368

Thus, there are 66368 unique companies in the table and permalink is the unique primary key. Each row represents a unique company.

Let's now check whether all of these 66368 companies are present in the rounds file, and if some extra ones are present.

In [11]:
# Count the distinct companies referenced by the rounds table.
# The key column has a different name there: company_permalink.

rounds["company_permalink"].nunique(dropna=False)

90247

There seem to be 90247 unique values of company_permalink, whereas we expected only 66368. Maybe this is because of uppercase/lowercase issues.

Let's convert the column to lowercase and look at unique values again

In [12]:
# Lowercase the company permalinks in rounds so that case differences
# no longer produce distinct keys.
rounds = rounds.assign(company_permalink=rounds["company_permalink"].str.lower())
rounds.head()

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd
0,/organization/-fame,/funding-round/9a01d05418af9f794eebff7ace91f638,venture,B,05-01-2015,10000000.0
1,/organization/-qounter,/funding-round/22dacff496eb7acb2b901dec1dfe5633,venture,A,14-10-2014,
2,/organization/-qounter,/funding-round/b44fbb94153f6cdef13083530bb48030,seed,,01-03-2014,700000.0
3,/organization/-the-one-of-them-inc-,/funding-round/650b8f704416801069bb178a1418776b,venture,B,30-01-2014,3406878.0
4,/organization/0-6-com,/funding-round/5727accaeaa57461bd22a9bdd945382d,venture,A,19-03-2008,2000000.0


In [13]:
# Distinct company permalinks in rounds after lowercasing:
rounds["company_permalink"].nunique(dropna=False)

66370

There seem to be 2 extra permalinks in the rounds file which are not present in the companies file. Let's hope that this is a data quality issue, since if this were genuine, we have two companies whose investment round details are available but their metadata (company name, sector etc.) is not available in the companies table.


Let's have a look at the company permalinks which are in the 'rounds' file but not in 'companies'.

In [14]:
# Funding rounds whose company permalink has no match (~isin) in the companies table
rounds[~rounds["company_permalink"].isin(companies["permalink"])]

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd
77,/organization/10â°north,/funding-round/b41ff7de932f8b6e5bbeed3966c0ed6a,equity_crowdfunding,,12-08-2014,
729,/organization/51wofang-æ å¿§ææ¿,/funding-round/346b9180d276a74e0fbb2825e66c6f5b,venture,A,06-07-2015,5000000.0
2670,/organization/adslinkedâ¢,/funding-round/449ae54bb63c768c232955ca6911dee4,seed,,29-09-2014,100000.0
3166,/organization/aesthetic-everythingâ®-social-ne...,/funding-round/62593455f1a69857ed05d5734cc04132,equity_crowdfunding,,12-10-2014,
3291,/organization/affluent-attachã©-club-2,/funding-round/626678bdf1654bc4df9b1b34647a4df1,seed,,15-10-2014,100000.0
...,...,...,...,...,...,...
110545,/organization/whodatâs-spaces,/funding-round/d5d6db3d1e6c54d71a63b3aa0c9278e6,seed,,28-10-2014,30000.0
113839,/organization/zengame-ç¦æ¸¸ç§æ,/funding-round/6ba28fb4f3eadf5a9c6c81bc5dde6cdf,seed,,17-07-2010,
114946,/organization/ãeron,/funding-round/59f4dce44723b794f21ded3daed6e4fe,venture,A,01-08-2014,
114947,/organization/ãasys-2,/funding-round/35f09d0794651719b02bbfd859ba9ff5,seed,,01-01-2015,18192.0


All the permalinks have weird non-English characters. Let's see whether these characters are present in the original df as well

In [15]:
# Re-read the raw file, then look at a few rows (selected by position)
# whose permalinks contain the weird characters:

rounds_original = pd.read_csv("rounds2.csv", encoding="ISO-8859-1")
rounds_original.iloc[[29597, 31863, 45176, 58473]]

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd
29597,/ORGANIZATION/E-CÃBICA,/funding-round/8491f74869e4fe8ba9c378394f8fbdea,seed,,01-02-2015,
31863,/ORGANIZATION/ENERGYSTONE-GAMES-ÇµÇ³Æ¸¸Æ,/funding-round/b89553f3d2279c5683ae93f45a21cfe0,seed,,09-08-2014,
45176,/organization/huizuche-com-æ ç§ÿè½¦,/funding-round/8f8a32dbeeb0f831a78702f83af78a36,seed,,18-09-2014,
58473,/ORGANIZATION/MAGNET-TECH-Ç£Ç³Ç§Æ,/funding-round/8fc91fbb32bc95e97f151dd0cb4166bf,seed,,16-08-2014,1625585.0


The weird characters in the company permalinks appear when the data file is imported. To confirm whether these characters are actually present in the given data or whether Python has introduced them while importing into pandas, let's have a look at the original CSV file in Excel.

Thus, this is most likely a data quality issue we have introduced while reading the data file into python. Specifically, this is most likely caused because of encoding.

First, let's try to figure out the encoding type of this file. Then we can try specifying the encoding type at the time of reading the file. The chardet library shows the encoding type of a file.

In [16]:
# Guess the encoding of rounds2.csv from its raw bytes with chardet
# (imported in the library cell at the top of the notebook).
# The returned dict also carries a "confidence" value for the guess.

# The context manager closes the file handle as soon as the bytes are read.
with open("rounds2.csv", "rb") as raw_file:
    rawdata = raw_file.read()

result = chardet.detect(rawdata)
char_encode = result["encoding"]
print(char_encode)

Windows-1254


In [17]:
# Show the full chardet result, including the confidence of the guess:
print(result)

{'encoding': 'Windows-1254', 'confidence': 0.4186155476629225, 'language': 'Turkish'}


In [18]:
# Drop every non-ASCII character from the rounds permalinks: encode to UTF-8
# bytes, then decode as ASCII while ignoring the bytes that are not ASCII.
rounds["company_permalink"] = (
    rounds["company_permalink"]
    .str.encode("utf-8")
    .str.decode("ascii", errors="ignore")
)

# Rounds whose permalink is still not found in the companies table:
rounds[~rounds["company_permalink"].isin(companies["permalink"])]

Unnamed: 0,company_permalink,funding_round_permalink,funding_round_type,funding_round_code,funded_at,raised_amount_usd
77,/organization/10north,/funding-round/b41ff7de932f8b6e5bbeed3966c0ed6a,equity_crowdfunding,,12-08-2014,
729,/organization/51wofang-,/funding-round/346b9180d276a74e0fbb2825e66c6f5b,venture,A,06-07-2015,5000000.0
2670,/organization/adslinked,/funding-round/449ae54bb63c768c232955ca6911dee4,seed,,29-09-2014,100000.0
3166,/organization/aesthetic-everything-social-network,/funding-round/62593455f1a69857ed05d5734cc04132,equity_crowdfunding,,12-10-2014,
3291,/organization/affluent-attach-club-2,/funding-round/626678bdf1654bc4df9b1b34647a4df1,seed,,15-10-2014,100000.0
...,...,...,...,...,...,...
110545,/organization/whodats-spaces,/funding-round/d5d6db3d1e6c54d71a63b3aa0c9278e6,seed,,28-10-2014,30000.0
113839,/organization/zengame-,/funding-round/6ba28fb4f3eadf5a9c6c81bc5dde6cdf,seed,,17-07-2010,
114946,/organization/eron,/funding-round/59f4dce44723b794f21ded3daed6e4fe,venture,A,01-08-2014,
114947,/organization/asys-2,/funding-round/35f09d0794651719b02bbfd859ba9ff5,seed,,01-01-2015,18192.0


Now the permalink column looks clean. Let us look at the unique values in the rounds file

In [19]:
# Distinct company permalinks in rounds after stripping non-ASCII characters:

rounds["company_permalink"].nunique(dropna=False)

66368

Observation:        
1. Initially the rounds dataframe had 66370 unique permalinks; after cleaning it has 66368.
2. The companies and rounds dataframes now have the same number of unique permalinks.

We have looked at the rounds dataframe and resolved the encoding issue. What if the same encoding issue exists in the companies
dataframe? Let's look at the companies which are present in the companies file but not in the rounds file.

In [20]:
# Rows of the companies table whose permalink does not occur in the rounds table:

companies[~companies["permalink"].isin(rounds["company_permalink"])]

Unnamed: 0,permalink,name,homepage_url,category_list,status,country_code,state_code,region,city,founded_at
43,/organization/10ãâ°north,10ÃÂ°North,,Fashion,operating,CAN,ON,Toronto,Mississauga,05-01-2013
426,/organization/51wofang-ãâ ãâ¿â§ãëâãëâ¿,51wofang Ã¦â Ã¥Â¿Â§Ã¦ËâÃ¦ËÂ¿,http://www.51wofang.com,,closed,,,,,01-01-2014
1506,/organization/adslinkedã¢ââ¢,AdsLinkedÃ¢âÂ¢,http://www.adslinked.com,Advertising|Internet,operating,,,,,
1775,/organization/aesthetic-everythingã¢â®-social-...,Aesthetic EverythingÃÂ® Social Network,http://aestheticeverything.com/,Public Relations,operating,USA,CA,Los Angeles,Marina Del Rey,15-06-2009
1834,/organization/affluent-attachã£â©-club-2,Affluent AttachÃÂ© Club,http://www.affluentattache.com/,Hospitality,operating,USA,CA,Los Angeles,Beverly Hills,
...,...,...,...,...,...,...,...,...,...,...
63833,/organization/whodatã¢â¬â¢s-spaces,WhodatÃ¢â¬â¢s Spaces,,Apps,operating,,,,,
65778,/organization/zengame-ãâ¦â¦ãâ¸â¸ãâ§âãå¡â¬,ZenGame Ã§Â¦â¦Ã¦Â¸Â¸Ã§Â§âÃ¦Å â¬,http://www.zen-game.com,Internet|Mobile Games|Online Gaming,closed,,,,,17-07-2010
66365,/organization/ãâeron,ÃÂERON,http://www.aeron.hu/,,operating,,,,,01-01-2011
66366,/organization/ãâasys-2,Ãâasys,http://www.oasys.io/,Consumer Electronics|Internet of Things|Teleco...,operating,USA,CA,SF Bay Area,San Francisco,01-01-2014


Observation: The companies dataframe also has special characters, and we need to clean those first.

In [25]:
# Strip the non-ASCII characters from the companies permalinks, using the same
# UTF-8 -> ASCII("ignore") round trip that was applied to
# rounds["company_permalink"], so both key columns are cleaned identically.
#
# UTF-8 is used for the encode step because it can represent every character.
# Encoding to "Windows-1254" uses strict error handling and raises
# UnicodeEncodeError on any character outside that code page (for example the
# C1 control characters U+0080-U+009F that an ISO-8859-1 read can produce).
# Either way the non-ASCII bytes are discarded by the ASCII decode.

companies["permalink"] = (
    companies["permalink"]
    .str.encode("utf-8")
    .str.decode("ascii", errors="ignore")
)

In [29]:
# Save the cleaned rounds table as a comma-separated file, without the index column:

rounds.to_csv("rounds_clean.csv", index=False)

In [30]:
# Companies file:
# NOTE: this file is written tab-separated (sep="\t") even though its name
# ends in .csv, so it has to be read back with sep="\t".

companies.to_csv("companies_clean.csv",sep="\t",index=False)