In [1]:
import pandas as pd

### Categorical variables

In [2]:
# A categorical Series can be created directly by asking the Series
# constructor for dtype="category".

s = pd.Series(list("abca"), dtype="category")
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [4]:
# A second route: start from an ordinary column and cast it to the
# category dtype with astype.

df = pd.DataFrame({"A": list("abca")}).assign(
    B=lambda frame: frame["A"].astype("category")
)
df

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [5]:
# A pandas.Categorical can also be built explicitly and later handed to a
# Series. Note that "a" is not among the allowed categories here.

raw_cat = pd.Categorical(
    list("abca"),
    categories=list("bcd"),
    ordered=False,
)

In [6]:
# Wrap the Categorical in a Series. Values that are not listed in
# `categories` (here "a") are displayed as NaN in the output below.
s = pd.Series(raw_cat)
s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b, c, d]

### Dummy variables

In [22]:
# pd.get_dummies converts categorical variables into dummy (indicator)
# variables. Start with a small DataFrame holding one categorical column.

df = pd.DataFrame({'data1': ['b', 'b', 'a', 'c', 'a', 'b'], 'key': range(6)})

In [23]:
# Display the small frame before any dummy encoding is applied.
df

Unnamed: 0,data1,key
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [24]:
# Expand 'data1' into one indicator column per distinct value.

pd.get_dummies(df.loc[:, 'data1'])

Unnamed: 0,a,b,c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0


In [25]:
# Swap the raw 'data1' column for its indicator columns, keeping the
# remaining columns on the left.
df.drop('data1', axis=1).join(pd.get_dummies(df['data1']))

Unnamed: 0,key,a,b,c
0,0,0.0,1.0,0.0
1,1,0.0,1.0,0.0
2,2,1.0,0.0,0.0
3,3,0.0,0.0,1.0
4,4,1.0,0.0,0.0
5,5,0.0,1.0,0.0


In [54]:
 # Load the Crunchbase monthly export; this rebinds `df`, replacing the toy frame.
 # NOTE(review): the path is absolute and machine-specific, so it only resolves
 # on a machine with this exact directory layout — consider a path relative to
 # the notebook or a configurable data directory.
 df = pd.read_csv('/Users/Rebecca/DSI-projects/curriculum/week04/3.1-dummy_variables/crunchbase_monthly_export.csv')

In [55]:
# Preview the first five rows of the loaded export.
df.head(n=5)

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18
0,/organization/canal-do-credito,Canal do Credito,http://www.canaldocredito.com.br,|Credit|Technology|Services|Finance|,Credit,750000,,BRA,,Rio de Janeiro,Belo Horizonte,1,,,,,1/1/10,1/1/10,
1,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,Entertainment,1750000,acquired,USA,NY,New York City,New York,1,6/1/12,2012-06,2012-Q2,2012.0,6/30/12,6/30/12,
2,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,operating,USA,CA,Los Angeles,Los Angeles,2,,,,,6/4/10,9/23/10,
3,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,Education,40000,operating,EST,,Tallinn,Tallinn,1,10/26/12,2012-10,2012-Q4,2012.0,8/9/12,8/9/12,
4,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,Apps,1500000,operating,GBR,,London,London,1,4/1/11,2011-04,2011-Q2,2011.0,4/1/11,4/1/11,


In [56]:
# List the column names. As the output shows, ' market ' and
# ' funding_total_usd ' carry leading/trailing spaces, so the cells below
# index them with the padded names.
df.columns

Index([u'permalink', u'name', u'homepage_url', u'category_list', u' market ',
       u' funding_total_usd ', u'status', u'country_code', u'state_code',
       u'region', u'city', u'funding_rounds', u'founded_at', u'founded_month',
       u'founded_quarter', u'founded_year', u'first_funding_at',
       u'last_funding_at', u'Unnamed: 18'],
      dtype='object')

In [57]:
# Number of distinct 'category_list' values (missing values count as one).
df['category_list'].nunique(dropna=False)

13192

In [58]:
# Number of distinct ' market ' values (missing values count as one).
df[' market '].nunique(dropna=False)

548

In [59]:
# Replace the ' market ' column with one indicator column per market value.
pd.concat(
    [df.drop(' market ', axis=1), pd.get_dummies(df[' market '])],
    axis=1
)

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,...,Web Hosting,Web Tools,Weddings,Wholesale,Wine And Spirits,Wireless,iOS,iPad,iPhone,mHealth
0,/organization/canal-do-credito,Canal do Credito,http://www.canaldocredito.com.br,|Credit|Technology|Services|Finance|,750000,,BRA,,Rio de Janeiro,Belo Horizonte,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,1750000,acquired,USA,NY,New York City,New York,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,4000000,operating,USA,CA,Los Angeles,Los Angeles,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,40000,operating,EST,,Tallinn,Tallinn,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,1500000,operating,GBR,,London,London,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,/organization/n-plusn,#NAME?,http://plusn.com,|Software|,600000,operating,USA,NY,New York City,New York,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,/organization/club-domains,.Club Domains,http://nic.club/,|Software|,7000000,,USA,FL,Ft. Lauderdale,Oakland Park,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,/organization/fox-networks,.Fox Networks,http://www.dotfox.com,|Advertising|,4912393,closed,ARG,,Buenos Aires,Buenos Aires,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,/organization/0-6-com,0-6.com,http://www.0-6.com,|Curated Web|,2000000,operating,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,|Software|,-,operating,USA,IL,"Springfield, Illinois",Champaign,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
# Number of distinct 'status' values (missing values count as one).
df['status'].nunique(dropna=False)

4

In [61]:
# Add a running row number as an explicit 'key' column.
df['key'] = range(df.shape[0])

In [62]:
# Replace 'status' with its indicator columns, appended on the right.
df.drop('status', axis=1).join(pd.get_dummies(df['status']))

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,country_code,state_code,region,city,...,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18,key,acquired,closed,operating
0,/organization/canal-do-credito,Canal do Credito,http://www.canaldocredito.com.br,|Credit|Technology|Services|Finance|,Credit,750000,BRA,,Rio de Janeiro,Belo Horizonte,...,,,,1/1/10,1/1/10,,0,0.0,0.0,0.0
1,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,Entertainment,1750000,USA,NY,New York City,New York,...,2012-06,2012-Q2,2012.0,6/30/12,6/30/12,,1,1.0,0.0,0.0
2,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000,USA,CA,Los Angeles,Los Angeles,...,,,,6/4/10,9/23/10,,2,0.0,0.0,1.0
3,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,Education,40000,EST,,Tallinn,Tallinn,...,2012-10,2012-Q4,2012.0,8/9/12,8/9/12,,3,0.0,0.0,1.0
4,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,Apps,1500000,GBR,,London,London,...,2011-04,2011-Q2,2011.0,4/1/11,4/1/11,,4,0.0,0.0,1.0
5,/organization/n-plusn,#NAME?,http://plusn.com,|Software|,Software,600000,USA,NY,New York City,New York,...,2012-01,2012-Q1,2012.0,8/29/12,8/29/12,,5,0.0,0.0,1.0
6,/organization/club-domains,.Club Domains,http://nic.club/,|Software|,Software,7000000,USA,FL,Ft. Lauderdale,Oakland Park,...,2011-10,2011-Q4,2011.0,5/31/13,5/31/13,,6,0.0,0.0,0.0
7,/organization/fox-networks,.Fox Networks,http://www.dotfox.com,|Advertising|,Advertising,4912393,ARG,,Buenos Aires,Buenos Aires,...,,,,1/16/07,1/16/07,,7,0.0,1.0,0.0
8,/organization/0-6-com,0-6.com,http://www.0-6.com,|Curated Web|,Curated Web,2000000,,,,,...,2007-01,2007-Q1,2007.0,3/19/08,3/19/08,,8,0.0,0.0,1.0
9,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,|Software|,Software,-,USA,IL,"Springfield, Illinois",Champaign,...,2010-01,2010-Q1,2010.0,7/24/14,7/24/14,,9,0.0,0.0,1.0


In [63]:
# Number of distinct 'country_code' values (missing values count as one).
df['country_code'].nunique(dropna=False)

128

In [64]:
# Number of distinct 'funding_rounds' values (missing values count as one).
df['funding_rounds'].nunique(dropna=False)

16

In [65]:
# Inspect the column dtypes. Note in the output that ' funding_total_usd '
# is object (strings) rather than a numeric type, which motivates the
# cleanup cells that follow.
df.dtypes

permalink               object
name                    object
homepage_url            object
category_list           object
 market                 object
 funding_total_usd      object
status                  object
country_code            object
state_code              object
region                  object
city                    object
funding_rounds           int64
founded_at              object
founded_month           object
founded_quarter         object
founded_year           float64
first_funding_at        object
last_funding_at         object
Unnamed: 18            float64
key                      int64
dtype: object

In [66]:
# Remove thousands separators from the funding strings.
df[' funding_total_usd '] = df[' funding_total_usd '].map(
    lambda amount: amount.replace(',', '')
)

In [67]:
# Remove '-' characters; some rows hold a bare '-' instead of an amount.
df[' funding_total_usd '] = df[' funding_total_usd '].map(
    lambda amount: amount.replace('-', '')
)

In [77]:
# Remove space characters from the funding strings.
df[' funding_total_usd '] = df[' funding_total_usd '].map(
    lambda amount: amount.replace(' ', '')
)

In [78]:
def fill_in(x):
    """Convert an iterable of numeric strings into a list of floats.

    Parameters
    ----------
    x : iterable
        Values to convert. Each item is either blank (an empty or
        whitespace-only string) or something ``float()`` accepts.

    Returns
    -------
    list of float
        One float per input item, in order. Blank items become
        ``float('nan')``, so the result is homogeneous and yields a
        numeric (float) column when assigned to a DataFrame.

    Raises
    ------
    ValueError
        If a non-blank item cannot be parsed by ``float()``.
    """
    nan = float('nan')
    converted = []
    for raw in x:
        # Use a real float NaN, not the string 'NaN': mixing str and float
        # would leave the column as object dtype and break numeric operations.
        # hasattr guards non-string items (e.g. already-converted floats on a
        # cell re-run), which go straight to float().
        is_blank = hasattr(raw, 'strip') and raw.strip() == ''
        converted.append(nan if is_blank else float(raw))
    return converted

In [79]:
# Spot-check the first cleaned value (row label 0).
df.loc[0, ' funding_total_usd ']

'750000'

In [81]:
# Overwrite the column with the list returned by fill_in: non-blank strings
# are converted with float(), blank entries get fill_in's missing-value marker.
df[' funding_total_usd '] = fill_in(df[' funding_total_usd '])

In [82]:
# Preview the first five rows after converting the funding column.
df.head(n=5)

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18,key
0,/organization/canal-do-credito,Canal do Credito,http://www.canaldocredito.com.br,|Credit|Technology|Services|Finance|,Credit,750000.0,,BRA,,Rio de Janeiro,Belo Horizonte,1,,,,,1/1/10,1/1/10,,0
1,/organization/waywire,#waywire,http://www.waywire.com,|Entertainment|Politics|Social Media|News|,Entertainment,1750000.0,acquired,USA,NY,New York City,New York,1,6/1/12,2012-06,2012-Q2,2012.0,6/30/12,6/30/12,,1
2,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,|Games|,Games,4000000.0,operating,USA,CA,Los Angeles,Los Angeles,2,,,,,6/4/10,9/23/10,,2
3,/organization/rock-your-paper,'Rock' Your Paper,http://www.rockyourpaper.org,|Publishing|Education|,Education,40000.0,operating,EST,,Tallinn,Tallinn,1,10/26/12,2012-10,2012-Q4,2012.0,8/9/12,8/9/12,,3
4,/organization/in-touch-network,(In)Touch Network,http://www.InTouchNetwork.com,|Electronics|Guides|Coffee|Restaurants|Music|i...,Apps,1500000.0,operating,GBR,,London,London,1,4/1/11,2011-04,2011-Q2,2011.0,4/1/11,4/1/11,,4
