In [61]:
import requests
import pandas as pd
from pymongo import MongoClient
from pymongo import ASCENDING, DESCENDING
from bson.json_util import dumps
import re

In [62]:
# Open a client against the local MongoDB server; the working database
# is the one named in the connection URI.
client = MongoClient(host="mongodb://localhost/datamad0320")
db = client.get_database()

In [63]:
# Select only the fields needed to look for tech startups that raised more than $1M.
all_companies = db.companies.find(
    filter={},
    projection={
        "name": 1,
        "offices": 1,
        "total_money_raised": 1,
        "number_of_employees": 1,
        "founded_year": 1,
        "category_code": 1,
    },
)

In [64]:
# Materialise the cursor into a DataFrame and preview it together with its shape.
df = pd.DataFrame(list(all_companies))
display(df.head(), df.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices
0,52cdef7c4bab8bd675297d8b,AdventNet,enterprise,600.0,1996.0,$0,"[{'description': 'Headquarters', 'address1': '..."
1,52cdef7c4bab8bd675297d92,Flektor,games_video,,,$0,"[{'description': None, 'address1': '8536 Natio..."
2,52cdef7c4bab8bd675297d8c,Zoho,software,1600.0,2005.0,$0,"[{'description': 'Headquarters', 'address1': '..."
3,52cdef7c4bab8bd675297d8a,Wetpaint,web,47.0,2005.0,$39.8M,"[{'description': '', 'address1': '710 - 2nd Av..."
4,52cdef7c4bab8bd675297d93,Fox Interactive Media,web,0.0,1979.0,$0,"[{'description': '', 'address1': '407 N Maple ..."


(18801, 7)

In [65]:
# Serialise every `_id` with bson's `dumps` so the values are JSON strings and do not
# cause problems when the frame is exported as JSON later.
# One column-level assignment replaces the former row-by-row chained writes
# (`df["_id"].iloc[i] = ...`), which raised pandas' SettingWithCopyWarning and are not
# guaranteed to write through to `df`.
df["_id"] = df["_id"].map(dumps)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [66]:
# Split each `offices` list so that every office gets its own row.
df = df.explode(column="offices")
display(df.head(), df.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4..."
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation..."
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4..."
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave..."
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette..."


(21762, 7)

In [67]:
def getCity(row):
    """Return the city of the office stored in ``row.offices``.

    Returns:
        * ``"NoOffice"`` when ``row.offices`` is not a plain ``dict``;
        * ``None`` when the office dict has no ``"city"`` key;
        * ``"NoCity"`` when the ``"city"`` value is falsy (empty string, ``None``, ...);
        * the ``"city"`` value otherwise.
    """
    office = row.offices
    if type(office) is not dict:
        return "NoOffice"
    if "city" not in office:
        return None
    # A falsy city falls back to the "NoCity" placeholder.
    return office["city"] or "NoCity"

# Apply getCity row by row and keep the result as a one-column frame named "city".
city = df.apply(getCity, axis=1).to_frame(name="city")
city.head()

Unnamed: 0,city
0,Pleasanton
1,Culver City
2,Pleasanton
3,Seattle
3,New York


In [68]:
# Attach the extracted city column to the company rows and drop the raw office dicts.
city_processed = pd.concat([df, city], axis="columns").drop(columns=["offices"])
display(city_processed.head(), city_processed.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,city
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,Pleasanton
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,Culver City
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,Pleasanton
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,Seattle
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,New York


(21762, 7)

In [69]:
# Persist the processed frame as CSV (the target path carries no file extension).
city_processed.to_csv(path_or_buf="../output/cities")

In [70]:
# Cities of the offices that belong to companies in the "design" category.
city_processed.loc[city_processed["category_code"] == "design", "city"]

1733     San Francisco
1733       Collingwood
1733            Berlin
15555       Ellensburg
16280         Brooklyn
18165           London
Name: city, dtype: object

In [71]:
# Most frequent office cities among "games_video" companies.
city_processed.loc[city_processed["category_code"] == "games_video", "city"].value_counts().head()

NoOffice         244
New York          75
San Francisco     68
NoCity            64
London            36
Name: city, dtype: int64

In [72]:
# Summary statistics of the founding year, used to choose the startup cut-off.
city_processed.loc[:, "founded_year"].describe()

count    15936.000000
mean      2002.062123
std         12.554885
min       1800.000000
25%       2000.000000
50%       2005.000000
75%       2007.000000
max       2013.000000
Name: founded_year, dtype: float64

In [73]:
# The largest founded_year is 2013, so the analysis is assumed to take place in 2014.
# Startups are therefore the companies with founded_year >= 2010 (3 years before 2013).
city_processed2 = city_processed.loc[city_processed["founded_year"].ge(2010)]

In [74]:
# Preview the recently founded companies, then show how many rows remain.
display(city_processed2.head())
display(city_processed2.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,city
281,"{""$oid"": ""52cdef7c4bab8bd675297ea7""}",Mokitown,web,,2011.0,$0,NoCity
499,"{""$oid"": ""52cdef7c4bab8bd675297f78""}",CircleUp,finance,11.0,2011.0,$9M,San Francisco
522,"{""$oid"": ""52cdef7c4bab8bd675297f94""}",PeekYou,search,20.0,2012.0,$1.83M,New York
532,"{""$oid"": ""52cdef7c4bab8bd675297f9e""}",GENWI,mobile,25.0,2010.0,$7.1M,Los Altos
549,"{""$oid"": ""52cdef7c4bab8bd675297fb1""}",headr,web,8.0,2012.0,$0,Berlin


(172, 7)

In [75]:
# Only tech startups are wanted, so list the category codes present (most frequent first).
print(city_processed2["category_code"].value_counts().index.tolist())

['web', 'games_video', 'software', 'mobile', 'enterprise', 'other', 'advertising', 'ecommerce', 'analytics', 'photo_video', 'travel', 'network_hosting', 'consulting', 'education', 'cleantech', 'biotech', 'security', 'hospitality', 'search', 'fashion', 'social', 'finance', 'messaging']


In [76]:
# Index labels of every row whose category is NOT one of the tech categories below.
# Rows with a missing category are not in the list either, so they are selected too.
index_NOtechs = city_processed2.index[
    ~city_processed2["category_code"].isin(
        [
            "web",
            "software",
            "games_video",
            "mobile",
            "network_hosting",
            "cleantech",
            "biotech",
            "photo_video",
            "hardware",
            "messaging",
        ]
    )
]

In [77]:
# Inspect the index labels selected for removal. Labels can repeat (the recorded
# output shows duplicates such as 1359 and 8030).
index_NOtechs

Int64Index([  499,   522,   817,  1103,  1106,  1359,  1359,  1359,  1404,
             1435,  1471,  2176,  2965,  3179,  3945,  4376,  4588,  4588,
             4696,  5077,  5228,  5383,  5502,  5502,  6005,  6262,  6314,
             7168,  7221,  7947,  8030,  8030,  8030,  8030,  8030,  8030,
             8698,  9005,  9153,  9726, 10149, 10505, 10505, 10576, 10653,
            11072, 11120, 11223, 12446, 13078, 13598, 13860, 14341, 14579,
            14647, 14917, 15214, 15334, 16284, 16467, 16581, 17436, 17985,
            18768, 18768],
           dtype='int64')

In [78]:
# Remove every row whose index label was flagged as non-tech.
city_processed3 = city_processed2.drop(labels=index_NOtechs, axis="index")
city_processed3.head()

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,city
281,"{""$oid"": ""52cdef7c4bab8bd675297ea7""}",Mokitown,web,,2011.0,$0,NoCity
532,"{""$oid"": ""52cdef7c4bab8bd675297f9e""}",GENWI,mobile,25.0,2010.0,$7.1M,Los Altos
549,"{""$oid"": ""52cdef7c4bab8bd675297fb1""}",headr,web,8.0,2012.0,$0,Berlin
549,"{""$oid"": ""52cdef7c4bab8bd675297fb1""}",headr,web,8.0,2012.0,$0,Hannover
612,"{""$oid"": ""52cdef7c4bab8bd675297fec""}",Fixya,web,30.0,2013.0,$8M,San Mateo


In [79]:
# To count as a startup a company must report a head count of at most 100 employees.
# `.notna()` replaces the former `!= None` comparison, which pandas evaluates
# element-wise to True for every row and therefore filtered nothing by itself.
# `.copy()` makes the result an independent frame, so columns can be added to it
# later without triggering SettingWithCopyWarning.
city_processed3 = city_processed3[
    city_processed3["number_of_employees"].notna()
    & (city_processed3["number_of_employees"] <= 100)
].copy()
display(city_processed3.head(), city_processed3.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,city
532,"{""$oid"": ""52cdef7c4bab8bd675297f9e""}",GENWI,mobile,25.0,2010.0,$7.1M,Los Altos
549,"{""$oid"": ""52cdef7c4bab8bd675297fb1""}",headr,web,8.0,2012.0,$0,Berlin
549,"{""$oid"": ""52cdef7c4bab8bd675297fb1""}",headr,web,8.0,2012.0,$0,Hannover
612,"{""$oid"": ""52cdef7c4bab8bd675297fec""}",Fixya,web,30.0,2013.0,$8M,San Mateo
1428,"{""$oid"": ""52cdef7c4bab8bd67529831a""}",Social Gaming Network,games_video,100.0,2011.0,$17.1M,Los Angeles


(63, 7)

In [80]:
# Distinct raw values of total_money_raised (most frequent first), to see which formats must be parsed.
print(city_processed3["total_money_raised"].value_counts().index.tolist())

['$0', '$1M', '$750k', '$17.1M', '$300k', '$930k', '$59k', '$8M', '$800k', '$4M', '$7.1M', '$7.55M', '$15k']


In [81]:
def getMoney(value):
    """Parse a money string such as ``"$7.1M"`` into thousands of dollars.

    Recognised prefixes are ``C$``, ``$``, ``£`` and ``€``; each one is multiplied
    by a hard-coded conversion factor (0.71, 1, 1.25 and 1.09 respectively).
    A ``k`` suffix means thousands, an ``M`` suffix means millions, and an amount
    without suffix is taken as single units. The result is rounded to 4 decimals.

    Args:
        value: the raw amount text, e.g. ``"$750k"``, ``"C$2M"``, ``"€500"``.

    Returns:
        The amount in thousands as a ``float``, or ``None`` when ``value`` is
        falsy (``None`` or an empty string).

    Raises:
        ValueError: if the numeric part cannot be converted with ``float``.
    """
    if not value:
        return None

    # (regex, split token, conversion factor). Order matters: "C$" has to be
    # tested before the bare "$", which would otherwise match it as well.
    currencies = (
        (r"^C\$\d+", "C$", 0.71),
        (r"\$\d+", "$", 1),
        (r"£\d+", "£", 1.25),
        (r"€\d+", "€", 1.09),
    )

    for pattern, symbol, rate in currencies:
        if not re.search(pattern, value):
            continue
        amount = value.split(symbol)[1]
        if "k" in amount:
            thousands = float(amount.split("k")[0]) * rate
        elif "M" in amount:
            thousands = float(amount.split("M")[0]) * 1000 * rate
        else:
            # Plain units. The C$ branch used to read `*0,71/1000,4` (comma instead
            # of a decimal point), which made round() fail with a TypeError.
            thousands = float(amount) * rate / 1000
        return round(thousands, 4)

    # No known currency symbol: treat the text as a plain number of units.
    return round(float(value) / 1000, 4)
            
# Add the parsed amount (in thousands) as a new column. `.assign` builds a new frame
# instead of writing into `city_processed3` in place, which avoids pandas'
# SettingWithCopyWarning when the frame was obtained by filtering another frame.
city_processed3 = city_processed3.assign(
    **{"total_money_Dollars (k)": city_processed3["total_money_raised"].apply(getMoney)}
)

In [82]:
# Distinct parsed amounts (in thousands), most frequent first, to sanity-check the conversion.
print(city_processed3["total_money_Dollars (k)"].value_counts().index.tolist())

[0.0, 1000.0, 17100.0, 750.0, 930.0, 300.0, 7550.0, 800.0, 59.0, 15.0, 4000.0, 8000.0, 7100.0]


In [83]:
# Keep the rows whose parsed amount is at least 1000 (thousands), i.e. 1M or more.
city_processed4 = city_processed3.loc[city_processed3["total_money_Dollars (k)"].ge(1000)]
display(city_processed4.head(), city_processed4.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,city,total_money_Dollars (k)
532,"{""$oid"": ""52cdef7c4bab8bd675297f9e""}",GENWI,mobile,25.0,2010.0,$7.1M,Los Altos,7100.0
612,"{""$oid"": ""52cdef7c4bab8bd675297fec""}",Fixya,web,30.0,2013.0,$8M,San Mateo,8000.0
1428,"{""$oid"": ""52cdef7c4bab8bd67529831a""}",Social Gaming Network,games_video,100.0,2011.0,$17.1M,Los Angeles,17100.0
1428,"{""$oid"": ""52cdef7c4bab8bd67529831a""}",Social Gaming Network,games_video,100.0,2011.0,$17.1M,Beverly Hills,17100.0
1912,"{""$oid"": ""52cdef7c4bab8bd675298509""}",Skydeck,mobile,9.0,2012.0,$4M,Palo Alto,4000.0


(9, 8)

In [84]:
# Most frequent cities in `city_processed5`.
# NOTE(review): `city_processed5` is never assigned in the cells visible here (the last
# frame built is `city_processed4`) — presumably it came from a cell that was removed;
# confirm its definition before relying on Restart & Run All.
city_processed5["city"].value_counts().head()

New York         4
San Francisco    3
Palo Alto        2
Santa Monica     2
Paris            2
Name: city, dtype: int64

San Francisco es la segunda ciudad con más startups tech con valor de más de 1M de dólares y la segunda ciudad con más oficinas de empresas dedicadas a gaming. Además, hay una empresa de design. Así que me centro en San Francisco.

In [85]:
# Export as json to do mongodb import
# $ mongoimport --db datamad0320 --collection companies_prepared --jsonArray companies_clean.json
# NOTE(review): `geo2` is not defined in any cell visible here and the recorded output of
# this cell is a NameError — confirm which frame is meant to be exported.
# NOTE(review): the mongoimport example above names companies_clean.json, while this line
# writes ../output/geocompanies.json — confirm which file name is intended.
geo2.to_json("../output/geocompanies.json",orient="records")

NameError: name 'geo2' is not defined