In [269]:
import requests
import pandas as pd
from pymongo import MongoClient
from pymongo import ASCENDING, DESCENDING
from bson.json_util import dumps
import re

In [270]:
client = MongoClient("mongodb://localhost/datamad0320")
db = client.get_database()

In [271]:
# Voy a selecconar los datos que me interesan para buscar startups de tech con más de $1M de cantidad alcanzada 
all_companies = db.companies.find({},{"name":1,"offices":1,"total_money_raised":1,"number_of_employees":1,
                                   "founded_year":1,"category_code":1})

In [272]:
df = pd.DataFrame(all_companies)
display(df.head(),df.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices
0,52cdef7c4bab8bd675297d8b,AdventNet,enterprise,600.0,1996.0,$0,"[{'description': 'Headquarters', 'address1': '..."
1,52cdef7c4bab8bd675297d92,Flektor,games_video,,,$0,"[{'description': None, 'address1': '8536 Natio..."
2,52cdef7c4bab8bd675297d8c,Zoho,software,1600.0,2005.0,$0,"[{'description': 'Headquarters', 'address1': '..."
3,52cdef7c4bab8bd675297d8a,Wetpaint,web,47.0,2005.0,$39.8M,"[{'description': '', 'address1': '710 - 2nd Av..."
4,52cdef7c4bab8bd675297d93,Fox Interactive Media,web,0.0,1979.0,$0,"[{'description': '', 'address1': '407 N Maple ..."


(18801, 7)

In [273]:
# convierto los id en formato json para que al guardarlos en formato json no me den problema después
for i in range(len(df["_id"])):
    df["_id"].iloc[i] = dumps(df["_id"].iloc[i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [274]:
# separar los objetos de las listas offices
df = df.explode("offices")
display(df.head(),df.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4..."
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation..."
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4..."
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave..."
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette..."


(21762, 7)

In [275]:
#hay 5057 valores nulos en offices
df.offices.isnull().value_counts()

False    16705
True      5057
Name: offices, dtype: int64

In [276]:
# transform office object into GeoPoint for office
def officeToGeoPoint(row):
    office = row.offices
    if type(office) == dict:
        if 'latitude' in office and 'longitude' in office:
            if office["latitude"] and office["longitude"]:
                return ({
                    "type":"Point",
                    "coordinates":[float(office["longitude"]),float(office["latitude"])]
                },"success")
            else:
                return(None,"Invalid lat and long")
        else:
            return (None,"No lat and long keys in office dict")
    return (None,"No office")

In [277]:
cleaned_offices = df.apply(officeToGeoPoint, axis=1, result_type="expand")
cleaned_offices.columns = ["office","state"]
display(cleaned_offices.head(),cleaned_offices.shape)

Unnamed: 0,office,state
0,"{'type': 'Point', 'coordinates': [-121.904945,...",success
1,"{'type': 'Point', 'coordinates': [-118.379768,...",success
2,"{'type': 'Point', 'coordinates': [-121.904945,...",success
3,"{'type': 'Point', 'coordinates': [-122.333253,...",success
3,"{'type': 'Point', 'coordinates': [-73.9964312,...",success


(21762, 2)

In [279]:
#hay 5057 valores No office (que corresponden a los nulos de antes) y ahora se han sumado 5871 valores de las oficinas que 
# no tienen datos de lat y long
cleaned_offices.state.value_counts()

success                 10834
Invalid lat and long     5871
No office                5057
Name: state, dtype: int64

In [280]:
company_processed = pd.concat([df,cleaned_offices], axis=1)
display(company_processed.head(),company_processed.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices,office,state
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'description': 'Headquarters', 'address1': '4...","{'type': 'Point', 'coordinates': [-121.904945,...",success
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'description': None, 'address1': '8536 Nation...","{'type': 'Point', 'coordinates': [-118.379768,...",success
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'description': 'Headquarters', 'address1': '4...","{'type': 'Point', 'coordinates': [-121.904945,...",success
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '710 - 2nd Ave...","{'type': 'Point', 'coordinates': [-122.333253,...",success
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'description': '', 'address1': '270 Lafayette...","{'type': 'Point', 'coordinates': [-73.9964312,...",success


(21762, 9)

In [282]:
#función para obtener los datos de ciudad donde se encuentran las oficinas
def getCity(row):
    of=row.offices
    if type(of) == dict:
        if "city" in of:
            if of["city"]:
                return of["city"]
            else:
                return "NoCity"
        else:
            return None
    else:
        return "NoOffice"

city = pd.DataFrame(company_processed.apply(getCity, axis=1))
city.columns=["city"]
city.head()

Unnamed: 0,city
0,Pleasanton
1,Culver City
2,Pleasanton
3,Seattle
3,New York


In [283]:
city_processed = pd.concat([company_processed,city], axis=1)
city_processed = city_processed.drop(columns=["offices"])
display(city_processed.head(),city_processed.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,office,state,city
0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,$0,"{'type': 'Point', 'coordinates': [-121.904945,...",success,Pleasanton
1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,$0,"{'type': 'Point', 'coordinates': [-118.379768,...",success,Culver City
2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,$0,"{'type': 'Point', 'coordinates': [-121.904945,...",success,Pleasanton
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'type': 'Point', 'coordinates': [-122.333253,...",success,Seattle
3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,$39.8M,"{'type': 'Point', 'coordinates': [-73.9964312,...",success,New York


(21762, 9)

In [287]:
list(city_processed["founded_year"].value_counts().index)

[2008.0,
 2007.0,
 2006.0,
 2005.0,
 2004.0,
 2009.0,
 1999.0,
 2003.0,
 2000.0,
 2001.0,
 2002.0,
 1998.0,
 1996.0,
 1997.0,
 1995.0,
 1994.0,
 1993.0,
 1992.0,
 1989.0,
 1991.0,
 2010.0,
 1986.0,
 1990.0,
 1988.0,
 2011.0,
 1987.0,
 1983.0,
 1967.0,
 1978.0,
 1982.0,
 1985.0,
 2012.0,
 1980.0,
 1984.0,
 1981.0,
 1979.0,
 1969.0,
 1976.0,
 1972.0,
 1974.0,
 1971.0,
 2013.0,
 1968.0,
 1970.0,
 1973.0,
 1975.0,
 1966.0,
 1959.0,
 1962.0,
 1930.0,
 1955.0,
 1954.0,
 1946.0,
 1947.0,
 1950.0,
 1898.0,
 1933.0,
 1921.0,
 1922.0,
 1890.0,
 1943.0,
 1923.0,
 1958.0,
 1840.0,
 1926.0,
 1977.0,
 1952.0,
 1963.0,
 1915.0,
 1888.0,
 1847.0,
 1920.0,
 1906.0,
 1964.0,
 1912.0,
 1902.0,
 1901.0,
 1961.0,
 1960.0,
 1800.0,
 1957.0,
 1945.0,
 1833.0,
 1951.0,
 1851.0,
 1879.0,
 1965.0,
 1897.0,
 1865.0,
 1937.0,
 1938.0,
 1846.0,
 1854.0,
 1939.0,
 1919.0,
 1940.0,
 1909.0,
 1892.0,
 1903.0,
 1948.0,
 1889.0,
 1880.0,
 1900.0,
 1914.0,
 1941.0,
 1936.0,
 1867.0,
 1844.0,
 1802.0,
 1899.0,
 1881.0,
 

In [284]:
#voy a seleccionar startups, por lo tanto, selecciono solo aquellas cuyo founded year sea >= 2014
city_processed2 = city_processed[city_processed["founded_year"]>=2014]

In [285]:
display(city_processed2.head(),city_processed2.shape)

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,office,state,city


(0, 9)

In [249]:
city_processed2["city"].value_counts()

NoOffice         312
San Francisco     50
New York          39
NoCity            31
London            27
                ... 
Marinette          1
Covington          1
Panama City        1
paris              1
Des Moines         1
Name: city, Length: 279, dtype: int64

In [250]:
print(list(city_processed2["category_code"].value_counts().index))

['web', 'software', 'games_video', 'mobile', 'advertising', 'other', 'ecommerce', 'enterprise', 'public_relations', 'consulting', 'search', 'network_hosting', 'cleantech', 'travel', 'analytics', 'biotech', 'photo_video', 'security', 'hardware', 'social', 'education', 'messaging', 'legal', 'hospitality', 'design', 'fashion', 'music', 'finance', 'medical', 'news']


In [251]:
index_NOtechs = city_processed2.index[(city_processed2["category_code"]!="web")&(city_processed2["category_code"]!= "software")&
                                      (city_processed2["category_code"]!="games_video")&(city_processed2["category_code"]!="mobile")&
                                      (city_processed2["category_code"]!="network_hosting")&(city_processed2["category_code"]!="cleantech")&
                                      (city_processed2["category_code"]!="biotech")&(city_processed2["category_code"]!="photo_video")&
                                      (city_processed2["category_code"]!="hardware")&(city_processed2["category_code"]!="messaging")]

In [252]:
index_NOtechs

Int64Index([  499,   522,   817,  1103,  1106,  1359,  1359,  1359,  1404,
             1435,
            ...
            18699, 18713, 18762, 18768, 18768, 18769, 18771, 18771, 18779,
            18796],
           dtype='int64', length=351)

In [253]:
city_processed3 = city_processed2.drop(index=index_NOtechs)
city_processed3

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices,office,state,city
281,"{""$oid"": ""52cdef7c4bab8bd675297ea7""}",Mokitown,web,,2011.0,$0,"{'description': None, 'address1': None, 'addre...","{'type': 'Point', 'coordinates': [-95.712891, ...",success,NoCity
532,"{""$oid"": ""52cdef7c4bab8bd675297f9e""}",GENWI,mobile,25.0,2010.0,$7.1M,"{'description': '', 'address1': '4966 El Camin...","{'type': 'Point', 'coordinates': [-111.9035, 3...",success,Los Altos
549,"{""$oid"": ""52cdef7c4bab8bd675297fb1""}",headr,web,8.0,2012.0,$0,"{'description': '', 'address1': 'Ritterstr. 12...","{'type': 'Point', 'coordinates': [13.4109071, ...",success,Berlin
549,"{""$oid"": ""52cdef7c4bab8bd675297fb1""}",headr,web,8.0,2012.0,$0,"{'description': '', 'address1': 'Fischerstr. 1...",,Invalid lat and long,Hannover
612,"{""$oid"": ""52cdef7c4bab8bd675297fec""}",Fixya,web,30.0,2013.0,$8M,"{'description': '', 'address1': 'One Franklin ...","{'type': 'Point', 'coordinates': [-122.323895,...",success,San Mateo
...,...,...,...,...,...,...,...,...,...,...
18761,"{""$oid"": ""52cdef7f4bab8bd67529c6d3""}",Gambolio,web,1.0,2009.0,$0,,,No office,NoOffice
18764,"{""$oid"": ""52cdef7f4bab8bd67529c6dc""}",acrossair,mobile,6.0,2009.0,$0,"{'description': '198 High Holborn', 'address1'...","{'type': 'Point', 'coordinates': [-0.1165365, ...",success,London
18772,"{""$oid"": ""52cdef7f4bab8bd67529c6e4""}",Getyoo,mobile,10.0,2009.0,€1.13M,"{'description': 'HQ', 'address1': 'Rue des Tan...","{'type': 'Point', 'coordinates': [4.3464721, 5...",success,Brussels
18782,"{""$oid"": ""52cdef7f4bab8bd67529c6eb""}",DocASAP,web,7.0,2012.0,$800k,"{'description': '', 'address1': '115 5th Ave',...",,Invalid lat and long,New York


In [254]:
city_processed3["city"].value_counts()

NoOffice                  195
San Francisco              25
London                     19
NoCity                     18
New York                   18
                         ... 
CornellÃ  de Llobregat      1
Shanghai                    1
Phoenix                     1
Oakland                     1
none                        1
Name: city, Length: 197, dtype: int64

In [256]:
print(list(city_processed3["total_money_raised"].value_counts().index))

['$0', '$34.8M', '€3M', '$2M', '$46.6M', '$100k', '$200k', '€1M', '$1M', '$500k', '$341M', '$4.6M', '$15.1M', '$14.3M', '$8M', '$17.1M', '$147M', '$750k', '$6M', '$300k', '$1.5M', '$40k', '€200k', '$3M', '$930k', '$800k', '$10k', '$20k', '$6.75M', '$1.2M', '$20M', '$4M', '$6.5M', '$2.6M', '$10.2M', '$5.46M', '$15k', '$350k', '€1.5M', '€1.13M', '$4.26M', '$16M', '$13.4M', '€141k', '$44M', '$55.8M', '$400k', '$77.5M', '$7.55M', '£25k', '$4.03M', '$3.72M', '$78.1M', '€50k', '£50k', '$25k', '$21.1M', '£500k', '£11k', '$50k', 'C$1M', '£3M', '$2.98M', '$570k', '€25k', '$122M', '$550k', '$5.08M', '$3.64M', '$13M', '$17.2M', '$357M', '$3.1M', '€25M', '$41.3M', '$59k', '$18.5M', '$7.97M', '$1.6M', '$68.8M', '$7.1M', '$600k', '$1.7M']


In [263]:
def getMoney(value):
        if value:
            if re.search("\$\d+",value):
                value = value.split("$")[1]
                if "k" in value:
                    return round(float(value.split("k")[0]),4)
                elif "M" in value:
                    return round(float(value.split("M")[0])*1000,4)
                else:
                    return round(float(value)/1000,4)
            elif re.search("\£\d+",value):
                value = value.split("£")[1]
                if "k" in value:
                    return round(float(value.split("k")[0])*1.25,4)
                elif "M" in value:
                    return round(float(value.split("M")[0])*1.25*1000,4)
                else:
                    return round(float(value)*1.25/1000,4)
            elif re.search("\€\d+",value):
                value = value.split("€")[1]
                if "k" in value:
                    return round(float(value.split("k")[0])*1.09,4)
                elif "M" in value:
                    return round(float(value.split("M")[0])*1.09*1000,4)
                else:
                    return round(float(value)*1.09/1000,4)
                
            else:
                return round(float(value)/1000,4)
            
city_processed3["total_money_Dollars (k)"] = city_processed3["total_money_raised"].apply(getMoney)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [265]:
print(list(city_processed3["total_money_Dollars (k)"].value_counts().index))

[0.0, 34800.0, 2000.0, 3270.0, 1000.0, 100.0, 200.0, 46600.0, 1090.0, 15100.0, 341000.0, 4600.0, 14300.0, 500.0, 8000.0, 750.0, 3000.0, 6750.0, 147000.0, 10.0, 20.0, 218.0, 40.0, 300.0, 800.0, 17100.0, 930.0, 6000.0, 1500.0, 4030.0, 400.0, 600.0, 1600.0, 50.0, 59.0, 62.5, 31.25, 27250.0, 13400.0, 4260.0, 1200.0, 54.5, 25.0, 16000.0, 7550.0, 15.0, 570.0, 550.0, 3720.0, 21100.0, 5460.0, 4000.0, 55800.0, 3100.0, 13.75, 27.25, 2980.0, 6500.0, 44000.0, 5080.0, 17200.0, 41300.0, 122000.0, 357000.0, 1700.0, 77500.0, 2600.0, 1635.0, 7100.0, 350.0, 3750.0, 7970.0, 20000.0, 13000.0, 10200.0, 153.69, 78100.0, 3640.0, 625.0, 18500.0, 68800.0, 1231.7]


In [266]:
city_processed4 = city_processed3[city_processed3["total_money_Dollars (k)"]>=1000]
city_processed4.head()

Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,total_money_raised,offices,office,state,city,total_money_Dollars,total_money_Dollars (k)
532,"{""$oid"": ""52cdef7c4bab8bd675297f9e""}",GENWI,mobile,25.0,2010.0,$7.1M,"{'description': '', 'address1': '4966 El Camin...","{'type': 'Point', 'coordinates': [-111.9035, 3...",success,Los Altos,7100.0,7100.0
612,"{""$oid"": ""52cdef7c4bab8bd675297fec""}",Fixya,web,30.0,2013.0,$8M,"{'description': '', 'address1': 'One Franklin ...","{'type': 'Point', 'coordinates': [-122.323895,...",success,San Mateo,8000.0,8000.0
638,"{""$oid"": ""52cdef7c4bab8bd675298009""}",Mobovivo,mobile,22.0,2009.0,C$1M,"{'description': 'Canadian', 'address1': '1400 ...",,Invalid lat and long,Calgary,1000.0,1000.0
659,"{""$oid"": ""52cdef7c4bab8bd67529801f""}",Wamba,web,120.0,2013.0,€3M,,,No office,NoOffice,3270.0,3270.0
1046,"{""$oid"": ""52cdef7c4bab8bd6752981a0""}",Pinger,messaging,,2012.0,$18.5M,"{'description': '', 'address1': '97 S. 2nd Str...",,Invalid lat and long,San Jose,18500.0,18500.0


In [268]:
city_processed4["city"].value_counts().head()

San Francisco    11
New York          7
NoCity            4
Palo Alto         4
London            3
Name: city, dtype: int64

San Francisco es la ciudad donde más startups tech con valor de más de 1M de dolares hay. Asi que me centro en San Francisco

In [41]:
geo2 = GeoCompanies.drop(columns="_id")
geo2.shape

(16705, 2)

In [42]:
# Export as json to do mongodb import
# $ mongoimport --db datamad0320 --collection companies_prepared --jsonArray companies_clean.json
geo2.to_json("../output/geocompanies.json",orient="records")