# **Analysis to choose a city**

In [13]:
import pandas as pd

In [14]:
# Load the cleaned companies dataset from its CSV file into a DataFrame.
clean_dataset_path = "../output/clean_dataset.csv"
df_clean = pd.read_csv(clean_dataset_path)

In [15]:
# Preview the first five rows of the loaded dataset.
df_clean.head(n=5)

Unnamed: 0.1,Unnamed: 0,_id,name,category_code,number_of_employees,founded_year,city,location,latitude,longitude,total_money_raised ($k)
0,0,"{""$oid"": ""52cdef7c4bab8bd675297d8b""}",AdventNet,enterprise,600.0,1996.0,Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",37.692934,-121.904945,0.0
1,1,"{""$oid"": ""52cdef7c4bab8bd675297d92""}",Flektor,games_video,,,Culver City,"{'type': 'Point', 'coordinates': [-118.379768,...",34.025958,-118.379768,0.0
2,2,"{""$oid"": ""52cdef7c4bab8bd675297d8c""}",Zoho,software,1600.0,2005.0,Pleasanton,"{'type': 'Point', 'coordinates': [-121.904945,...",37.692934,-121.904945,0.0
3,3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,Seattle,"{'type': 'Point', 'coordinates': [-122.333253,...",47.603122,-122.333253,39800.0
4,3,"{""$oid"": ""52cdef7c4bab8bd675297d8a""}",Wetpaint,web,47.0,2005.0,New York,"{'type': 'Point', 'coordinates': [-73.9964312,...",40.7237306,-73.9964312,39800.0


#### **First, cities where there are companies doing design**

In [16]:
# Count, per city, the rows whose category_code is "design".
is_design = df_clean["category_code"] == "design"
df_clean.loc[is_design, "city"].value_counts()

San Francisco    1
London           1
Berlin           1
Brooklyn         1
Collingwood      1
Ellensburg       1
Name: city, dtype: int64

#### **Next, cities with the most tech startups that raised at least $1M ("total_money_raised") and whose "founded_year" is after 2009**

In [17]:
# Count, per city, the tech companies founded in 2010 or later that raised at least $1M
# (the money column is expressed in $k, hence the 1000 threshold).
# Why 2010: the dataset contains no company with a "founded_year" after 2013 (it may date
# from 2014), so the cut-off is set 3 years before 2013.
tech_categories = [
    "web", "software", "games_video", "mobile", "network_hosting",
    "cleantech", "biotech", "photo_video", "hardware", "messaging",
]
is_tech = df_clean["category_code"].isin(tech_categories)
is_recent = df_clean["founded_year"] >= 2010
raised_1m = df_clean["total_money_raised ($k)"] >= 1000.0
df_clean.loc[is_tech & is_recent & raised_1m, "city"].value_counts()

San Francisco          3
NoOffice               2
Denver                 2
Scottsdale             2
Hopkinton              2
Tel Aviv               1
Santa Monica           1
Los Angeles            1
Bangalore              1
Santa Clara            1
New York               1
SOUTH BOSTON           1
Fremont                1
NoCity                 1
Waterloo, ON           1
Palo Alto              1
Los Altos              1
San Mateo              1
San Jose               1
Beverly Hills          1
London                 1
South San Francisco    1
Name: city, dtype: int64

#### **Finally, cities ranked by number of companies (or offices)**

In [18]:
# Number of rows (companies or offices) recorded for each city, most frequent first.
city_counts = df_clean["city"].value_counts()
city_counts

NoOffice              5057
San Francisco          906
New York               837
NoCity                 746
London                 616
                      ... 
Brookfield               1
Las Rozas (Madrid)       1
Rossland                 1
Gentbrugge               1
Yorba Linda              1
Name: city, Length: 3126, dtype: int64

#### **Ignoring the "NoOffice" and "NoCity" placeholder values, San Francisco is the city with the most companies; it also has a design company and the most successful tech startups. So, San Francisco is going to be the city where the company will be located**

In [19]:
# Keep only the rows (companies or offices) whose city is San Francisco.
in_san_francisco = df_clean["city"] == "San Francisco"
companiesSF = df_clean.loc[in_san_francisco]

In [20]:
# Remove the "_id" column: several "_id" values are repeated (due to the explode() function)
# and MongoDB does not import documents with the same "_id".
# Also remove the unnecessary columns "Unnamed: 0" and "city".
# errors="ignore" keeps this cell idempotent: it reassigns companiesSF from itself, so
# without it a second run would raise KeyError because the columns are already gone.
# Trade-off: a misspelled column name here would be skipped silently.
companiesSF = companiesSF.drop(columns=["_id", "Unnamed: 0", "city"], errors="ignore")

In [21]:
# Show the first rows of the San Francisco subset, then its (rows, columns) shape.
display(companiesSF.head())
display(companiesSF.shape)

Unnamed: 0,name,category_code,number_of_employees,founded_year,location,latitude,longitude,total_money_raised ($k)
7,Digg,news,60.0,2004.0,"{'type': 'Point', 'coordinates': [-122.394523,...",37.764726,-122.394523,45000.0
9,Scribd,news,50.0,2007.0,"{'type': 'Point', 'coordinates': [-122.404052,...",37.789634,-122.404052,25800.0
15,StumbleUpon,web,,2002.0,"{'type': 'Point', 'coordinates': [-122.419204,...",37.775196,-122.419204,18500.0
26,Twitter,social,1300.0,2006.0,"{'type': 'Point', 'coordinates': [-122.4169244...",37.7768052,-122.4169244,1160000.0
28,Powerset,search,60.0,2006.0,"{'type': 'Point', 'coordinates': [-122.395289,...",37.778613,-122.395289,22500.0


(906, 8)

#### **Save dataset into json and csv formats**

In [22]:
# Export the San Francisco subset twice: as a JSON array of records and as a CSV file.
# Note: to_csv writes the DataFrame index as an unnamed first column by default, while
# the records-oriented JSON omits it.
# NOTE(review): consider index=False if no downstream reader relies on that column.
json_output_path = "../output/companiesSF.json"
csv_output_path = "../output/companiesSF.csv"
companiesSF.to_json(json_output_path, orient="records")
companiesSF.to_csv(csv_output_path)