In [None]:
####################################################################################################################
# Esther Lowe - Random Cities Generator - 06/22/2019                                                               #
#                                                                                                                  #
# This program generates a randomized list of over 8000 cities with their geoname IDs, latitudes, and longitudes.  #
# Data for this project comes from a MAXMIND free database: https://dev.maxmind.com/geoip/geoip2/geolite2/         #
####################################################################################################################

In [1]:
import csv
import os
import pandas as pd

# Load the GeoLite2 city-locations CSV into a DataFrame.
# (A stray bare `random_cities.csv` expression used to sit here; it raised a NameError
# on a fresh kernel because no `random_cities` name exists at this point, so it was removed.)
cities_csv = "Resources/GeoLite2-City-Locations-en.csv"
cities_df = pd.read_csv(cities_csv)

# Preview the first rows to confirm the file parsed as expected.
cities_df.head()

Unnamed: 0,geoname_id,locale_code,continent_code,continent_name,country_iso_code,country_name,subdivision_1_iso_code,subdivision_1_name,subdivision_2_iso_code,subdivision_2_name,city_name,metro_code,time_zone,is_in_european_union
0,5822,en,EU,Europe,CY,Cyprus,02,Limassol,,,Zanakia,,Asia/Nicosia,1
1,49518,en,AF,Africa,RW,Rwanda,,,,,,,Africa/Kigali,0
2,49747,en,AF,Africa,SO,Somalia,BK,Bakool,,,Oddur,,Africa/Mogadishu,0
3,51537,en,AF,Africa,SO,Somalia,,,,,,,Africa/Mogadishu,0
4,53654,en,AF,Africa,SO,Somalia,BN,Banaadir,,,Mogadishu,,Africa/Mogadishu,0


In [7]:
# Narrow the cities table to the two columns used downstream: the geoname ID and the city name.

cities_df = cities_df.loc[:, ["geoname_id", "city_name"]]
cities_df.head()

Unnamed: 0,geoname_id,city_name
0,5822,Zanakia
1,49518,
2,49747,Oddur
3,51537,
4,53654,Mogadishu


In [8]:
# Read the GeoLite2 IPv6 city-blocks CSV, which carries latitude/longitude per geoname_id.

lat_long_csv = "Resources/GeoLite2-City-Blocks-IPv6.csv"
lat_long_df = pd.read_csv(filepath_or_buffer=lat_long_csv)

# Preview the first rows.
lat_long_df.head()

Unnamed: 0,network,geoname_id,registered_country_geoname_id,represented_country_geoname_id,is_anonymous_proxy,is_satellite_provider,postal_code,latitude,longitude,accuracy_radius
0,9c:1e::95:69:9d:41/128,4704203,,,0,0,75770,32.1608,-95.5927,100
1,2000:db8::/32,5332921,,,0,0,93614,37.2502,-119.7513,100
2,2001:200::/40,1861060,1861060.0,,0,0,,36.0,138.0,100
3,2001:200:100::/41,1848354,1861060.0,,0,0,223-0051,35.5475,139.6445,1
4,2001:200:180::/42,1861060,1861060.0,,0,0,,36.0,138.0,100


In [9]:
# Keep just the join key and the coordinates; every other column is discarded.

lat_long_df = lat_long_df[
    [
        "geoname_id",
        "latitude",
        "longitude",
    ]
]
lat_long_df.head()

Unnamed: 0,geoname_id,latitude,longitude
0,4704203,32.1608,-95.5927
1,5332921,37.2502,-119.7513
2,1861060,36.0,138.0
3,1848354,35.5475,139.6445
4,1861060,36.0,138.0


In [20]:
# Outer-join city names with coordinates on "geoname_id".
# An outer join keeps IDs that appear in only one of the two tables, leaving NaN in the missing columns.
combined_data_df = cities_df.merge(
    lat_long_df,
    on='geoname_id',
    how='outer',
)
combined_data_df.head()


Unnamed: 0,geoname_id,city_name,latitude,longitude
0,5822,Zanakia,,
1,49518,,-2.0,30.0
2,49518,,-2.0,30.0
3,49518,,-2.0,30.0
4,49518,,-2.0,30.0


In [21]:
# Keep only fully populated rows: any row with a missing value in any column
# (e.g. no city_name, latitude, or longitude) is discarded. Then report how many rows remain.

combined_cities_df = combined_data_df.dropna(axis=0, how="any")
print(combined_cities_df.shape[0])
combined_cities_df.head()

151348


Unnamed: 0,geoname_id,city_name,latitude,longitude
79,98182,Baghdad,33.3406,44.4009
88,99072,Mosul,36.335,43.1189
304,105343,Jeddah,21.5168,39.2192
305,105343,Jeddah,21.5168,39.2192
306,105343,Jeddah,21.5168,39.2192


In [29]:
# Drop rows that are exact duplicates across every column, then reset the index so it runs 0..n-1.
# NOTE(review): rows sharing a geoname_id but differing in latitude/longitude are NOT collapsed here.
consolidated_cities_df = (
    combined_cities_df
    .drop_duplicates()
    .reset_index(drop=True)
)

consolidated_cities_df.head()

Unnamed: 0,geoname_id,city_name,latitude,longitude
0,98182,Baghdad,33.3406,44.4009
1,99072,Mosul,36.335,43.1189
2,105343,Jeddah,21.5168,39.2192
3,106281,Ha'il,27.5219,41.6907
4,107797,Dhahran,26.3032,50.1353


In [36]:
# The consolidated cities DataFrame had 34,773 rows in the original run (see the printed length).
# Only a quarter of that data is carried forward.
print(len(consolidated_cities_df))

# Fixed seed so that Restart & Run All reproduces the same selection.
# Set RANDOM_SEED to None to draw a fresh random sample on every run.
RANDOM_SEED = 42
SAMPLE_FRACTION = 1 / 4

# Draw a random quarter of the rows (a fraction, not a fixed count) and reset the index:
random_cities_df = (
    consolidated_cities_df
    .sample(frac=SAMPLE_FRACTION, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)
random_cities_df.head(50)

34773


Unnamed: 0,geoname_id,city_name,latitude,longitude
0,2111149,Sendai,38.2661,140.8496
1,2111325,Otawara,36.8629,140.0183
2,3029974,Brive-la-Gaillarde,45.15,1.5333
3,3049896,Kiskoros,46.6214,19.2853
4,4392388,Jefferson City,38.5462,-92.1525
5,4160021,Jacksonville,30.1548,-81.63
6,5805734,Omak,48.4143,-119.5272
7,5257754,Janesville,42.6713,-89.007
8,6101321,Peribonka,48.7668,-72.049
9,4509177,Columbus,39.9046,-82.9703


In [37]:
# Sanity check: print the row count of the random sample to confirm it is small enough
# to work with in a reasonable amount of time.
print(random_cities_df.shape[0])

8693


In [38]:
# Write the sampled cities to disk as UTF-8 CSV, leaving out the DataFrame index column.

random_cities_df.to_csv(
    "Resources/random_cities.csv",
    index=False,
    encoding="utf-8",
)