In [1]:
import pandas as pd

In [2]:
# Locations of the input and output files used by this notebook.
# All paths are relative, so they resolve against the kernel's working directory.

# Input: raw country/city CSV that the loading cell reads
raw_country_cities_file = '../data/raw/raw_country_cities.csv'
# Output: flat table with one Country/City pair per row
parsed_country_cities_file = '../data/parsed/parsed_country_cities.csv'
# Output: grouped table with one row per country and its list of cities
parsed_country_cities_grouped_file = '../data/parsed/parsed_country_cities_grouped.csv'

In [3]:
# Load the raw city list and discard every row that has a missing value in
# any column.
# NOTE(review): dropna() runs on the full-width frame, i.e. before the unused
# columns (Population, Latitude, ...) are removed further down, so rows that
# only lack one of those fields are filtered out as well - confirm intended.
country_cities_df = (
    pd.read_csv(
        raw_country_cities_file,
        index_col=False,
        encoding='ISO-8859-1',
        low_memory=False,
    )
    .dropna()
)

In [4]:
# Keep only the country code and the city name, and normalise the country
# codes in the same pass.
country_cities_df = (
    country_cities_df
    .drop(columns=['AccentCity', 'Region', 'Population', 'Latitude', 'Longitude'])
    # Country codes: trim surrounding whitespace and upper-case them
    .assign(Country=lambda frame: frame['Country'].map(lambda code: code.strip().upper()))
)
country_cities_df.head()

Unnamed: 0,Country,City
6,AD,andorra la vella
20,AD,canillo
32,AD,encamp
49,AD,la massana
53,AD,les escaldes


In [5]:
# Persist the flat (one Country/City pair per row) frame.
# NOTE(review): the content is gzip-compressed while the target path ends in
# '.csv', so readers that infer compression from the file extension will
# likely need an explicit compression='gzip' - confirm this is intended.
country_cities_df.to_csv(
    parsed_country_cities_file,
    index=False,
    encoding='utf-8',
    compression='gzip',
)

In [6]:
# Build a mapping {country code: {'Cities': [city, ...]}}. The nested
# {'Cities': ...} wrapper is the shape the DataFrame conversion cell expects,
# so every country ends up with a single 'Cities' column holding its list.
#
# A single groupby pass replaces the former row-by-row loop, which rebuilt
# each country's list on every row (quadratic per country). sort=False keeps
# the countries in order of first appearance, and rows keep their original
# order inside each group, so the resulting dict matches the loop's output.
cities_by_country = (
    country_cities_df
    # Same str() coercion the loop applied to both columns
    .astype({'Country': str, 'City': str})
    .groupby('Country', sort=False)['City']
)
country_cities_dict = {
    country: {'Cities': cities.tolist()}
    for country, cities in cities_by_country
}

In [7]:
# Turn the dict into a DataFrame indexed by country code, then move that
# index into a regular 'Country' column so the frame is ready for CSV storage.
country_cities_df = (
    pd.DataFrame.from_dict(country_cities_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'Country'})
)
country_cities_df.head()

Unnamed: 0,Country,Cities
0,AD,"[andorra la vella, canillo, encamp, la massana..."
1,AE,"[abu dhabi, dubai, sharjah]"
2,AF,"[acin, anar darreh, andarab, asadabad, asmar, ..."
3,AG,"[all saints, carlisle, codrington, freetown, l..."
4,AI,[the valley]


In [8]:
# Persist the grouped (one row per country) frame.
# NOTE(review): the content is gzip-compressed while the target path ends in
# '.csv', so readers that infer compression from the file extension will
# likely need an explicit compression='gzip' - confirm this is intended.
# NOTE(review): each 'Cities' cell holds a Python list, which presumably ends
# up in the file as its string representation - readers have to parse it back.
country_cities_df.to_csv(
    parsed_country_cities_grouped_file,
    index=False,
    encoding='utf-8',
    compression='gzip',
)