# Case Study Ch3 

# Dictionaries for Data Science

In [None]:
# Pair each feature name with the value at the same position in the row.
# zip() is lazy: zipped_lists is a one-shot iterator, consumed by dict() below.
# NOTE(review): feature_names and row_vals are expected to be defined by the
# surrounding environment; they are not created in this file.
zipped_lists = zip(feature_names, row_vals)

# Build a dictionary from the (feature name, value) pairs: rs_dict
rs_dict = dict(zipped_lists)

# Print the dictionary
print(rs_dict)

# Writing a function

In [None]:
# Define lists2dict()
def lists2dict(list1, list2):
    """Build a dictionary by pairing list1 (keys) with list2 (values).

    Items are matched by position. Pairing stops at the end of the
    shorter input, and a repeated key keeps the last value seen.
    """
    # zip() produces the (key, value) pairs that dict() consumes directly
    return dict(zip(list1, list2))

# Build the same feature-name -> value mapping through the helper: rs_fxn
# NOTE(review): feature_names and row_vals come from the surrounding
# environment; they are not defined in this file.
rs_fxn = lists2dict(feature_names, row_vals)

# Print rs_fxn
print(rs_fxn)

# Using a list comprehension

In [None]:
# Show the first two rows of row_lists (each row is a list of raw values)
print(row_lists[0])
print(row_lists[1])

# Printed by the two statements above:
# ['Arab World', 'ARB', 'Adolescent fertility rate (births per 1,000 women ages 15-19)', 'SP.ADO.TFRT', '1960', '133.56090740552298']
# ['Arab World', 'ARB', 'Age dependency ratio (% of working-age population)', 'SP.POP.DPND', '1960', '87.7976011532547']

# For reference, feature_names holds the column labels:
# ['CountryName', 'CountryCode', 'IndicatorName', 'IndicatorCode', 'Year', 'Value']

# Convert every row into a dict keyed by feature name: list_of_dicts
list_of_dicts = [lists2dict(feature_names, row) for row in row_lists]

# Show the first two dictionaries; the recorded output follows each call
print(list_of_dicts[0])
# {'Value': '133.56090740552298', 'IndicatorCode': 'SP.ADO.TFRT', 'CountryCode': 'ARB', 'Year': '1960', 'CountryName': 'Arab World', 'IndicatorName': 'Adolescent fertility rate (births per 1,000 women ages 15-19)'}
print(list_of_dicts[1])
# {'Value': '87.7976011532547', 'IndicatorCode': 'SP.POP.DPND', 'CountryCode': 'ARB', 'Year': '1960', 'CountryName': 'Arab World', 'IndicatorName': 'Age dependency ratio (% of working-age population)'}


# Turning this all into a DataFrame

In [None]:
# Import the pandas package
import pandas as pd

# One dict per row, keyed by feature name: list_of_dicts
list_of_dicts = [lists2dict(feature_names, row) for row in row_lists]

# Each dict becomes a DataFrame row and each key a column: df
df = pd.DataFrame(list_of_dicts)

# Preview the first rows of the DataFrame
print(df.head())

#  CountryCode CountryName   IndicatorCode  \
#    0         ARB  Arab World     SP.ADO.TFRT   
#    1         ARB  Arab World     SP.POP.DPND   
#    2         ARB  Arab World  SP.POP.DPND.OL   
#    3         ARB  Arab World  SP.POP.DPND.YG   
#    4         ARB  Arab World  MS.MIL.XPRT.KD   
    
#                                           IndicatorName               Value  Year  
#    0  Adolescent fertility rate (births per 1,000 wo...  133.56090740552298  1960  
#    1  Age dependency ratio (% of working-age populat...    87.7976011532547  1960  
#    2  Age dependency ratio, old (% of working-age po...   6.634579191565161  1960  
#    3  Age dependency ratio, young (% of working-age ...   81.02332950839141  1960  
#    4        Arms exports (SIPRI trend indicator values)           3000000.0  1960

# Processing data in chunks (1)

Sometimes, data sources can be so large in size that storing the entire dataset in memory becomes too resource-intensive. In this exercise, you will process the first 1000 rows of a file line by line, to create a dictionary of the counts of how many times each country appears in a column in the dataset.

In [None]:
# csv.reader parses quoted fields correctly, so a first-column value that
# contains a comma inside quotes is not cut in half the way str.split(',')
# would cut it. islice caps the number of rows without reading past EOF.
import csv
from itertools import islice

# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Open a connection to the file (newline='' is what the csv module expects)
with open('world_dev_ind.csv', newline='') as file:

    # Parse the file lazily, one row at a time
    reader = csv.reader(file)

    # Skip the column names (the None default tolerates an empty file)
    next(reader, None)

    # Process only the first 1000 rows. If the file is shorter, the loop
    # simply ends instead of counting empty strings read past the end.
    for row in islice(reader, 1000):

        # Ignore blank lines, which csv.reader returns as empty lists
        if not row:
            continue

        # Get the value for the first column: first_col
        first_col = row[0]

        # Increment the count, starting from 0 the first time a value is seen
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1

# Print the resulting dictionary
print(counts_dict)

#    {'Europe & Central Asia (developing only)': 89, 'Euro area': 119, 'East Asia & Pacific (developing only)': 123, 'European Union': 116, 'Central Europe and the Baltics': 71, 'East Asia & Pacific (all income levels)': 122, 'Fragile and conflict affected situations': 76, 'Heavily indebted poor countries (HIPC)': 18, 'Arab World': 80, 'Europe & Central Asia (all income levels)': 109, 'Caribbean small states': 77}

# Writing a generator to load data in chunks (2)

In the previous exercise, you processed a file line by line for a given number of lines. What if, however, you want to do this for the entire file?

In this case, it would be useful to use generators. Generators allow users to lazily evaluate data. This concept of lazy evaluation is useful when you have to deal with very large datasets because it lets you generate values in an efficient manner by yielding only chunks of data at a time instead of the whole thing at once.

In this exercise, you will define a generator function read_large_file() that produces a generator object which yields a single line from a file each time next() is called on it. The csv file 'world_dev_ind.csv' is in your current directory for your use.

Note that when you open a connection to a file, the resulting file object is already an iterator that lazily yields one line at a time (it behaves much like a generator, although strictly speaking it is not one)! So out in the wild, you won't have to explicitly create generator objects in cases such as this. However, for pedagogical reasons, we are having you practice how to do this here with the read_large_file() function.

In [None]:
# Define read_large_file()
def read_large_file(file_object):
    """Lazily yield the lines of file_object, one per next() call.

    Lines are fetched with file_object.readline() only when requested.
    The generator finishes as soon as readline() returns a falsy value,
    which is how end-of-file is signalled.
    """
    # Prime the loop with the first line
    data = file_object.readline()

    # Keep going for as long as readline() returns something
    while data:
        # Hand the current line to the caller
        yield data

        # Fetch the next line only after the caller resumes the generator
        data = file_object.readline()

# Keep the file open only while the generator is being consumed
with open('world_dev_ind.csv') as file:

    # Wrap the open file in the line generator: gen_file
    gen_file = read_large_file(file)

    # Pull and print the first three lines, one next() call per line
    for line_no in range(3):
        print(next(gen_file))
    
# output:
#    CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
#    
#    Arab World,ARB,"Adolescent fertility rate (births per 1,000 women ages 15-19)",SP.ADO.TFRT,1960,133.56090740552298
#    
#    Arab World,ARB,Age dependency ratio (% of working-age population),SP.POP.DPND,1960,87.7976011532547


# NOTE: since a file object is already a lazy iterator over its lines, you don't have to explicitly create a generator object with
# your read_large_file() function. However, it is still good to practice how to create generators - well done!

# Writing a generator to load data in chunks (3)

Great! You've just created a generator function that you can use to help you process large files.

Now let's use your generator function to process the World Bank dataset like you did previously. You will process the file line by line, to create a dictionary of the counts of how many times each country appears in a column in the dataset. For this exercise, however, you won't process just 1000 rows of data, you'll process the entire dataset!

In [None]:
# csv.reader accepts any iterable of lines, so it can consume the generator
# from read_large_file() and still parse quoted fields correctly.
import csv

# Initialize an empty dictionary: counts_dict
counts_dict = {}

# Open a connection to the file (newline='' is what the csv module expects)
with open('world_dev_ind.csv', newline='') as file:

    # Parse the lines produced lazily by read_large_file()
    reader = csv.reader(read_large_file(file))

    # Skip the column names so the header is not counted as a country
    # (the None default tolerates an empty file)
    next(reader, None)

    # Iterate over every remaining row of the file
    for row in reader:

        # Ignore blank lines, which csv.reader returns as empty lists
        if not row:
            continue

        first_col = row[0]

        # Increment the count, starting from 0 the first time a value is seen
        counts_dict[first_col] = counts_dict.get(first_col, 0) + 1

# Print
print(counts_dict)

# Sample output (key order may differ; the header row is no longer counted):
# {'Latin America & Caribbean (developing only)': 133, 'High income: OECD': 127, 'Middle income': 138, 'Low income': 80, 
# 'Small states': 69, 'Caribbean small states': 77, 'European Union': 116, 'OECD members': 130, 
# 'Lower middle income': 126, 'East Asia & Pacific (developing only)': 123, 'Euro area': 119, 
# 'Europe & Central Asia (developing only)': 89, 'South Asia': 36, 'East Asia & Pacific (all income levels)': 122, 
# 'Fragile and conflict affected situations': 76, 'Other small states': 63, 'Europe & Central Asia (all income levels)': 109, 
# 'Central Europe and the Baltics': 71, 'North America': 123, 'Low & middle income': 138, 'High income': 131, 
# 'Heavily indebted poor countries (HIPC)': 99, 'Pacific island small states': 66, 
# 'Least developed countries: UN classification': 78, 'Middle East & North Africa (developing only)': 94, 'Arab World': 80, 
# 'High income: nonOECD': 68, 'Middle East & North Africa (all income levels)': 89, 
# 'Latin America & Caribbean (all income levels)': 130}

# Writing an iterator to load data in chunks (1)

Another way to process data that is too large to hold in memory is to read the file in chunks, as DataFrames of a certain length, say, 100 rows. For example, with the pandas package (imported as pd), you can call pd.read_csv(filename, chunksize=100). This creates a reader object that is an iterator, which means that you can call next() on it to get the next chunk.

In this exercise, you will read a file in small DataFrame chunks with read_csv(). You're going to use the World Bank Indicators data 'ind_pop.csv', available in your current directory, to look at the urban population indicator for numerous countries and years.

In [None]:
# Import the pandas package
import pandas as pd

# Create a chunked reader that yields DataFrames of 10 rows: df_reader
df_reader = pd.read_csv('ind_pop.csv', chunksize=10)

# Pull and print two consecutive chunks from the reader
for chunk_no in range(2):
    print(next(df_reader))

# Writing an iterator to load data in chunks (2)

In the previous exercise, you used read_csv() to read in DataFrame chunks from a large dataset. In this exercise, you will read in a file using a bigger DataFrame chunk size and then process the data from the first chunk.

To process the data, you will create another DataFrame composed of only the rows from a specific country. You will then zip together two of the columns from the new DataFrame, 'Total Population' and 'Urban population (% of total)'. Finally, you will create a list of tuples from the zip object, where each tuple is composed of a value from each of the two columns mentioned.

You're going to use the data from 'ind_pop_data.csv', available in your current directory. Pandas has been imported as pd.

In [None]:
# Create a chunked reader over the file, 1000 rows at a time: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)

# Only the first chunk is used here: df_urb_pop
df_urb_pop = next(urb_pop_reader)

# Preview the chunk
print(df_urb_pop.head())

# Keep only the rows whose country code is 'CEB': df_pop_ceb
is_ceb = df_urb_pop['CountryCode'] == 'CEB'
df_pop_ceb = df_urb_pop[is_ceb]

# Pair the two columns of interest row by row: pops
total_pop = df_pop_ceb['Total Population']
urban_pct = df_pop_ceb['Urban population (% of total)']
pops = zip(total_pop, urban_pct)

# Materialize the pairs as a list of tuples: pops_list
pops_list = list(pops)

# Show the (total population, urban %) tuples
print(pops_list)

# Writing an iterator to load data in chunks (3)

You're getting used to reading and processing data in chunks by now. Let's push your skills a little further by adding a column to a DataFrame.

Starting from the code of the previous exercise, you will be using a list comprehension to create the values for a new column 'Total Urban Population' from the list of tuples that you generated earlier. Recall from the previous exercise that the first and second elements of each tuple consist of, respectively, values from the columns 'Total Population' and 'Urban population (% of total)'. The values in this new column 'Total Urban Population', therefore, are the product of the first and second element in each tuple. Furthermore, because the 2nd element is a percentage, you need to divide the entire result by 100, or alternatively, multiply it by 0.01.

You will also plot the data from this new column to create a visualization of the urban population data.

The packages pandas and matplotlib.pyplot have been imported as pd and plt respectively for your use.

In [None]:
# Code from previous exercise
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)
df_urb_pop = next(urb_pop_reader)

# df_urb_pop
#                                         CountryName CountryCode  Year  \
# 0                                        Arab World         ARB  1960   
# 1                            Caribbean small states         CSS  1960   
# 2                    Central Europe and the Baltics         CEB  1960   
# 3           East Asia & Pacific (all income levels)         EAS  1960   
# 4             East Asia & Pacific (developing only)         EAP  1960 

#      Total Population  Urban population (% of total)  
# 0        9.249590e+07                      31.285384  
# 1        4.190810e+06                      31.597490  
# 2        9.140158e+07                      44.507921  
# 3        1.042475e+09                      22.471132  
# 4        8.964930e+08                      16.917679

# ...
# [1000 rows x 5 columns]

# Take an explicit copy of the filtered rows. A column is added below, and
# assigning to a filtered slice would trigger pandas' SettingWithCopyWarning.
df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()

#                        CountryName CountryCode  Year  Total Population  \
# 2    Central Europe and the Baltics         CEB  1960        91401583.0   
# 244  Central Europe and the Baltics         CEB  1961        92237118.0   
# 486  Central Europe and the Baltics         CEB  1962        93014890.0   
# 728  Central Europe and the Baltics         CEB  1963        93845749.0   
# 970  Central Europe and the Baltics         CEB  1964        94722599.0   

#      Urban population (% of total)  
# 2                        44.507921  
# 244                      45.206665  
# 486                      45.866565  
# 728                      46.534093  
# 970                      47.208743


pops = zip(df_pop_ceb['Total Population'],
           df_pop_ceb['Urban population (% of total)'])
pops_list = list(pops)

# pops_list
#[(91401583.0, 44.5079211390026),
# (92237118.0, 45.206665319194),
# (93014890.0, 45.866564696018),
# (93845749.0, 46.5340927663649),
# (94722599.0, 47.2087429803526)]

# Use list comprehension to create new DataFrame column 'Total Urban Population'.
# Each tuple in pops_list is (total population, urban population in %), so the
# product is scaled by 0.01 to turn the percentage into a fraction; int()
# truncates the result to a whole number of people.
df_pop_ceb['Total Urban Population'] = [
    int(total * pct_urban * 0.01) for total, pct_urban in pops_list
]

# Plot urban population data
df_pop_ceb.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()

# Writing an iterator to load data in chunks (4)

In the previous exercises, you've only processed the data from the first DataFrame chunk. This time, you will aggregate the results over all the DataFrame chunks in the dataset. This basically means you will be processing the entire dataset now. This is neat because you're going to be able to process the entire large dataset by just working on smaller pieces of it!

You're going to use the data from 'ind_pop_data.csv', available in your current directory. The packages pandas and matplotlib.pyplot have been imported as pd and plt respectively for your use.

In [None]:
# Initialize reader object: urb_pop_reader
urb_pop_reader = pd.read_csv('ind_pop_data.csv', chunksize=1000)

# Collect the processed chunks in a list and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copied all accumulated rows on every iteration.
processed_chunks = []

# Iterate over each DataFrame chunk
for df_urb_pop in urb_pop_reader:

    # Check out specific country: df_pop_ceb
    # .copy() makes the filtered rows an independent DataFrame, so adding a
    # column below does not trigger SettingWithCopyWarning.
    df_pop_ceb = df_urb_pop[df_urb_pop['CountryCode'] == 'CEB'].copy()

    # Zip DataFrame columns of interest: pops
    pops = zip(df_pop_ceb['Total Population'],
               df_pop_ceb['Urban population (% of total)'])

    # Turn zip object into list: pops_list
    pops_list = list(pops)

    # Use list comprehension to create new DataFrame column 'Total Urban Population'
    # (population * percentage * 0.01, truncated to a whole number)
    df_pop_ceb['Total Urban Population'] = [
        int(total * pct_urban * 0.01) for total, pct_urban in pops_list
    ]

    # Keep this chunk's rows for the final concatenation
    processed_chunks.append(df_pop_ceb)

# Combine all chunks into one DataFrame: data
# (falls back to an empty DataFrame when the reader produced no chunks,
# because pd.concat raises on an empty list)
data = pd.concat(processed_chunks) if processed_chunks else pd.DataFrame()

# Plot urban population data
data.plot(kind='scatter', x='Year', y='Total Urban Population')
plt.show()

# Writing an iterator to load data in chunks (5)

This is the last leg. You've learned a lot about processing a large dataset in chunks. In this last exercise, you will put all the code for processing the data into a single function so that you can reuse the code without having to rewrite the same things all over again.

You're going to define the function plot_pop() which takes two arguments: the filename of the file to be processed, and the country code of the rows you want to process in the dataset.

Because all of the previous code you've written in the previous exercises will be housed in plot_pop(), calling the function already does the following:

    Loading of the file chunk by chunk,
    Creating the new column of urban population values, and
    Plotting the urban population data.

That's a lot of work, but the function now makes it convenient to repeat the same process for whatever file and country code you want to process and visualize!

You're going to use the data from 'ind_pop_data.csv', available in your current directory. The packages pandas and matplotlib.pyplot have been imported as pd and plt respectively for your use.

After you are done, take a moment to look at the plots and reflect on the new skills you have acquired. The journey doesn't end here! If you have enjoyed working with this data, you can continue exploring it using the pre-processed version available on Kaggle.

In [None]:
# Define plot_pop()
def plot_pop(filename, country_code):
    """Plot the total urban population per year for one country code.

    The CSV file is read in chunks of 1000 rows so the whole file never has
    to be held in memory at once. For each chunk, the rows matching
    country_code are kept and a 'Total Urban Population' column is computed
    from 'Total Population' and 'Urban population (% of total)'. The
    per-chunk results are combined and shown as a scatter plot against 'Year'.

    Args:
        filename: Path of the CSV file to read. It must contain the columns
            'CountryCode', 'Year', 'Total Population' and
            'Urban population (% of total)'.
        country_code: Value of the 'CountryCode' column to select.

    Note:
        Relies on pandas being available as pd and matplotlib.pyplot as plt
        in the enclosing namespace.
    """
    # Initialize reader object: urb_pop_reader
    urb_pop_reader = pd.read_csv(filename, chunksize=1000)

    # Collect the processed chunks and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copied all accumulated rows on every iteration.
    processed_chunks = []

    # Iterate over each DataFrame chunk
    for df_urb_pop in urb_pop_reader:
        # Select the requested country. .copy() makes the result an
        # independent DataFrame, so adding a column below does not trigger
        # SettingWithCopyWarning.
        df_pop = df_urb_pop[df_urb_pop['CountryCode'] == country_code].copy()

        # Pair total population with urban percentage, row by row
        pops_list = list(zip(df_pop['Total Population'],
                             df_pop['Urban population (% of total)']))

        # population * percentage * 0.01, truncated to a whole number
        df_pop['Total Urban Population'] = [
            int(total * pct_urban * 0.01) for total, pct_urban in pops_list
        ]

        # Keep this chunk's rows for the final concatenation
        processed_chunks.append(df_pop)

    # Combine all chunks; fall back to an empty DataFrame when the reader
    # produced no chunks, because pd.concat raises on an empty list
    data = pd.concat(processed_chunks) if processed_chunks else pd.DataFrame()

    # Plot urban population data
    data.plot(kind='scatter', x='Year', y='Total Urban Population')
    plt.show()

# Set the filename once so both calls below read the same file: fn
fn = 'ind_pop_data.csv'

# Call plot_pop for country code 'CEB'
plot_pop(fn, 'CEB')

# Call plot_pop for country code 'ARB'
plot_pop(fn, 'ARB')