# Code Workshop

# Setup and Import Libraries

First, let's import all of the libraries we'll be using for this notebook.

In [None]:
# If running on Google Colab, run this cell to install the third-party packages
# (pywaffle, wordcloud, folium) that this notebook imports in the next cell.
%pip install pywaffle wordcloud folium

In [None]:
import urllib
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded

import folium
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pywaffle import Waffle
from wordcloud import WordCloud, STOPWORDS

%matplotlib inline

# Optional: Theme Matplotlib

You can theme all of your matplotlib plots using the below code. Let's change it to `Solarize_Light2`!

In [None]:
# Apply the 'Solarize_Light2' style sheet to the matplotlib figures created from here on
mpl.style.use('Solarize_Light2')

# Prepare Data

For most of this notebook, we'll be using the *Immigration to Canada from 1980 to 2013* dataset from the Coursera labs.

In [None]:
# Read the data into a dataframe, using the 'Country' column as the index
df = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/Data%20Files/Canada.csv',
                 index_col='Country')

# The years 1980 through 2013 as strings; this list is used to select the year columns throughout this notebook.
years = [str(year) for year in range(1980, 2014)]

# A single (width, height) figure size shared by every plot below.
# Change it here and every figure in the notebook changes with it.
fsize = (11, 7)

# Line Plot

Let's plot a line plot of the immigration to Canada from Afghanistan.

In [None]:
# Pull out the yearly immigration figures for Afghanistan
afghanistan = df.loc['Afghanistan', years]

# Create the plot. plot() returns the matplotlib Axes it drew on,
# which we then use to add the title and the axis labels.
ax = afghanistan.plot(kind='line', figsize=fsize)
ax.set_title('Immigration from Afghanistan to Canada 1980-2013')
ax.set_xlabel('Year')
ax.set_ylabel('Immigrants')
# NOTE: You can also do this all as one line of code, by setting arguments in the plot() function. For example:
# afghanistan.plot(kind='line', title='Immigration from Afghanistan to Canada 1980-2013', xlabel='Year', ylabel='Immigrants')

plt.show()  # Show the plot. Don't forget this part!

# Area Plot

Let's plot the top 5 countries in an area plot.

In [None]:
 # Sort the dataframe by the 'Total' column in descending order, then get the top 5 entries
df_top5 = (
    df.sort_values(by='Total', ascending=False)
      .head(5)          # the 5 rows with the largest 'Total'
      .loc[:, years]    # keep only the year columns
      .transpose()      # years become the rows, countries become the columns
)

# The year labels are strings; convert the index to integers for plotting
df_top5.index = df_top5.index.map(int)

# Stacked area plot (stacking is the default for kind='area')
ax = df_top5.plot(kind='area', figsize=fsize)
ax.set_title('Immigration Trend of Top 5 Countries')
ax.set_ylabel('Number of Immigrants')
ax.set_xlabel('Years')
plt.show()

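You can also plot the data unstacked. Each country's area is then drawn from zero on its own instead of being piled on top of the others, so the areas overlap (which is why we make them transparent).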

In [None]:
# df.plot.area(...) is the accessor form of df.plot(kind='area', ...)
ax = df_top5.plot.area(
    stacked=False,   # draw every country from zero instead of stacking them
    alpha=0.25,      # Changes the transparency of the plot
    figsize=fsize,
)
ax.set_title('Immigration Trend of Top 5 Countries')
ax.set_ylabel('Number of Immigrants')
ax.set_xlabel('Years')

plt.show()

# Histogram

Let's plot a histogram of the immigration from Denmark, Norway and Sweden for 1980-2013.

In [None]:
# Number of bins, defined once so the bin edges and the plot always agree.
# Change it to change the granularity of the histogram (pandas' default is 10 bins).
n_bins = 15

# Pull the data for the three countries and transpose it (years as rows, countries as columns)
df_dns = df.loc[['Denmark', 'Norway', 'Sweden'], years].T

# np.histogram gives us the bin edges (the range of values that falls in each bin),
# which we use as the x ticks.
count, bin_edges = np.histogram(df_dns, bins=n_bins)

ax = df_dns.plot(kind='hist', bins=n_bins, figsize=fsize, xticks=bin_edges)
ax.set_title('Histogram of Immigration from Denmark, Norway, and Sweden from 1980 - 2013')
ax.set_ylabel('Number of Years')
ax.set_xlabel('Number of Immigrants')

plt.show()

# Bar Charts and Chart Annotation

Let's make a bar chart of Iceland's immigration from 1980-2013 and annotate it to indicate the Icelandic financial crisis of 2008-2011.

In [None]:
# Yearly immigration figures for Iceland
df_iceland = df.loc['Iceland', years]

# Bar charts are pretty dang easy to plot. Keep the returned Axes so we can label and annotate it.
ax = df_iceland.plot(kind='bar', figsize=fsize)
ax.set_xlabel('Year')
ax.set_ylabel('Number of immigrants')
ax.set_title('Icelandic immigrants to Canada from 1980 to 2013')

# You can annotate charts using annotate(). The x coordinates used below are bar positions
# counted from the first bar (1980 = 0), so x=28 is the 2008 bar and x=32 is the 2012 bar.

# Annotate arrow: the text is left blank so only the arrow is drawn
arrow_style = dict(arrowstyle='->', connectionstyle='arc3', color='blue', lw=2)
ax.annotate('',
            xy=(32, 70),       # head of the arrow at point (year 2012, pop 70)
            xytext=(28, 20),   # base of the arrow at point (year 2008, pop 20)
            xycoords='data',   # use the coordinate system of the object being annotated
            arrowprops=arrow_style)

# Annotate text
ax.annotate('2008 - 2011 Financial Crisis',
            xy=(28, 30),       # start the text at point (year 2008, pop 30)
            rotation=72.5,     # found by trial and error to match the arrow
            va='bottom',       # vertically 'bottom' aligned
            ha='left')         # horizontally 'left' aligned

plt.show()

You can also plot a bar chart horizontally. Let's chart the top 15 countries by total immigration to Canada.

In [None]:
# The 15 largest values of "Total", largest first
df_top15 = df['Total'].sort_values(ascending=False).head(15)

# kind='barh' draws the bars horizontally
ax = df_top15.plot(kind='barh', figsize=fsize)
ax.set_xlabel('Number of Immigrants')
ax.set_title('Top 15 Countries Contributing to the Immigration to Canada between 1980 - 2013')

# Now, let's write each total on its bar using annotate().
for bar_pos, total in enumerate(df_top15):
    label = f'{int(total):,}'  # integer with thousands separators
    # place the text near the end of the bar (subtracting 47000 from x, and 0.1 from y to make it fit within the bar)
    ax.annotate(label, xy=(total - 47000, bar_pos - 0.10), color='white', fontsize=10)

plt.show()

# Pie Chart

Let's plot the proportions of immigrants to Canada from 1980-2013 by continent in a pie chart.

In [None]:
# Group countries by continent and add up the numbers.
# numeric_only=True limits the aggregation to numeric columns, so any text columns
# in the dataset are left out instead of being "summed".
df_continents = df.groupby('Continent').sum(numeric_only=True)

# We can offset ("explode") some wedges of the pie chart to make them clearer.
# Here the three continents with the smallest totals are pulled out by a ratio of 0.1;
# computing the list from the data keeps it in step with the number and order of continents.
smallest_three = df_continents['Total'].sort_values().index[:3]
explode_list = [0.1 if continent in smallest_three else 0 for continent in df_continents.index]

ax = df_continents['Total'].plot(kind='pie',
                                 figsize=fsize,
                                 autopct='%1.1f%%',    # add in percentages
                                 startangle=90,        # start angle 90°
                                 labels=None,          # turn off labels on pie chart
                                 pctdistance=1.12,     # distance of the autopct text from the center, as a ratio of the radius
                                 explode=explode_list  # 'explode' lowest 3 continents
                                 )

ax.set_title('Immigration to Canada by Continent [1980 - 2013]')
ax.axis('equal')  # Sets the pie chart to look like a circle.
ax.legend(labels=df_continents.index, loc='upper left')

plt.show()

# Box Plot

Let's plot the box plot for the Japanese immigrants between 1980 - 2013.

In [None]:
# Yearly figures for Japan, 1980-2013, as a one-column dataframe (years as rows)
df_japan = df.loc[['Japan'], years].T

# df.plot.box(...) is the accessor form of df.plot(kind='box', ...)
ax = df_japan.plot.box(figsize=fsize)
ax.set_title('Box plot of Japanese Immigrants from 1980 - 2013')
ax.set_ylabel('Number of Immigrants')

plt.show()

# Scatter Plot

Let's plot a scatter plot for all immigration to Canada from 1980-2013.

In [None]:
# Sum every year column to get the total for each year
yearly_totals = df[years].sum()

# Change the years to type int (makes the labels easier to read)
yearly_totals.index = yearly_totals.index.map(int)

# Move the years out of the index into a regular column, then name both columns
df_tot = yearly_totals.reset_index()
df_tot.columns = ['year', 'total']

ax = df_tot.plot.scatter(x='year', y='total', figsize=fsize)
ax.set_title('Total Immigration to Canada from 1980 - 2013')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Immigrants')

plt.show()

# Bubble Plot

Let's create a bubble plot of the immigration from Brazil to Canada from 1980-2013.

In [None]:
# Step 1: Prepare the data
df_t = df[years].T                     # transpose so the years become the index
df_t.index = df_t.index.map(int)       # cast the years (the index) to type int
df_t.index.name = 'Year'               # name the index 'Year' to make it easier to reference
df_t = df_t.reset_index()              # bring the Year in as a regular column

# Step 2: Min-max normalize the Brazil column to the range [0, 1]; this drives the bubble sizes
brazil = df_t['Brazil']
norm_brazil = (brazil - brazil.min()) / (brazil.max() - brazil.min())

# Step 3: Scatter plot whose marker sizes come from the normalized values
ax = df_t.plot.scatter(
    x='Year',
    y='Brazil',
    figsize=fsize,
    alpha=0.5,                    # transparency
    s=norm_brazil * 2000 + 10,    # marker sizes (the +10 keeps the smallest bubble visible)
    xlim=(1975, 2015),
)
ax.set_ylabel('Number of Immigrants')
ax.set_title('Immigration from Brazil from 1980 to 2013')
ax.legend(['Brazil'], loc='upper left', fontsize='x-large')

plt.show()

# Waffle Chart

Let's create a waffle chart showing the proportion of immigration from Denmark, Norway and Sweden.

In [None]:
# New dataframe holding only the rows for these three countries
df_dsn = df.loc[['Denmark', 'Norway', 'Sweden']]

# One legend entry per country, in the form "Country (total)"
legend_labels = [f"{country} ({total})" for country, total in df_dsn['Total'].items()]
legend_options = {'labels': legend_labels,
                  'loc': 'lower left', 'bbox_to_anchor': (0, -0.1), 'ncol': 3}

plt.figure(
    FigureClass=Waffle,
    rows=20, columns=30,        # the number of rows and columns for the waffle
    values=df_dsn['Total'],     # the data to be displayed
    legend=legend_options,
    figsize=fsize,
)
plt.title("Immigration from Denmark, Norway and Sweden 1980-2013")
plt.show()

# Word Cloud

Let's make a word cloud of all the text from the classic novel "Pride and Prejudice"!

In [None]:
# First, let's download the e-book from Project Gutenberg and store its text in a variable.
# The `with` block closes the HTTP response as soon as the text has been read.
with urllib.request.urlopen('https://www.gutenberg.org/cache/epub/1342/pg1342.txt') as response:
    pride_novel = response.read().decode("utf-8")

In [None]:
# Set the stopwords up! Start from the default set and add "said" to it.
stopwords = STOPWORDS | {"said"}

# Create the WordCloud object and generate the cloud from the novel's text in one go
# (generate() returns the WordCloud object itself).
# PROTIP: width and height set the size of the rendered image in pixels, so it won't get
# as pixelated as it did in Coursera when you make the figure bigger.
# If you're using a custom set of stop words, you have to pass it in via `stopwords`.
# Strangely enough, Coursera forgot to do this.
pride_wc = WordCloud(width=1100, height=700, stopwords=stopwords).generate(pride_novel)

# Display the Word Cloud as an image, with the axes hidden
fig, ax = plt.subplots(figsize=fsize)
ax.imshow(pride_wc, interpolation='bilinear')
ax.axis('off')
plt.show()

# Regression Plot

With seaborn, you can plot regression plots easily. Let's do one using the data `df_tot` from the scatter plot we did earlier, and customize it a bit for good measure.

In [None]:
# Create the figure and Axes ourselves, then tell seaborn to draw on that Axes
fig, ax = plt.subplots(figsize=fsize)

sns.regplot(data=df_tot,
            x='year',
            y='total',
            color='red',
            marker='+',  # Set the marker. This has a bunch of options, you can check them out at https://seaborn.pydata.org/tutorial/properties.html#marker
            scatter_kws={'s': 200},  # This just adjusts the size of the markers.
            ax=ax)

ax.set_xlabel('Year')
ax.set_ylabel('Total Immigration')
ax.set_title('Total Immigration to Canada from 1980 - 2013')

plt.show()

# Geospatial Maps

In [None]:
map_center = [56.130, -106.35]  # [latitude, longitude] the map is centered on

world_map = folium.Map(
    location=map_center,
    zoom_start=4,           # The default zoom level of the map
    tiles='OpenStreetMap',  # The theme of the map. Can also be "Cartodb Positron" or "Cartodb dark_matter"
)

# A map object as the last expression of a cell is rendered inline
world_map

You can also add markers to a map:

In [None]:
# Get the data needed for markers.
# Only keep the first 100 crimes, otherwise drawing the markers will take forever.
df_incidents = pd.read_csv('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/Data%20Files/Police_Department_Incidents_-_Previous_Year__2016_.csv').head(100)

# Create the map centered around San Francisco
sanfran_map = folium.Map(location=[37.77, -122.42], zoom_start=12)

# A feature group collects all of the incident markers into a single layer
incidents = folium.FeatureGroup()

# Add one circle marker per crime; the Y column is used as the latitude and X as the longitude
for lat, lng in zip(df_incidents['Y'], df_incidents['X']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,           # how big you want the circle markers to be
        color='yellow',     # the color of the circle markers' outline
        fill=True,          # whether you want the circle markers to be filled with color
        fill_color='blue',  # the fill color of the circle markers
        fill_opacity=0.6,   # the opacity of the fill
    ).add_to(incidents)

# Add the incidents layer to the map, then display the map
incidents.add_to(sanfran_map)
sanfran_map


# Choropleth Maps

Let's generate a choropleth map showing the total immigration to Canada from each country in the world.

In [None]:
# Download the geojson file and store its text in a variable.
# The `with` block closes the HTTP response as soon as the text has been read.
with urllib.request.urlopen('https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/Data%20Files/world_countries.json') as response:
    world_geo = response.read().decode("utf-8")  # geojson file contents

# Reset the index of the original dataframe and store it in a new one since "Country" needs to be a column, not the index
df_world = df.reset_index()

world_map = folium.Map(location=[0, 0], zoom_start=2)

# Generate the choropleth map using the total immigration of each country to Canada from 1980 to 2013
folium.Choropleth(
    geo_data=world_geo,
    data=df_world,
    columns=['Country', 'Total'],       # [key column, value column] taken from `data`
    key_on='feature.properties.name',   # the geojson field that is matched against the key column
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Immigration to Canada',
    reset=True
).add_to(world_map)

# display map
world_map

Folium is a very complex and powerful tool. For more uses, refer to the Generating Maps in Python lab in Module 4 of the Coursera course.

# Notes

- When plotting a dataframe, you can also use df.plot.*type* to plot it instead of using `kind='type'`. For example, `df.plot.line()`