In [None]:
## CLASS 8 + 9: Advanced Spreadsheet Manipulation

In [None]:
# PART ONE - Advanced CSV Manipulation - The Power of the DataFrame
# If you're familiar with R, this should be familiar

# csv.reader and csv.writer are fine for basic manipulations - but for more advanced, we need pandas!

import pandas as pd # the 'as' just means that we get to call pandas 'pd' in our code

# pandas' read_csv function parses the CSV file at the given path
# and hands the contents back as a DataFrame object.
csv_path = "Input Files/Presidential Proclamations, 1789-2016.csv"
df = pd.read_csv(csv_path)

# df is now our DataFrame. Show only its first 5 rows:
# print(df) would also work, but .head(N) limits the display to the first N rows,
# and telling Jupyter Notebook to load over 7,000 rows takes a long time!
first_rows = df.head(5)
print(first_rows)

In [None]:
# Why use Pandas?
# Using csv.reader/writer can be simpler at first
# However, once you move beyond the most basic modifications of data,
# Pandas very quickly surpasses csv.reader/writer in functionality
# That being said, it does have a learning curve.

In [None]:
# Show the final 5 rows of the DataFrame rather than the first 5
last_rows = df.tail(5)
print(last_rows)

In [None]:
# DataFrames accept slice notation, much like lists. Slicing with :-3 keeps every row
# except the last 3; the shortened DataFrame is stored in df_2.
df_2 = df.iloc[:-3]
print(df_2.tail(5))

In [None]:
# You can also print out some basic information about your DataFrame

# The names of the columns
print(df.columns)
# How many columns exist
print(len(df.columns))
# How many rows exist. df.index holds the row labels, so its length is the number of rows
# (printing df.index by itself would show the labels, not a count).
print(len(df.index))

In [None]:
# You can also rename just a single column. But, take special note of inplace below, an optional parameter
# inplace=True means that you OVERWRITE the OLD dataframe with the NEW one.
# By default, inplace is false, and that means Python creates a DataFrame copy with your change in it
# Sometimes you'll want to keep your old dataframe, other times you'll want to overwrite it.
# Some pandas methods don't have an inplace parameter - for those, most make a new dataframe

# Look closely - this will trip you up later if you don't pay special attention.
# Without using inplace, you need to assign the result to a new variable.
# Column names are case-sensitive: the column is called 'Date' (capital D). By default rename()
# silently skips names it cannot find, so a misspelled name would change nothing and raise no error.
df2 = df.rename(columns={'Date': 'date_of_issuance'})

# Using inplace, you can just run it on its own (not setting it equal to anything) and it will overwrite itself
df.rename(columns={'President': 'President_In_Office'}, inplace=True)

# Let's check out the difference:
# df2 has the renamed date column, df has the renamed president column.
print(df2.head(5))
print(df.head(5))

In [None]:
# Inserting new (empty) columns is also easy
new_dataframe = pd.DataFrame(columns=['Dummy','B'])

# concat() comes from concatenate. Just as you might concatenate strings,
# Python lets you concatenate DataFrames together.
# axis=1 glues the DataFrames together side by side (column-wise), which is what we want
# when adding columns. The default, axis=0, stacks them on top of each other (row-wise).
# The new columns hold no data yet, so every row shows NaN in them.
df_new = pd.concat([df, new_dataframe], axis=1)

df_new.head(5)

In [None]:
# Pandas marks 'empty' cells as NaN. fillna() swaps every NaN for the value you pass in - here, 0.
df = df.fillna(value=0)

In [None]:
# Rows are inserted or updated through .loc, using either an index number or a row name (if the rows are named).
# The list you assign must contain one item for each column in the DataFrame.
placeholder_row = ["John Doe", "January 1st, 1900", "A Random Proclamation", "http://google.com"]

# Row 0 already exists, so this updates (i.e. overwrites) the first row
df.loc[0] = placeholder_row

# No row is named "Silly" yet, so this writes a new row with that name
df.loc['Silly'] = placeholder_row

# Here len(df) is used as the row label; as long as no row carries that label yet,
# a new row is added at the very end.
df.loc[len(df)] = placeholder_row

print(df.head(5))
print(df.loc[0])

In [None]:
# Rows and columns are both deleted with one method: drop()
# The axis parameter decides which - axis=0 targets a row, axis=1 targets a column
# df.columns (seen earlier) and df.index let us check that the name exists before dropping it.
has_dummy_column = 'Dummy' in df.columns
if has_dummy_column: # Only when a column named 'Dummy' exists...
    df.drop('Dummy', axis=1, inplace=True) # ...remove it from the columns (axis is 1)
has_silly_row = 'Silly' in df.index
if has_silly_row: # Only when a row named 'Silly' exists...
    df.drop('Silly', axis=0, inplace=True) # ...remove it from the rows (axis is 0)
print('Dummy' in df.columns) # True or False: is 'Dummy' still the name of a column in df?
print('Silly' in df.index) # True or False: is 'Silly' still the name of a row in df?
print(df.head(5)) #AFTER

In [None]:
# Swapping rows and columns (transposing) is simple too, even if it is not very meaningful for this dataset.
# There is no in-place version: the transposed DataFrame has to be stored in a variable, and df is left untouched.
df_new = df.T # .T is shorthand for .transpose()
print(df.head(5))
print(df_new.head(5))

In [None]:
# You can also edit individual cells via the .at accessor: df.at[row, column] = value
# (Older tutorials use set_value()/get_value(); those methods were deprecated and then removed
# from pandas, so they no longer work on current versions.)
# The row label is the number 2000, not the string "2000", and the column must be referred to by its
# current name, 'President_In_Office'. Careful: assigning to a row or column label that
# does not exist creates a NEW row/column instead of raising an error.
# NOTE: assumes the DataFrame has a row labeled 2000 - confirm with df.index.
df.at[2000, 'President_In_Office'] = 'John Doe'

# And you can grab individual cells the same way: df.at[row, column]
my_value = df.at[2000, 'President_In_Office']
# First value is the row label, second value is the column name; the value after the = sign is what to insert
# Unless you're editing a whole row or column in one line, this is the preferred way to edit a single cell
print(my_value)

In [None]:
## Useful Tips and Tricks

# Trick 1:
# Pandas extends Python's slicing capabilities on DataFrame objects to allow for STATA/R-like editing
i_like_prez = ["George Washington","John Adams","Thomas Jefferson"]

# isin() produces one True/False value per row: True where the row's
# 'President_In_Office' value appears in the list above.
is_liked = df['President_In_Office'].isin(i_like_prez)

# df.loc accepts more than an index number or a row name - it also accepts such a True/False column.
# The result is a new DataFrame containing only the rows marked True.
df_only = df.loc[is_liked]
df_only.head(5)

In [None]:
# Trick 2: More advanced slicing

# You can have multiple conditions - in this one, df_new will only contain rows with the proper President
# AND the proper year.
matches_president = df["President_In_Office"].isin(i_like_prez)
# na=False makes cells that are not text (for example empty cells that were filled with 0) count as
# "no match" instead of producing NaN, which cannot be used to select rows.
matches_year = df['Date'].str.endswith("1795", na=False)
df_new = df.loc[matches_president & matches_year]
print(df_new.head(5))

# Looking above, you may ask, "why are you using & and not 'and'?" (You also use | instead of 'or')
# The difference between & and "and" is a pretty complex topic without much meaning to social scientists
# If you're REALLY just that curious, research "short-circuiting boolean operators".
# What's important to know is that whenever you combine pandas conditions like the ones above,
# you need to use &/| instead of and/or (wrap each condition in parentheses if you write them on one line).
# 'and' and 'or' do NOT work on whole columns - Python raises an error instead.

In [None]:
# Trick 3: Applying functions to individual columns (also works for individual rows or all columns or all rows)

# Let's say we wanted to modify the Date column in a complex way - to remove all the information
# except for the year. There's an easy way to apply a function to every row with just a few lines of code

# First, let's define a function. The "x" parameter will be the content of each cell.
def year_only(x):
    """Return only the year part of a date such as "January 1st, 1900".

    x is the content of a single cell. It is converted to text and split on ", ".
    If that produces more than one piece, the last piece (the year) is returned as a string.
    If there is no ", " at all (for example a date in a bad format), x is returned unchanged.
    """
    split_date = str(x).split(", ")
    # We have to make sure that the date actually has a ", " in it - some dates might be in a bad format.
    if len(split_date) > 1:
        # The year is whatever follows the final ", "
        return split_date[-1]
    return x

# The apply() method runs a function that we've created on every cell within the column.
# Notice that we hand over year_only itself, not year_only(x):
# the function is passed into apply(), and apply() is what calls it for each cell.
years = df['Date'].apply(year_only)
df['Date'] = years
print(df['Date'])

In [None]:
# Trick 4: Printing out crosstabs
# Each row is a president and each column is a date value; margins=True adds the totals.
tab = pd.crosstab(index=df["President_In_Office"], columns=df["Date"], margins=True)
print(tab)

In [None]:
# Finally, you can send your DataFrame out as a CSV with one line
# The index=False parameter (optional) means that I don't want Python to write the
# row labels (0 onward) as an extra first column.
df.to_csv("Output Files/your_spreadsheet.csv",index=False)
# For the crosstab the row labels ARE the data we care about (the presidents' names),
# so here we keep the index; with index=False the file would contain counts with no names.
tab.to_csv("Output Files/your_crosstab.csv")

In [None]:
## NUMPY: Python's Number-manipulation package
import numpy as np # numpy is highly useful for quickly creating, processing, and storing numerical data
# You aren't required to write "as np", but it's considered a standard Python naming convention
# instead of writing numpy.something or numpy.that, you can write np.something or np.that.
# Numpy has a number of useful utilities
# Numpy uses an object called an array, which is quite similar to the Python list except that it's specifically made
# to process numerical data quickly and to apply mathematical functions quickly

# Numpy 1: build a 10x4 "array of arrays" (i.e. a list of lists, or a spreadsheet)
# filled with numbers drawn from a normal distribution
grid_shape = (10, 4)
my_list = np.random.randn(*grid_shape)
print(my_list)

In [None]:
# Numpy 1a: Applying math functions to every value in a numpy array
# The fascinating part about numpy is that math is applied to all elements at once.
# For example, multiplying by ten multiplies every element in the 10x4 array by ten.
# Dividing, adding and subtracting work the same way.
my_list = np.random.randn(10, 4)
print(my_list)
list_2 = np.multiply(my_list, 10)
print(list_2)

In [None]:
# Numpy 1b: What if we wanted numbers between 0 and 1 instead of a normal distribution?
# randint(0, 100, 20) draws 20 whole numbers from 0 up to (but not including) 100;
# dividing by 100 turns them into values from 0.00 to 0.99.
my_array = np.random.randint(0, 100, size=20) / 100
print(my_array)

# Let's transform it from a flat array of 20 values into a 4x5 array
my_array = my_array.reshape(4, 5)
print(my_array)

In [None]:
# Numpy 2: Pandas Likes Numpy
# A numpy array can be handed straight to pandas' DataFrame function to build a new DataFrame
random_values = np.random.randn(10, 4)
df_random = pd.DataFrame(random_values)
print(df_random)

In [None]:
# Numpy 3: arange and reshape

a = np.arange(15)
# range(0, 15) would give the same numbers, but arange gives us a numpy array,
# which has extra abilities such as the ones shown below.

# In order: the whole numpy array, its "shape", and its number of dimensions
for detail in (a, a.shape, a.ndim):
    print(detail)

# reshape turns the 1-dimensional array into a 2-dimensional one (3 rows of 5)
a = a.reshape((3, 5))
print("Reshaped:")
for detail in (a, a.shape, a.ndim):
    print(detail)

In [None]:
# Numpy 4: Python Lists vs Numpy Arrays: Which to Use?

python_list = [6, 7, 8, 9]
numpy_array = np.array(python_list)
print(numpy_array)
print(python_list)
# The short version: numpy arrays are computationally faster and take up less memory,
# but you give up a lot of Python's useful methods and flexibility.
# For instance, a numpy array should hold only one TYPE of object.
# Mix several types and numpy will attempt to forcibly convert them
# so that they all match.

# What does this mean for you?
# If you're after blazing fast speed for larger datasets, use numpy arrays
# If you're after flexibility and fancy functions and methods, use Python lists

In [None]:
# Numpy 5: Mass-create arrays of various types
demo_arrays = [
    np.zeros((3, 4)),       # 3x4 array filled with zeros
    np.ones((3, 4)),        # 3x4 array filled with ones
    np.arange(10, 30, 5),   # from 10 up to (not including) 30, in steps of 5
    np.linspace(0, 2, 9),   # 9 evenly spaced numbers from 0 to 2
]
divider = "----------------------"
for position, demo_array in enumerate(demo_arrays):
    if position > 0:
        # A divider goes between arrays, not before the first one
        print(divider)
    print(demo_array)