# Data Workflows and Automation

In [None]:
# Author: Martin Callaghan
# Date: 2021-05-17
# Lesson link: https://arctraining.github.io/python-2021-04/06-loops-and-functions/index.html

In [1]:
# Connect my Google Drive to Google Colab
from google.colab import drive
drive.mount ('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# Load the python packages we need
import pandas as pd

In [3]:
# Remember that we need to link back to the file and folder we permanently stored in our Google Drive
# But having to include this long path every time is a pain, so we store it once in a variable
filepath = "/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/"

## For loops

Loops allow us to repeat a workflow (or series of actions) a given number of times or while some condition is true. 

We could use a loop to automatically process data that’s stored in multiple files (daily values with one file per year, for example). 

In [4]:
# Let's visit the zoo... (a plain Python list of strings)
animals = [
    'lion',
    'tiger',
    'crocodile',
    'vulture',
    'hippo',
]
print(animals)

['lion', 'tiger', 'crocodile', 'vulture', 'hippo']


In [5]:
# Let's iterate across this list: on each pass `creature` is bound to the
# next item in `animals`, and the indented body runs once for that item
for creature in animals:
    print(creature)

lion
tiger
crocodile
vulture
hippo


In [6]:
# The loop variable still exists after the loop has finished and keeps the
# value it was given on the final iteration
print (creature)

hippo


## Automate data processing

In [7]:
import os

In [8]:
# Create the output folder for the per-year files.
# exist_ok=True keeps this cell safe to re-run: os.mkdir would raise
# FileExistsError once the folder has been created in Google Drive.
os.makedirs(filepath + "yearly_files", exist_ok=True)

In [9]:
os.listdir(filepath)

['species.csv',
 'portal_mammals.sqlite',
 'surveys.csv',
 'plots.csv',
 'bouldercreek_09_2013.txt',
 'surveys2002.csv',
 'surveys2001.csv',
 'speciesSubset.csv',
 'README.txt',
 'out.csv',
 'weight_for_year.csv',
 'yearly_files']

In [10]:
# Read the full surveys table from the shared data folder into a DataFrame
surveys_df = pd.read_csv(f"{filepath}surveys.csv")

In [11]:
# Keep only the rows whose year is 2002 (boolean-mask row selection)
surveys2002 = surveys_df.loc[surveys_df["year"] == 2002]

In [12]:
# Save the 2002 subset as a csv inside the yearly_files folder
surveys2002.to_csv(f"{filepath}yearly_files/surveys2002.csv")

In [15]:
# We need the years
surveys_df['year'].unique()

array([1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987,
       1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998,
       1999, 2000, 2001, 2002])

In [17]:
# Build (and show) the output filename for each distinct year in the data
for year in surveys_df['year'].unique():
    filename = f"{filepath}yearly_files/surveys{year}.csv"
    print(filename)

/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1977.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1978.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1979.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1980.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1981.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1982.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1983.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1984.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1985.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-python-2021-04/data/yearly_files/surveys1986.csv
/content/gdrive/MyDrive/Colab Notebooks/intro-pyth

In [18]:
# Full code
surveys_df = pd.read_csv(f"{filepath}surveys.csv")

# One pass per distinct year: filter the rows, then write them to their own file
for year in surveys_df['year'].unique():
    # Select data for the year
    surveys_year = surveys_df.loc[surveys_df['year'] == year]
    # Write out the new data
    filename = f"{filepath}yearly_files/surveys{year}.csv"
    surveys_year.to_csv(filename)

In [19]:
# We can turn this into a reusable function

def one_year_csv_writer (a_year, all_data, output_dir=None):
  """
  Writes a csv file for data from a given year.

  The rows of all_data whose 'year' column equals a_year are written to
  <output_dir>/function_surveys<a_year>.csv. Nothing is returned.

  a_year -- year for the data to be extracted
  all_data -- dataframe containing the multi-year data (needs a 'year' column)
  output_dir -- folder to write the file into; when omitted, the
                "yearly_files" folder under the notebook-level `filepath`
                is used
  """

  if output_dir is None:
    # Default to the shared data folder configured at the top of the notebook
    output_dir = filepath + "yearly_files"

  # Select data for the year
  surveys_year = all_data[all_data["year"] == a_year]

  # Write dataframe to csv
  filename = os.path.join(output_dir, "function_surveys" + str(a_year) + ".csv")
  surveys_year.to_csv(filename)


In [20]:
one_year_csv_writer?

In [21]:
# To call the function
one_year_csv_writer (2002, surveys_df)