In [None]:
# CLASS 3 + 4: READING AND WRITING TO FILES

In [None]:
# Writing to a text document

import os  # Standard library - used here only to make sure the output folder exists

# Note that the text file does NOT NEED to previously exist when writing - Python will automatically
# create the text document for you. The FOLDER is a different story: open() raises an error if the
# "Output Files" folder is missing, so we create the folder first whenever it isn't there yet.
output_folder = "Output Files"
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)

# The little "w" string in the second parameter below just means that the file will be opened in "writing" mode.
# Python understands what the "w" means. my_writing_file stores a File object that the open() function "returns"
# (i.e. sends you back to put in a variable, just like a String object, an Integer object, a List object, etc.)

# The "with ... as my_writing_file:" line stores our new File object, which is set in writing mode, into the
# my_writing_file variable. Everything indented underneath it can use the file. As soon as the indented code
# finishes (EVEN IF an error happens halfway through), Python calls my_writing_file.close() for you.
# Make sure files always get closed when you're done! Otherwise, problems can occur later.
# The default folder that the file will be created in is relative to the folder of your Notebook file.
# If you want to output to an entirely different folder, you can enter the entire path:
# i.e. open("C:/Program Files/My Folder/text_file_1.txt","w")
with open("Output Files/text_file_1.txt","w") as my_writing_file:
    # The .write() method should be clear - write the string to the document
    # Note the \n at the end - this means I want it to type in an "Enter" key after each line
    # \n stands for the "newline" character in Python.
    # Note that the backslash (\) is a special "escape" character, denoting that you want Python to do something
    # special in the string. Another example is \t, which is the tab key. If you actually want to print a "\",
    # you need to type "\\"
    my_writing_file.write("Here is the first line!\n")
    my_writing_file.write("Here is the second line!\n")
    my_writing_file.write("Here is the third line!\n")
    my_writing_file.write("Here is the fourth line!\n")

# Back at the left margin: the file has already been closed at this point
print("Done!")

# CRITICAL NOTE: WHEN READING OR WRITING, MAKE SURE THE FILE ITSELF IS CLOSED ON YOUR COMPUTER!
# YOUR COMPUTER MAY NOT GIVE PYTHON PERMISSION TO READ/WRITE THE FILE IF YOU HAVE IT OPEN!

In [None]:
# Reading from a text document

# CRITICAL NOTE: You must have a file named text_file_1.txt in your Output Files folder for this to work
# If you want to open a file in another directory, you need to copy/paste the whole path
# Luckily, the cell right above this one creates text_file_1.txt in the proper folder

# Open the file in reading mode (note the "r" instead of "w").
# The "with" statement closes the file automatically once the indented code is finished,
# even if an error happens while reading - so no file is ever left open by accident.
with open("Output Files/text_file_1.txt","r") as my_reading_file:
    # Reads the WHOLE text file into a single string
    text_in_file = my_reading_file.read()

# The file is closed now, but the string we read is still stored in text_in_file.
# Let's see what we have
print(text_in_file)
print("Done!")

In [None]:
# Alternate - Read a file as a list of strings of text instead of one giant text blob

# The "with" statement closes the file for us once the indented code is done (even if an error occurs)
with open("Output Files/text_file_1.txt","r") as my_reading_file:
    # .readlines() makes a LIST, with each list element being one line of text
    # Each element still ends with its "\n" newline character
    list_of_lines = my_reading_file.readlines()

print(list_of_lines)

# We can access individual lines easily, just like any other list
# Index 1 is the SECOND line (indexes start with 0). Because the line still ends with "\n"
# and print() adds a newline of its own, you will see a blank line after it.
print(list_of_lines[1])
print("Done!")

In [None]:
# Writing to CSV Files (i.e. spreadsheets)

import unicodecsv as csv # The unicodecsv package - Creating spreadsheets with your output

# This is a little tricky to explain. Python has a library called "csv" already. However, Python's generic CSV
# library has a major problem for social scientists - it doesn't work very well with non-English characters!
# It has issues quite regularly with the incredibly frustrating "Unicode(De/En)codeError", which I pray you never get.
# Thankfully, one ingenious Python developer decided to make a "wrapper" library called unicodecsv. The unicodecsv
# library does everything that the normal csv library does, but it also accepts international text more easily.

# IMPORTANT: YOU MUST INSTALL THE NEW LIBRARY BEFORE IT WILL WORK! To do this, open up the command prompt/terminal
# in a new window (you can't use the one that's running Jupyter Notebook)
# and type in "pip install unicodecsv" (without quotes), then press Enter. Wait for it to finish

# A note on the "as csv" at the top of the cell - all that means is that when you write "csv.",
# Python understands it to mean "unicodecsv.". It saves a bit of typing over time.

my_variable = "Hey Look"

# Note the "wb" here instead of "w". "wb" signifies to Python that you want to write in something called
# "Byte Mode", or in raw computer code rather than computer code translated into text. unicodecsv does the
# translating from text into raw bytes itself, which is why it needs the file in Byte Mode.
# Thankfully, the CSV will still display properly because Microsoft Excel is extremely good at
# translating computer code into text for us in a spreadsheet.
# The "with" statement closes opened_file automatically when the indented code finishes, even after an error.
with open('Output Files/test_spreadsheet.csv','wb') as opened_file:
    # The writer() function from the csv library takes in a regular File object (set to byte mode) as input,
    # and returns a new CSVFile object. Unlike normal File objects, the CSVFile object has special
    # spreadsheet capabilities. Let's save the CSVFile object in the my_csv_file variable.
    my_csv_file = csv.writer(opened_file)

    # The single most useful method is the .writerow() method.
    # We can understand why normal File objects don't have a writerow() method...
    # What does writing a row mean in a normal text document? It's nonsensical.
    # The input for the writerow() method is a regular list, and it transforms
    # that list into a spreadsheet row. Rows do not have to be the same length.
    my_csv_file.writerow([my_variable,4.6543,7.3453,2,1232])
    my_csv_file.writerow([123,345,567,754,"Bunny Rabbit","Niagra Falls","Python!"])
    # And there, you just wrote two rows

# NOTE: Only the ORIGINAL File object ever needs closing (the "with" statement just did that for us).
# You DON'T close the CSVFile object.
print("Done!")

# Why not check out your new file?

In [None]:
# Reading from CSV Files - pretty similar

import unicodecsv as csv

# Right after it is written (from the previous cell), there's no reason you can't read it.
# This time, let's put the filepath into a variable for more readability
read_filepath = 'Output Files/test_spreadsheet.csv'

# Now let's make an empty list in preparation
my_new_list = []

# Note the "rb" once again! Reading or writing CSVs with unicodecsv must be in Byte Mode!
# The "with" statement closes file_to_read automatically once the indented code is finished,
# even if one of the rows causes an error.
with open(read_filepath,'rb') as file_to_read:
    # As a cousin of csv.writer, csv.reader lets you read a CSV file that already exists
    csv_to_read = csv.reader(file_to_read)

    # To read the CSVFile contents, you use a For loop.
    # Each iteration of the for loop (each time it goes through the indented code), it returns you the next row
    # The loop has to stay INSIDE the "with" block, because the rows are read from the file while it is open.
    # Later on we'll be looking into advanced methods of MODIFYING CSV files, but for now we're focusing mostly
    # on READING and WRITING
    for row in csv_to_read:
        # Maybe we only want to get a certain column into our list
        # Remember the list object's .append() method?
        my_new_list.append(row[2])
        # Let's think over the logic. If the For Loop goes through every row, and in each row we pick up the 3rd
        # spreadsheet cell (indexes start with 0, remember) and put it into a list,
        # then naturally at the very end of the For Loop we will have a new
        # column created that is identical to the column in the spreadsheet.
        # (Every row needs at least 3 cells for this to work - a shorter row would cause an IndexError.)

# What does the for loop above get us? A column! (based on the index used)
print(my_new_list)
print("Done!")

In [None]:
# A short aside - Nested Dictionaries
# Lists can hold other lists, and in the same way dictionaries can hold other dictionaries.
# Online databases very often answer Python's requests with exactly this kind of nested dictionary.

# IMPORTANT: If the dca_airport_data variable below looks overwhelming, don't panic -
# run the cell and compare the two printouts. You will rarely build such a structure by hand;
# usually you only pick values out of (or tweak) one that a database handed to you.

# pprint is the Pretty Print package - it lays out complex data such as nested dictionaries readably
import pprint

# A nested dictionary: the 'status' and 'weather' values are dictionaries themselves,
# and 'weather' contains yet another dictionary under 'meta'.
dca_airport_data = {
    'IATA': 'DCA',
    'ICAO': 'KDCA',
    'city': 'Washington',
    'delay': 'false',
    'name': 'Ronald Reagan Washington National',
    'state': 'District of Columbia',
    'status': {
        'avgDelay': '',
        'closureBegin': '',
        'closureEnd': '',
        'endTime': '',
        'maxDelay': '',
        'minDelay': '',
        'reason': 'No known delays for this airport.',
        'trend': '',
        'type': '',
    },
    'weather': {
        'meta': {
            'credit': "NOAA's National Weather Service",
            'updated': '8:52 PM Local',
            'url': 'http://weather.gov/',
        },
        'temp': '77.0 F (25.0 C)',
        'visibility': 10.0,
        'weather': 'Mostly Cloudy',
        'wind': 'North at 5.8mph',
    },
}

# First attempt: the ordinary print() crams everything onto one long line
print("Here is the regular print nested dictionary:\n")
print(dca_airport_data)

# Second attempt: Pretty Print. The name looks doubled because the pprint LIBRARY
# contains a FUNCTION that is also called pprint().
print("Here is the pretty print nested dictionary:\n")
pprint.pprint(dca_airport_data)

# The dictionary-inside-dictionary values are now much easier to see.
# Look at the 'status' key - its value is not a string like most others, it is a whole dictionary.
# To reach the 'reason' value inside it, you would type
# reason_value = dca_airport_data['status']['reason']
# Square brackets can be chained for nested dictionaries just as they can for nested list indexes.
print("Done!")

In [None]:
# Another use of data: pulling one inner dictionary out of the nested dictionary
# Note that this cell will produce errors if you don't run the previous cell first,
# because both dca_airport_data and the pprint import come from that cell.
# Square brackets work one level at a time: first grab the 'weather' dictionary...
weather_section = dca_airport_data['weather']
# ...then grab the 'meta' dictionary stored inside it.
my_nested_dictionary = weather_section['meta']
pprint.pprint(my_nested_dictionary)

In [None]:
# Converting PDF files to Text files
import PyPDF2 # You need to do "pip install PyPDF2" first
# NOTE(review): PdfFileReader, numPages, getPage() and extractText() are the older PyPDF2 names.
# Newer PyPDF2 releases (3.x) renamed them (PdfReader, len(reader.pages), reader.pages[i], extract_text()),
# so confirm which PyPDF2 version is installed if this cell complains about deprecated names.

# Let's prepare a string to store the PDF data in
mega_string = ""

# Always read/write PDF files in Byte mode!
# The "with" statement closes the PDF automatically when the indented code finishes, even after an error.
# All of the page reading happens INSIDE the "with" block, while the PDF file is still open.
with open('Input Files/pdf-sample.pdf', 'rb') as pdfFileObj:
    # Similar to CSV files, we're now creating a PdfFileReader object from our File object
    pdf_file_reading_object = PyPDF2.PdfFileReader(pdfFileObj)

    # NEW CONCEPT: OBJECT ATTRIBUTES
    # Object attributes are just smaller variables that are stored inside bigger objects
    # For example, all PdfFileReader objects have a numPages attribute
    # This numPages attribute is just an Integer object storing the number of pages in the PDF file
    # You can access attributes exactly the same way as an object's methods, but without the () at the end
    # That lack of () specifies that we are not accessing a method, but simply an attribute.
    # Methods run all sorts of code in the background, whereas attributes are just simple variables
    print("There are " + str(pdf_file_reading_object.numPages) + " pages.")

    # The RANGE function counts from the first number up to (but NOT including) the second.
    # Thus, range(0,10) gives you the numbers 0,1,2,3,4,5,6,7,8,9
    # It's very useful if you want to run a for loop X number of times.
    # In the example below, we want to go through each page of the PDF
    # Since the sample PDF only has one page, it'll just get that one page
    # However, any size works, just change the read file name
    for i in range(0,pdf_file_reading_object.numPages):
        # PdfFileReader objects have a getPage() method with a single Integer parameter (i) specifying
        # the page number. Since i changes with each iteration, this will naturally pick up every page.
        # Let's store the current page in pageObj
        pageObj = pdf_file_reading_object.getPage(i)

        # From the Page object, you need to call the extractText() method to actually get the text
        current_page_text = pageObj.extractText()

        # The += is just a fancy way of concatenating the first string together with the second
        # It's the same as mega_string = mega_string + current_page_text, but less verbose
        mega_string += current_page_text

# Finally, after the For loop finishes and the PDF has been closed (note the no longer indented code),
# we open a new text file and write the repeatedly concatenated string into it.
# Opening the text file only NOW means a problem while reading the PDF can't leave behind an empty output file.
# Once again, the "with" statement closes the file for us.
with open('Output Files/pdf-sample-as-text.txt','w') as new_file:
    new_file.write(mega_string)
print("Done!")
# Unfortunately, the formatting of the PDF can take a pretty nasty beating

In [None]:
"""
Core Project: Get only odd-numbered lines into a new text document
1) Read in from Input Files/class_3_4_text_file.txt
2) Open a new file to WRITE
3) Go through the lines and only write the ODD NUMBERED LINES
4) Don't forget to CLOSE BOTH FILES WHEN DONE OR THEY WON'T WRITE PROPERLY
---------------------------------------------------------------------------
Advanced Project: Merging Spreadsheets
1) Read in BOTH spreadsheets in READING mode
2) Create a new file in WRITING mode
3) Use FOR LOOPS to go through the list looking for matches

"""

# HINT 1: the append() method for a list is really useful!
# You can have a row, use append() to add a string to the end of it
# then writerow() your new list :)

# HINT 2: Check if a string is inside of a list for MATCHING:
# if my_string_variable in some_list:
# the IN command is very useful here! You can check if a string is held
# inside a list. So for instance, maybe you have a list made up of a row
# and you want to see if your string matches ANY of the values inside the list

# HINT 3: Nested For Loops
# You may want to consider a for loop inside a for loop. For example, every
# row you look at in Spreadsheet A, you want to look through every single row
# of Spreadsheet B to look for a match

# HINT 4: the BREAK statement
# To exit immediately out of a for loop, use the BREAK statement!

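# HINT 5: Convert CSVFile to List immediately
# You can use new_list = list(my_csv_reader) to immediately convert a CSVFile object made by csv.reader()
# into a nested list (one inner list per spreadsheet row).
# This only works on csv.reader objects - a csv.writer object cannot be converted this way.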