# CHAPTER 2:

# Whitespace Formatting:

In [None]:
# Python uses identation to start/end code, instead of brackets
for i in [1, 2, 3]:
    print(i)                  # first line in "for i" block
    for j in [1, 2, 3]:       
        print(j)              # first line in "for j" block
        print (i+j)           # last line "for j" block
    print(i)                  # last line "for i" block

# ...so be VERY careful for code, but inside brackets python ignores it:
list = [[1, 2, 3], [4, 5, 6]]
easier_to_read_list = [[1, 2, 3],
                       [4, 5, 6]]

# You can also use backslash to indicate statement continues:
hello = 2 + \
        3

# Jupyter has a nice "paste%" option so pasted-in text formats correctly

# Modules:

In [None]:
# You already saw most of what you need to know, BUT be sure to <import math as math2>
# if you've already used "math" in code, or if u just wanna make it easier to type.

# Also, NEVER
<import ___ *>
# ...you'll overwrite previous versions of that code

# Functions:

In [None]:
# These are "first-class" commands that take an input and produce an ouput.
# <def> means define, <return> means "do that".
def double(x)
    ""You can explain what the function does here, 
    eg; this one multiples the input by 2.""
    return x * 2

def apply_to_one(f)
    "Calls the function f with 1 as its argument."
    return f(1)

my_double = double
x = apply_to_one(my_double) # is equal to 2.

# You can also make short functions, called "lambdas".
y = apply_to_one(lambda x: x + 4) # is equal to 5.

def my_print(message = "my default message")
    print(message)

my_print("hello") # prints 'hello'.
my_print( ) # prints 'my default message'.

# Basically, whatever you put in that first <def> <return> will be
# a placeholder is nothing else is specified.
def full_name(first = "What's his name", last = "Something")
    return first + " " + last

full_name("Joel", "Grus") # "Joel Grus". Those quotes are from <return>.
full_name("Joel") # "Joel Something".

# Strings:

In [None]:
# These guys can be:
single_quoted_string = 'hello'
double_quoted_string = "hello"

# </> is used to encode special characters:
tab_string = "\t" # represents tab character.
len(tab_string) # is 1.

# But if you backslashes to be themselves, use <r" "> for raw string.
not_tab_string = r"\t" # represents "/" and "t".
len(not_tab_string) # is 2.

# If you want to make a multiline string, use """ """.
"""First line
second line
third line."""

# Use the f-string <f> to combine strings.
first_name = "Joel"
last_name = "Grus"
full_name = f"{first_name} {last_name}"

# Exceptions:

In [None]:
# When something goes wrong, Python creates an "exception";
# unhandled, they'll cause your program to crash, but you can
# "handle" them with <try> and <except>.

try:
    print(0 / 0)
except: ZeroDivisionError:
    print("cannot divide by zero")

# Long as you do this, there's no problem in using exceptions!

# Lists:

In [None]:
# An ordered collection, like a buff array.
integer_list = [1, 2, 3]
heterogenous_list = ["string", 0.1, True]
list_of_lists = [integer_list, hetergenous_list, []]

list_length = len(integer_list) # equals 3.
list_sum = sum(integer_list) # equals 6.

# You can also get to nth element with brackets.
x = [0, 1, 2, 3, 4, 5]
zero = x[0] # equals 0, remember 0-indexed.
five = x[-1] # equals 5, 'Pythonic' for last element: wraps around.
four = x[-2] # equals 4, 'Pythonic'.

# Or you can change values in list.
x[0] = -1 # now it's [-1, 1, 2, 3, 4, 5]

# You can also slice... remember that the end is non-inclusive (-1)!
first_three = x[:3] # [-1, 1, 2]
three_to_end = x[3:] # [3, 4, 5]
two_to_four = x[2:5] # [2, 3, 4]
last_three = x[-3:] # [3, 4, 5]
without_first_or_last = x[1:-1] # [1, 2, 3, 4]
copy_of_x = x[:] # [-1, 1, ..., 5]

# You can also step (skip over).
every_third = x[::3] # [-1, 3]
four_to_one = x[4:1:-] # [4, 3, 2, 1] ... step every "1" don't do nothing

# You can check for if something's included in a list with <in>.
1 in [1, 2, 3] # True
0 in [1, 2, 3] # False

# Concatenate lists with <.extend>.
x = [1, 2, 3]
x.extend([4, 5, 6]) # x is now [1, 2, 3, 4, 5, 6]

# Or, if you want to keep x itself but still combine:
y = x + [4, 5, 6] # x is still [1, 2, 3], y is desired.

# Unpack a list sequentially, when you know # elements inside...
x, y = [1, 2] # x is 1, y is 2

# ...or give one a <_> if you don't care about one element.
_, x = [1, 2] # y == 2, other can be thrown away.

# Tuples:

In [None]:
# Are immutable lists, with a few caveats.
# You can use parenthesis, or nothing:
tuple = (1, 2)
other_tuple = 1, 2

# Useful to return multiple values from a function:
def sum_and_product(x, y):
    return (x + y), (x * y)

sp = sum_and_product(5, 6) # sp is (11, 30)
s, p = sum_and_product(5, 6) # s is 11, p is 30

# Useful for multiple assignments:
x, y = 1, 2
x, y = y, x # now x is 2, y is 1

# Dictionaries:

In [None]:
# Links "values" and "keys" so you can retrieve each quickly.
grades = {"Joel": 80, "Tim": 98}
joels_grade = grades["Joel"] # is 80.

# Check for key existence using <in>.
joel_has_grade = "Joel" in grades # True
tina_has_grade = "Tina" in grades # False

# Using <.get> produces default value (0, here) if no key exists.
joels_grade = grades.get("Joel", 0) # is 80.
tinas_grade = grades.get("Kate", 0) # is 0. 
no_ones_grade = grades.get("No One") # None.

# You can assign key/value pairs with brackets.
grades["Tim"] = 45 # replaces old value.
grades["Tina"] = 97 # adds third entry.
num_students = len(grades) # equals 3.

# You can also comb through all keys.
tweet = { }
    "user" : "joelgrus",
    "text" : "Hello Bro",
    "retweet_count" : 2,
    "hastags : ["greetingsbro", "okay"]
# pretend the pointy bracket is where this line begins...
# don't know why it won't work. 

tweet_keys = tweet.keys() # iterable for all keys.
tweet_values = tweet.values() # iterable for all values.
tweet_items = tweet.items() # iterbals for (key, value) tuples.

"user" in tweet_keys # True, not Pythonic of you.
"user" in tweet # Pythonic way of checking for keys.
"joelgrus" in tweet_values # True (slow, but only way to check). 

# You can't use lists as keys, instead use a tuple or string.

# defaultdict is like reg dict, but when you look up a nonexistent key,
# it'll create it and assign it a default value, as opposed to KeyError.
from collections import defaultdict

word_counts = defaultdict(int) # int() produces 0.
for word in document:
    word_counts[word] += 1

dd_list = defaultdict(list) # list() produces an empty list.
dd_list[2].append(1) # now dd_list contains {2: [1]}

dd_dict = defaultdict(dict) # dict() produces an empty dict.
dd_dict["Joel"]["City"] = "Seattle" # {"Joel" : {"City": Seattle"}}

dd_pair = defaultdict(lambda: [0, 0])
dd_pair[2][1] = 1 # now dd_pair contains {2: [0,1]}

#...very useful when using dictionaries to collect results by-key,
# but don't want to see if key exists every time.

# Counters:

In [None]:
# Basically a defaultdict which generates the number of times a key
# crops up in an element. For example:
from collections import Counter
c = Counter([0, 1, 2, 0]) # c = {0: 2, 1: 1, 2: 1}

# Easiest way to word count:
word_counts = Counter(document)

# ...or determine most common words, number of times they are.
for word, count in word_counts.most_common(10):
    print(word, count)

# Sets:

In [None]:
# A set is like a list with no repeating——distinct——elements.
primes_below_10 = {2, 3, 5, 7}

# You cannot, however, designte {} as an "empty set", since that's
# the notation for an empty dict. Instead:
s = set()
s.add(1) # s = {1}
s.add(2) # s = {1, 2}
s.add(2) # s = {1, 2} (see above)

# You use sets because the <in> command is very slow on lists, but not on sets:
stopwords_list = ["a", "an", "at"] + lotsa_others + ["yet", "zeebra"]
"zip" in stopwords_list # False, but takes forever (checks every element)

stopwords_set = set(stopwords_list)
"zip" in stopwords_set # False, very fast

# Another reason is to reduce a list only to its distinct elements:
item_list = [1, 2, 3, 2, 2, 3,]
item_set = set(item_list) # {1, 2, 3}
distinct_item_list = list(item_set)

# Control Flow

In [None]:
# Reference in-class notes for if, elif, else statements.

# You can also write if-then-else on one line (ternary):
parity = "even" if x % 2 == 0 else "odd"

# <While> loop:
x = 0
while x < 10:
    print(f"{x}is less than 10"}
    x += 1 # Would print "{num} is less than 10"

# <For> and <in>:
for x in range(10):
    print(f"{x} is less than 10") # Would print same as above

# <Continue> allows you to add more if statements:
for x in range(10):
    if x == 3:
        continue # Consider next
    if x == 5:
        break # Quit loop entirely (it'll stop at 5)
    print(x) # Will only print 0, 1, 2, 4

# Truthiness

In [2]:
# Booleans work as in most languages, except capitalized here:
one_is_less_than_two = 1 < 2 # True
true_equals_false = True == False # False

# <None> indicates nonexistent value, much like "null":
x = None
assert x == None # Not very Pythonic of you
assert x is None # Pythonic of you

# Python lets you use any value that it can judge with Bool, 
# all of the following are "falsy":
falsies = [False, None, [], {}, "", set(), 0, 0.0]

# ...everything else is treated as "True", meaning you can use
# <if> statements on empty lists, strings, dictionaries.
s = function_returning_blank_string()
if s:
    first_character = s[0] # The first character is equal to index 0...
else:
    first_character = "" # ...otherwise it's false.

# ...alternatively:
first_char = s and s[0]

# ...given <and> returns second value when truthy and first when not.
# Similarly, if x is either a number or "none":
safe_x = x or 0 # Number assured
safe_x = x if x is not None, else 0 # Same thing

# And finally, the <all> function takes an iterable and returns "True" when
# every element is truthy. An <any> function returns "True" when at least one is.
all([True, 1, {3}]) # True, all truthy
all([True, 1, {}]) # False, {} falsy
any([True, 1, {}]) # True, True truthy, as is 1
all([]) # True, no falsy elements in list
any([]) # False, no truthy elements in list

# Sorting:

In [6]:
# Simple:
x = [4, 1, 2, 3]
y = sorted(x) # y is [1, 2, 3, 4], x unchanged
x.sort() # now x is [1, 2, 3, 4]

# Or, sort by absolute value with <abs>, largest -> smallest with <reverse=True>:
x = sorted([-4, 1, -2, 3], key=abs, reverse=True) # [-4, 3, -2, 1]

# List Comprehensions:

In [None]:
# Transforming lists——making new lists of only certain elements of others
# the Pythonic way (the Pythonic way):
even_numbers = [x for x in range(5) if x % 2 == 0] # [0, 2, 4]
squares = [x * x for x in range(5)] # [0, 1, 4, 9, 16]
even_squares = [x * x for x in even_numbers] # [0, 4, 16]

# Dict and set transformations (the Pythonic way):
square_dict = {x: x * x for x in range(5)} # {0: 0, 1: 1, 2: 4, 3: 9, 4: 16}
square_set = {x * x for x in [1, -1]} # {1} ... note it's transforming a list

# <_> if you don't want it to spit out the value:
zeros = [0 for _ in even_numbers]

# You can also use multiple <fors>:
pairs = [(x, y)
         for x in range(10)
         for y in range(10)] # 100 pairs (0,0) (0,1) ... (9, 8), (9, 9)

# ...and <fors> can piggyback off one another:
increasing_pairs = [(x, y)                      # x will always < y
                    for x in range(10)          # list of nums in range(low, high)
                    for y in range(x + 1, 10)]  # equals [lowest, lowest + 1, ...,
                                                # highest - 2, highest - 1].

# Automated Testing and Assert

In [None]:
# <assert> statements cause your code to raise an "AssertionError" if chosen
# condition is not truthy:
assert 1 + 1 == 2
assert 1 + 1 == 2, "1 + 1 equals 2, didn't here" # Your message there

# Use, use, use <assert> to confirm that functions in your code are correct.
def smallest_item(xs):
    return min(xs)

assert smallest_item([10, 20, 5, 40]) == 5

# or, to <assert> about an input to a function:

def smallest_item(xs):
    assert xs, "empty list has no smallest item"
    return min(xs)

# Object-Oriented Programming:

In [None]:
# Classes encapsulate data and the functions that operate on them.
# Let's explain it with a "counting clicker", like the one they use at Grand
# while the system is down, to count the number of people swiping in.

# Our clicker will maintain a "count", can be "clicked", you can "read_count",
# and it can be "reset" back to zero.

# To define a class (there has to be a capital before every word!!):
class CountingClicker:
    """A class should have a docstring explaining it, like a function."""

# A class contains "member" functions within it, each one 
# takes the first parameter "self", which refers to its 
# instance——existence at a time.

# A "constructor" <_init_> takes whatever parameters you need to construct these
# instances and sets them up. This <_init_> "method name" is also called a "dunder"
# method, featuring special behaviors.

def _init_(self, count = 0):
    self.count = count         # Now, each count has an instance.

# These instances occur naturally whenever you use the class name, eg;
clicker1 = CountingClicker() # Intance when initialized to 0
clicker2 = CountingClicker(100) # ...when count = 100
clicker3 = CountingClicker(count = 200) # (same as above, just more explicit)

# Another method is <__repr__>, which creates the string representation
# of a class instance, as seen below:
def __repr__(self):
    return f"CountingClicker(count={self.count})"

# And finally, we need to make the public application progrmaming interface (API):

def click(self, num_times = 1):
    """Click the clicker some number of times."""
    self.count += num_times

def read(self):
    return self.count

def reset(self):
    self.count = 0

# He asserts it, I trust him.

# You can now create "subclasses" that inherit functionality from parent class.
# For example, you could create a non-resettable clicker:
class NoResetClicker(CountingClicker):
    def reset(self):
        pass           # Now the reset method does nothing.

# Iterables and Generators:

In [None]:
# I should've prefaced this with the fact an iterable is literally just anything
# that loops and/or can be "iterated" using the <for> function.

# Instead of making a list of a billion digits (sm space) to use at your will,
# just use a generator to produce those you need——with all of a list's benefits.
# One way you could use them is through the <yield> operator:
def generate_range(n):
    i = 0
    while i < n:
        yield i  # Every call to yield produces a value of the generator
        i += 1

# Then make a loop which will take in the yielded values one-by-one until 
# none are left:
for i in generate_range(10):
    print(f"i: {i}")

# A second way to create generators is using <for> wrapped in ( ):
evens_below_20 = (i for i in generate_range(20) if i % 2 == 0)

# Again, the benefit being that if doesn't actually create a shit ton of 
# numbers which will waste space until you iterate over it (tell it to "go");
# this has the added benefit of use in complicated data-processing.
data = natural_numbers()
evens = (x for x in data if x % 2 == 0)
even_squares = (x ** 2 for x in evens)
even_squares_ending_in_six = (x for x in even_squares if x % 10 == 6)

# Often, this'll be combined with the <enumerate> function:
names = {"John", "Quincy", "Darnell", "Stacy"}
for i, name in enumerate(names):
    print(f"name {i} is {name})

# Randomness:

In [None]:
# To generate random numbers:
import random
random.seed(10) # For same results every time, remember stats.

four_uniform_randoms = [random.random() for _ in range(4)] # Again, uniform
# simply meaning any other number was possible, no need to get confused.

# You can combine this with ranges:
random.randrange(10) # Chooses randomly from 1-->10

# Or shuffle a list:
list_to_ten = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
random.shuffle(list_to_ten)
print(list_to_ten)

# Or pick one element at random from list:
favclass = random.choice["Taming Big Data", "Baming Dig Tata", "Daming Tig Bata"]

# Or sample from a list without duplication:
lottery_numbers = range(60)
winning_numbers = random.sample(lottery_numbers, 2)

# Or with duplicates (think):
with_dupes = [random.choice(range(10)) for _ in range(4)] # Could be overlap.

# Regular Expressions

In [None]:
# <re> expressions allow you to search text:
import re
examples_of_re = [ # Bc they're true...
    not re.match("a", "cat"), # "cat" doesn't start with "a"
    re.search("a", "cat"), # "cat" does have "a" in it
    not re.search("c", "dog"), # "dog" doesn't have "c" in it
    3 == len(re.split("[ab]", "carbs")), # Split on a or b to c r s
    "R-D-" == re.sub("[0-9]", "-", "R2D2") # Replace digits with dashes
]
assert all(re_examples), "all the regex examples should be True" # ...bc.

# zip and Argument Unpacking:

In [None]:
# You can <zip> 2+ iterables together, creating one iterable of tuples:
list1 = ['a', 'b', 'c']
list2 = [1, 2, 3]

[pair for pair in zip(list1, list2)] # ...creates [('a', 1), ('b', 2), ('c', 3)]

# If you <zip> together two lists of varying length, it just stops when short ends.
# You can "unzip"——argument unpack——like so:
pairs = [('a', 1), ('b', 2), ('c', 3)]
letters, numbers = zip(*pairs)

# You can also do a lil quirky thing where you combine the unzipped:
add(*[1, 2]) # Equals 3!

# args and kwargs:

In [None]:
# This makes a little bit of sense, but not a lot. Read it now, try again.

# Type Annotations:

In [1]:
# Do them. (Genuinely fill out this section and the above if you more time).

# CHAPTER 3:

# matplot lib (Plots):

In [None]:
# (is used to create plots, charts, and visualize data generally)

from matplotlib import pyplot at plt

years = [1950, 1960, 1970, 1980, 1990]
gdp = [300.2, 300.3, 567.2, 4993.5, 9320.9]

# Create a line chart:
plt.plot(years, gdp, color = 'green', marker = 'o', linestyle = 'solid')

# Add a title:
plt.title("Nominal GDP")

# Add a label to the y-axis:
plt.ylabel("Billions of $")
plt.show() # Generates an upward-sloping, Nominal GDP graph with green line/points.

# Bar Charts:

In [None]:
movies = ["Annie Hall", "Ben-Hur", "Casablanca", "Gandhi"]
num_oscars = [5, 11, 3, 8]

# Plot bars with x-coordinates [0, 1, 2, 3, 4] and y's [num_oscars]
plt.bar(range(len(movies)), num_oscars)

plt.title("My Favorite Movies") # Add a title!
plt.ylabel("Number Academy Awards") # Label the y-axis!
plt.xticks(range(len(movies)), movies) # Label x-axis with movie names at 
                                       # bar centers.

plt.show() # Displays a beautiful bar graph of each movie's Academy Awards.

# You can also create histograms, dropping numbers into "buckets":

from collections import Counter
grades = [83, 95, 91, 87, ..., 21]

# Bucket grades (the variable, no confusion) by decile, but put 100 in with 90s:
histogram = Counter(min(grade // 10 * 10, 90) for grade in grades)
plt.bar([x + 5 for x in histogram.keys()] # Shift bars right by 5.
        histogram.values(), # Give each bar its appropriate height.
        10, # Give each bar a width of 10.
        edgecolor = (0, 0, 0)) # Black edges for each bar, for discernability.

plt.axis([-5, 105, 0, 5]) # x-axis from -5 to 105... give it a little space.

# ...and do all your labels as you did before.

# Line Charts:

In [None]:
# Good for showing trends:
varaince = [1, 2, 4, 8, 16, 32, 64, 128, 256]
bias_squared = [256, 128, 32, 16, 8, 4, 2, 1]
total_error = [x + y for x, y in zip(variance, bias_squared)]
xs = [i for i, _ in enumerate(variance)]

# Use the [heck] out of plt.plot to show multiple series on one chart:
plt.plot(xs, variance, 'g-', label = 'variance') # Green solid line.
plt.plot(xs, bias_squared, 'r-.', label = 'bias^2') # Green dashed line.
plt.plot(xs, total_error, 'b:', label = 'total error') # Green dotted line.

# And as a reward for assigning labels to each series, we get a free legend.
plt.legend(loc = 9) # Means "top center"
plt.xticks([]) # For nothing.

# ...and the rest is history—— a pretty line chart.

# Scatterplots:

In [None]:
# ...are the right choice for visualizing relationship b/t two related datasets:

friends = [140, 8, 100, 130, 40, 20] # IRLs...
minutes = [0, 150, 10, 5, 60, 110] # ...spent on LOL, of course.
plt.scatter(friends, minutes)

# Label each and every point:
for label, friend_count, minute_count in zip(labels, friends, minutes):
    plt.annotate(label,
        xy = (friend_count, minute_count), # Puts each label with its point.
        xytext = (5, -5),                  # But slightly offsets it.
        textcoords = 'offset points')      # Cursor over it, would read:

# Make sure to always have equal axes!

# CHAPTER 5:

# Describing a Single Set of Data:

Central Tendencies:

In [None]:
# Extremes:

maximum = max(num_friends)
minimum = min(num_friends)

def mean(xs: List[float]) -> float: # Mean is equal to...
    return sum(xs) / len(xs) # ...sum of all x's / # all x's.

mean(num_friends) # Returns 7.33 repeating.

# Median:

def _median_odd(xs: List[float]) -> float:
    """If len(xs) is odd, median is middle element."""
    return sorted(xs)[len(xs) // 2]

def _median_even(xs: List[float]) -> float:
    """If len(xs) is even, it's the average of middle two elements."""
    sorted_xs = sorted(xs)
    hi_midpoint = len(xs) // 2 # eg; length of 4 returns a hi-mid of 2
    return (sorted_xs[hi_midpoint - 1] + sorted_xs[hi_midpoint]) / 2

def median(v: List[float]) -> float:
    """Finds the 'middle-most' value of v."""
    return _median_even(v) if len(v) % 2 == 0 else _median_odd(v)

# Quartiles:

def quartile(xs: List[float], p: float) -> float:
    """Returns the pth-percentile value for x."""
    p_index = int(n * len(xs)) # For example, plugging in 0.1 would return
    return sorted(xs)[p_index] # 1 for our list, because it's a tenth of way up.

# Mode:

def mode(x: List[float]) -> List[float]:
    """Returns a list, since there could be more than one 'most' common."""
    counts = Counter(x)
    max_count = max(counts.values())
    return [x_i for x_i, count in counts.items()
            if count == max_count]

assert set(mode(num_friends)) == {1, 6} # Just for kicks.

Dispersion (measure of data spread):

In [None]:
# Range, measures through max - min, 0 being the tightest:

def data_range(xs: List[float]) -> float:
    return max(xs) - min(xs)

# Variance, the more refined range: 

def de_mean(xs: List[float]) -> List[float]:
    """Subtract mean of x's from each x, so mean now becomes 0."""
    x_bar = mean(xs) # Remember AP stats symbol.
    return [x - x_bar for x in xs]

def variance(xs: List[float]) -> float:
    """(Deviation from mean)^2 ... average of all of them."""
    assert len(xs) >= 2, "Variance requires at LEAST two x's."

    n = len(xs)
    deviations = de_mean(xs)
    return sum_of_squares(deviations) / (n - 1)

assert 81.54 < variance(num_friends) < 81.55

# And because standard deviation is the best measure of any:

import math
def standard_deviation(xs: List[float]) -> float:
    """The standard deviation is the square root of the variance."""
    return math.sqrt(vairance(xs))

# And because standard deviation is flawed, considering outliers:

def interquartile_range(xs: List[float]) -> float:
    """Returns the difference between the 75%-ile and the 25%-ile."""
    return quantile(xs, 0.75) - quantile(xs, 0.25)

# Which chops outliers, and probably gives you something more reasonable.

# Correlation:

In [1]:
# Covariance, the bastard son:

def covariance(xs: List[float], ys List[float]) -> float:
    assert len(xs) == len(ys) "Gotta have same number of x's and y's."
    return dot(de_mean(xs), de_mean(ys)) / (len(xs) - 1)

# Measures how two variables vary from their means, together. In other
# words, when stock x moves alongside stock y, positive covariance, when
# they move opposite ways, negative. When unrelated, zero.

# This is pretty good, but creates the illusion of larger covariance if
# everyone's values were multiplied by two, say.

# Correlation fixes this by dividing out the standard deviation:

def correlation(xs: List[float], ys: List[float]) -> float:
    """Measures how much xs and ys vary in tandem about their means."""
    stdev_x = standard_deviation(xs)
    xtdev_y = standard_deviation(ys)
    if stdev_x > 0 and stdev_y > 0:
        return covariance(xs, ys) / stdev_x / stdev_y
    else:
        return 0 # If there's no variation, the correlation is zero.

# Wait! Kill the outliers lest they kill your data!

outlier = num_friends.index(100) # Index
num_friends_cleaned = [x
                       for i, x in enumerate(num_friends)
                       if i != outlier] # If you remove the outlier.

daily_minutes_cleaned = [x
                         for i, x in enumerate(daily_minutes)
                         if i != outlier]

daily_hours_cleaned = [dm / 60 for dm in daily_minutes_cleaned]

SyntaxError: invalid syntax (2257367906.py, line 3)

"Simpson's Paradox" simply means, check your data for confounding variables / design your experiment properly.

"Correlational Caveats" include things you wouldn't otherwise know through the above methods!

# CHAPTER 6:

# Dependence and Indendence:

Independent: P(E, F) = P(E)P(F) ... the chance of both happening is simply chance E happens multiplied by chance F happens.

Dependent: P(E|F) = P(E, F)/P(F)       OR       P(E, F) = P(E|F)P(F) ... the chance E happens, given we KNOW F happens.

In [None]:
# Let's use the example of predicting boy/girl:

import enum, random # Enum is a typed set of enumerated values.
class Kid(enum.Enum):
    BOY = 0
    GIRL = 1

def random_kid() -> Kid:
    return random.choice([Kid.BOY, Kid.GIRL])

both_girls = 0
older_girl = 0
either_girl = 0

random.seed(0) # The nostalgia.

for _ in range(10000):
    younger = random_kid()
    older = random_kid()
    if older == Kid.GIRL:
        older_girl += 1
    if older == Kid.GIRL and younger == Kid.GIRL:
        both_girls += 1
    if older == Kid.GIRL or younger == Kid.GIRL:
        either_girl += 1

print("P(both | older):", both_girls / older_girl) # 0.514 ~ 1/2
print("P(both | either):", both_girls / either_girl) # 0.342 ~ 1/3

# Bayes's Theorem:

In [3]:
# Search it up, I'm not understanding it the way the book talks.

# Random Variables:

Any variable whose possible values can be mapped into a probability distribution. For example, a random variable set to = 1 if coin flip is heads, and set to = 0 if tails. Distribution would equal 0 with probability 0.5, and 1 with probability 0.5.

A range(10) would have a distribution with probability 0.1 for all numbers 0-9.

Thusly, an "expected value" would be the average of all probabilities, eg; coin flip = (0 * 1/2) + (1 * 1/2)... range(10) = 4.5 (0 * 0.1 + 1 * 0.1 ..., 9 * 0.1).

Always remember, you can "condition" your variables (change their probabilities)!

# Continuous Distributions:

In [None]:
# "Discrete distribution" = variable can only take on certain values,
# "probability density function" measures chances var is b/t two vals:
def uniform_pdf(x: float) -> float:
    return 1 if 0 <= x < 1 else 0

# A "cumulative distribution function" gives probability random var
# is less than or equal to certain value:
def uniform_cdf(x: float) -> float:
    """Returns probability a uniform random variable is < = x."""
    if x < 0: return 0 # Uniform random is never less than 0.
    elif x  1: return x # ex; P(X <= 0.4) = 0.4
    else: return 1 # ...but always less than 1.

# Normal Distribution:

Normal Distribution is classic bell-shaped curve you know and love, with Mew = 0 and stdev = 1 (producing the equations X = stdev(Z) + Mew ... for stedev norm var "Z", OR Z = (X - Mew)/stdev ... for norm var "X")

This (re-)introduces the concept of "invert_norm_cdf", which produces the value of a specific probability.

# The Central Limit Theorem:

Essentially states that the prob dist of the mean will be normal, given the size of the sample is large enough.

Yes, there are other formulae in here, but nothing very expressely related to the coding I think you'll be doing.

...(just know how to use cdf, pdf, and their inverses).

# CHAPTER 7:

Dazed.

In [1]:
# Remember nulls? Those H_0 : p = 0.5 vs H_1 : p =/ 0.5 from stats?
from typing import Tuple
import math

def normal_approximation_to_binomial(n: int, p: float) -> Tuple[float, float]:
    """Returns mu and sigma corresponding to a Binomial(n, p)"""
    mu = p * n
    sigma = math.sqrt(p * (1 - p) * n)
    return mu, sigma

# CHAPTER 8:

Let's say you have an array of numbers (a vector). Let's say you also have a function f(x) = x^2. To find out at what point in the vector the function is increasing the most——to "maximize it——you'd use a derivative. However, you can also use the less accurate "difference quotient", which is essentially a line between two points, and the closer those two points are together, the closer it gets to the actual tangent line, whose slope we need:

Ender Eye: The ender eye acts like the gradient in gradient descent. It shows you the direction of the nearest stronghold. If you throw the ender eye and it moves rapidly in a certain direction, it means the stronghold is in that direction, and you should head that way.

Step Size: The step size in gradient descent is like the distance you travel each time you throw the ender eye. If the ender eye moves a lot, you take a big step. If it moves only a little, you take a small step. This helps you navigate efficiently towards the stronghold.

Finding the Stronghold: You keep throwing the ender eye, adjusting your direction and step size based on its movement, until you eventually reach the stronghold. The larger the movement of the ender eye, the more confident you are that you're heading in the right direction.

In [6]:
from typing import Callable
def difference_quotient(f, x, h,):
    return(f(x + h) - f(x)) / h

def square(x):
    return x * x

def derivative(x):
    return 2 * x # You'd have to manually calculate this, and plug it in for
                 # every damn point. Instead, we'll use the difference_quotient
                 # estimation, increasing by a step of h = 0.001 in a direction.

xs = range(-10, 11)
actuals = [derivative(x) for x in xs]
estimates = [difference_quotient(square, x, h = 0.001) for x in xs]

# Which would return the fact that difference quotients get pretty damn
# close to the derivatives at each point (the f'(x) equations overlap).

Multivariable Functions? Not to worry:

In [7]:
# But what if you had multiple variables? Just hold the other variables as fixed
# and treate the "nth" partial derivative as a function... I guess:

def partial_difference_quotient(f, v, i, h):
    """Returns the nth partial difference quotient of f, at v."""
    w = [v_j + (h if j == i else 0)
         for j, v_j in enumerate(v)] # Add h to ONLY the nth element of v.
    
    return (f(w)-f(v)) / h

def estimate_gradient(f, v, h):
    return [partial_difference_quotient(f, v, i, h)
            for i in range(len(v))]

Estimating the Gradient (Continued):

In [None]:
# But let's imagine we had no idea what the function was, or the fact
# its f'(x) moves linearlly. We'd start a random point and just go:

import random
# from scratch.linear_algebra import distance, add, scalar_multiply

def gradient_step(v):
    assert len(v) = len(gradient)
    step = scalar.multiply(step_size, gradient)
    return add(v, step) # Move up by a little.

def sum_of_squares_gradient(v):
    return [2 * v_i for v_i in v]

# Pick random starting point:
v = [random.uniform(-10, 10) for i in range(3)]

for epoch in range(1000):
    grad = sum_of_squares_gradient(v)
    v = gradient_step(v, grad, -0.01) # I don't know, man.
    print(epoch, v)

assert distance(v, [0, 0, 0]) < 0.01

Using the Gradient:

In [8]:
# Now let's say you have a vector whose data moves in a pattern, but you
# don't the function——model——that best fits that pattern. You'd use a
# gradient to find how accurate estimates vs. actual data is, and adjust
# accordingly:

inputs = [(x, 20 * x + 5) for x in range(-50, 50)] # X, Y = 20x + 5
def linear_gradient(x, y):
    slope, intercept = theta
    predicted = slope * x + intercept # A prediction for the function.
    error = (predicted - y)           # Error equation.
    squared_error = error ** 2        # Minimize squared error...
    grad = [2 * error * x, 2 * error] # Gradient is however much off.
    return grad                       # It's so big because that way
                                      # big errors are even more
    # egrigous, and the smaller they get, the more correct exponentially.
    # And if the error is negative, the change is a lot less worse.

In [None]:
# For the whole dataset——that was just one point lmao——you'll need to:

# 1.) Start with random values for slope and intercept:
theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

learning_rate = 0.001

for epoch in range(5000): # (Epoch simply a lot of iterations).
    # 2.) Compute the mean of the gradients.
    grad = vector_mean([linear_gradient(x, y, theta) for x, y in inputs])
    # 3.) Take a step in that direction.
    theta = gradient_step(theta, grad, -learning_rate)
    print(epoch, theta)

slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be less than 5"

Minibatch:

In [None]:
# As seems to be the pattern in this book, here's the better way of going
# about doing this... because the above method calculates it less
# effeciently, especially with big data sets:

from typing import TypeVar, List, Iterator

T = TypeVar('T') # This allows you to create "generic" functions

def minibatches(dataset, batch_size, shuffle):
    """Generates 'batch_size'-sized minibatches from the dataset."""
    # "Start" indexes from 0, batch_size, 2 * batch_size
    batch_starts = [start for start in range(0, len(dataset)), batch_size]
    if shuffle: random.shuffle(batch_starts) # Shuffle the batches.
    for start in batch_starts:
        end = start + batch_size
        yield dataset[start:end] # Wtf.

for epoch in range(1000):
    for batch in minibatches(inputs, batch_size = 20)
        grad = vector_mean([linear_gradient(x, y, theta) for x, y in batch])
        theta = gradient_step(theta, grad, -learning_rate)
    print(epoch, theta) # I'm going to assume this does the same gradient-
                        # averaging shit as the above code, except in terms
                        # of increasingly larger batches.
    
slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be less than 5"

Stochastic:

In [None]:
# You can also take steps based on one training example at a time:

theta = [random.uniform(-1, 1), random.uniform(-1, 1)]

for epoch in range(100):
    for x, y in inputs:
        grad = linear_gradient(x, y, theta)
        theta = gradient_step(theta, grad, -learning_rate)
    print(epoch, theta)

slope, intercept = theta
assert 19.9 < slope < 20.1, "slope should be about 20"
assert 4.9 < intercept < 5.1, "intercept should be less than 5"

# I genuinely have no idea how this is any different than before.

# CHAPTER 9:

# stdin and stdout:

In [None]:
# You can streamline data through <sys.stdin> and <sys.stdout>, for example,
# you can create an "egrep.py" system——one that searches for patterns or regular
# expressions in a specified location——like so:
# <sys.argv> is the list of command-line arguments (literally just the code).
# <sys.argv[0]> is the name of the program in-question.
# <sys.argv[1]> is the regular expression (regex) specified at the command line.
import sys, re
regex = sys.argv[1]
for line in sys.stdin:          # For every line of text passed into this program,
    if re.search(regex, line):  # if it matches a regular expression,
        sys.stdout.write(line)  # spit it back out!

# Here's one that counts the lines recieved and writes out the count (much like
# "line_count.py").
import sys
count = 0
for line in sys.stdin:
    count += 1
    print(count)
    
# Or count how many lines of a file contain numbers:
type SomeFile.txt | python egrep.py "[0-9]" | line_count.py
# The | is a pipe character, meaning "use the output of the left command as the
# input of the right command".

# Similarly, this script counts the words in its input and writes the most common
# ("most_common_words.py").
import sys
from collections import Counter

try:
    num_words = int(sys.argv[1]) # "Try" number of words as first argument.
except:
    print("usage: most_common_words.py num_words")
    sys.exit(1) # (Nonzero exit code would indicate an error)

counter = Counter(word.lower() # Lowercase words.
                  for line in sys.stdin
                  for word in line.strip().split() # Split on spaces
                  if word) # Skip empty "words".

for word, count in counter.most_common(num_words):
    sys.stdout.write(str(count))
    sys.stdout.write("\t")
    sys.stdout.write(word)
    sys.stdout.write("\n")

$ type the_bible.txt | python most_common_words.py 10 # Which would produce:
# 359330 the
# 321943 and
# .... and so on.

# Reading Files

THE BASICS:

In [None]:
# 'r' means "read-only", this is the assumed command if you leave one out:
file_for_reading = open('reading_file.txt', 'r')

# 'w' means write, it'll destroy the file if it already exists!
file_for_writing = open('writing_file.txt', 'w')

# 'a' is append, for adding to the end of the file:
file_for_appending = open('appending_file.txt', 'a')

# CLOSE your files when done:
file_for_writing.close()

# ...just so you don't forget to do so, use the above commands inside a
# <with> prompt so it closes automatically:
with open(filename) as f:
    data = function_that_gets_data_from(f)
process(data)

# If you need to read a whole text file, iterate over it using <for>:
starts_with_hash = 0
with open('input.txt') as f:
    for line in f:                 # Looks at each line in the file.
        if re.match("^#", line):   # Uses a regex to see if starts with #.
            starts_with_hash += 1  # If it does, add 1 to the count.

# You'll often want to <strip> the text into pieces. For example, you want
# to see the number of certain domains in a list of email adresses.
def get_domain(email_address: str) -> str:
    """Split on '@' and return the last piece."""
    return email_adress.lower().split("@")[-1]

assert get_domain('234242@glenbrook225.org') == 'glenbrook225.org'
assert get_domain('c.love.00029@gmail.com') == 'gmail.com'

from collections import Counter

with open('email_adresses.txt', 'r') as f:
    domain_counts = Counter(get_domain(line.strip())
                            for line in f
                            if "a" in line)

DELIMITED FILES:

In [None]:
# The previous example assumed you'd have one email per line, which is
# optimistic. To separate a chunk of text in a file, use <csv>. With 
# tab-delimited text (two columns with a space in-between):
import csv
with open('tab_delimited_stock_prices.txt') as f:
    tab_reader = csv.reader(f, delimiter = 't\)
    for row in tab_reader:
        date = row[0]
        symbol = row[1]
        closing_price = float(row[2]) # <float> is type annotation.
        process(date, symbol, closing_price) # See the TB for example data.

# Or with headers——dict at beginning for all data——everything in rows).
with open('colon_delimited_stock_prices.txt') as f:
    colon_reader = csv.DictReader(f, delimiter= ':')
    for dict_row in dict_row colon_reader:
        date = dict_row["date"]
        symbol = dict_row["symbol"]
        closing_price = float(dict_row["closing_price"])
        process(date, symbol, closing_price) # Again, see TB for context.

# You can also write out delimited data with <csv.writer>:

todays_prices = {'AAPL': 90.91, 'MSFT': 41.68}
with open('comma_delimited_stock_prices.txt', 'w') as f:
    csv_writer = csv.writer(f, delimiter = ',')
    for stock, price in todays_prices.items():
        csv_writer.writerow([stock, price])

# Given your og text has commas in it, all the data should write out well.

# Scraping the Web:

HTMLS AND PARSING THROUGH THEM:

In [None]:
# Instead of using python's built-in HTTP-request commands, install
# "Beautiful Soup library" and the <html5lib> parser. To stage an HTML:
from bs4 import BeautifulSoup
import requests
url = ("https:// ... .html")
text = requests.get(url).text
soup = BeautifulSoup(html, 'hmtl5lib')

# Now, you can use <tag> objects, which coorespond with the tags representing the
# structure on an HTML page. For example, to find the first <p> tag (paragraph):
first_paragraph = soup.find('p')

# Now get its text:
first_paragraph_text = soup.p.text
first_paragraph_words = soup.p.text.split()

# Extract its attributes:
first_paragraph_id = soup.p['id']       # Raises KeyError if no 'id'.
first_paragraph_id2 = soup.p.get('id')  # Returns "None" if no 'id'.

# Get multiple paragraphs:
all_paras = soup.find_all('p') # Or, just soup('p').
paras_w_ids = [p for p in soup('p') if p.get('id')]

# To find tags with a specific class:
important_paras = soup('p', {'class' : 'important'})
important_paras2 = soup('p', 'important')
important_paras3 = [p for p in soup('p')
                    if 'important' in p.get('class', [])]

# To find every element, say <span>, inside another, say <div>:
spans_inside_divs = [span
                     for div in soup('div')
                     for spain in div('span')]
# But be warned, if there one <span> sits within multiple <divs>, it'll return
# more than once, despite only being only element.

Go to the book for an excellent, excellent example of how to
use all of these to iterate over all congresspeoples' press releases for mentions of "data".

# Using APIs

Application programming interfaces allow you to request data in a structured format, saving you much of the trouble of above.

JSON AND XML:

In [None]:
# HTTP is a protocol for transferring text, so you need to "serialize" it into
# a string format, which often uses "JavaScript Object Notation" (JSON), which
# are essentially dicts:
{ "title" : "Data Science Book",
  "author" : "Joel Grus",
  "publicationYear" : 2019,
  "topics" : [ "data", "science", "data science" ] }

import json
serialized = """{ "title" : "Data Science Book",
                  "author" : "Joel Grus",
                  "publicationYear" : 2019,
                  "topics" : [ "data", "science", "data science" ] }"""

# "Parse" the JSON to create a dict using the <load> function:
deserialized = json.loads(serialized)
assert deserialized["publicationYear"] == 2019
assert "data science" in deserialized["topics"]

# "Sometimes an API provider hates you, and only provides responses in XML".
<Book>
  <Title>Data Science Book</Title>
  <Author>Joel Grus</Author> # ...and so on, so use BeautifulSoup to parse.

USING AN UNAUTHENTICATED API:

In [None]:
# As will be explained below, most APIs these days require authentication.
# To start, here's Github's API and what you can do with it unauthenticated:
import requests, json
github_user = "ConnorLove"
endpoint = f"https://api.github.com/users/{github_user}/repos"
repos = json.loads(requests.get(endpoint).text)

# At this point repos is a list of dicts, which we can use, say, to figure
# our which days of the week you're most likely to create a repo:
"created_at" : "2013-07-05T02:02:28Z"

python -m pip install python-dateutil
from collections import Counter
from dateutil.parser import parse

dates = [parse(repo["created_at"]) for repo in repos] # ...and so on.

# Or get the languages of the last five repos:

last_5_repos = sorted(repos,
                      key=lambda r: r["pushed_at"],
                      reverse = True) [:5]

last_5_langs = [repo["language"]
                for repo in last_5_repos]

# You don't need to know most of this.

FINDING APIS:

In [1]:
# If you need data from a specific site, look for a "developers" or "API"
# section of the site, and do a general internet search for "python
# <sitename> API" for the appropriate library. Or you could... scrape.

AN EXAMPLE USING TWITTER APIS:

In [None]:
# First, <python -m pip install twython> (install the twitter API library).
# Second, follow all his instructions for authenticating a developer 
# account with the API and secret API keys (always the trickiest part). 
# This is done so develops know you aren't wasting data.
# Following the rest of the steps in the book which I'm too lazy to 
# include, however also which are not very relevant to your education.

# CHAPTER 10

# Exploring Your Data:

One-Dimenstional Data:

In [None]:
# If you have a series of numbers, you can find mean/median/mode (easy),
# or create a histogram by dropping your data into distinct "buckets":
from typing import List, Dict
from collections import Counter
import math
import matplotlib.pyplot as plt

def bucketize(point: float, bucket_size: float) -> float:
    """Floor the point to the next lower multiple of bucket_size."""
    return bucket_size * math.floor(point / bucket_size)
    # Returns size of bucket and point divided by size of bucket?

def make_histogram(points: List[float], bucket_size: float) -> Dict[float, int]:
    """Buckets the points and counts how many in each bucket."""
    return Counter(bucketize(point, bucket_size) for point in points)
    # (See above)

def plot_histogram(points: List[float], bucket_size: float, title: str = ""):
    histogram = make_histogram(points, bucket_size)
    plt.bar(histogram.keys(), histogram.values(), width = bucket_size)
    plt.title(title)
    # Idk it makes a histogram.

Two Dimensions:

In [None]:
# Let's say you had two sets of ys for the same xs, eg: 

def random_normal() -> float:
    """Return a random draw from standard normal dist."""
    return inverse_normal_cdf(random.random())

xs = [random_normal() for _ in range(1000)]
ys1 = [x + random_normal() / 2 for x in xs]
ys1 = [-x + random_normal() / 2 for x in xs]

# If you plotted these (see that page), they'd have same xs, diff y correlations.

Many Dimensions:

In [None]:
# Um, you can use a correlation matrix, which I still sort of don't
# understand, but believe it displays the correlations between, say,
# x / y / z, and when a variable's column meets another's row, that's
# their correlation, represented.

# Using NamedTuples:

In [None]:
# If you have a lot of data, plugging it into Dicts over, and over, and over
# is prone to errors. Instead, use "NamedTuples", which will create suggestions
# in your editor when you're plugging in data (and you'll also save storage):

from collections import namedtuple

StockPrice = namedtuple('StockPrice', ['symbol', 'date', 'closing_price'])
price = StockPrice('MSFT', datetime.date(2018, 12, 14), 106.03)

assert price.symbol == 'MSFT' # T.A, because it's the right thing to do!
assert price.closing_price # See above!

# Great! But I want to type annotate, because it's the right thing to do:

from typing import Namedtuple

class StockPrice(NamedTuple):
    symbol: str
    date: datetime.date
    closing_price: float
    # (Ab)use the fact this is a class to create methods, remember, those
    # things you can use to define the class at large, not just individuals?

    def is_high_tech(self) -> bool:
        """It's a class, so we can add methods too!"""
        return self.symbol in ['MSFT', 'GOOG', 'FB', 'AMZN', 'AAPL']

price = StockPrice('MSFT', datetime.date(2018, 12, 14), 106.03)

# Blah blah blah, assert it all.

# Dataclasses:

In [None]:
# These are essentially NamedTuple's bastard cousin whose instance methods
# you can modify (again, instance methods are individual variables'
# alterations of the symbol, date, closing_price).

@dataclass # That decorator is the only change:
class StockPrice(NamedTuple):
    symbol: str
    date: datetime.date
    closing_price: float
    # (Ab)use the fact this is a class to create methods, remember, those
    # things you can use to define the class at large, not just individuals?

    def is_high_tech(self) -> bool:
        """It's a class, so we can add methods too!"""
        return self.symbol in ['MSFT', 'GOOG', 'FB', 'AMZN', 'AAPL']

price = StockPrice('MSFT', datetime.date(2018, 12, 14), 106.03)

price2.closing_price /= 2 # Stock split

# But, with the ability to alter dataclasses, comes the very same error we
# avoided with NamedTuples:

price2.cosing_price = 75 # Damn, "cosing".

# Cleaning and Munging:

In [None]:
# Your data may be dirt, clean it like so:

from dateutil.parser import parse

def parse_row(row: List[str]) -> StockPrice:
    symbol, date, closing_price = row
    return StockPrice(symbol = symbol,
                      date = parse(date).date(),
                      closing_price = float(closing_price))

stock = parse_row(["MSFT", "2018-12-14", "106.03"])

# (Assertions, yada yada yada)*.
# Now, let's test it out:

from typing import Optional
import re

def try_parse_row(row: List[str]) -> Optional[StockPrice]:
    symbol, date_, closing_price_ = row # The extra _'s are just for spacing.
    if not re.match(r"^[A-Z]+$", symbol): # "symbol" at end just means that's
        return None                       # the variable type it is.

    try:
        date = parse(date_).date()
    except ValueError                     # See exceptions, chapter two,
        return None                       # for the method.

    try:
        closing_price = float(closing_price_)
    except Value Error
        return None

    return StockPrice(symbol, date, closing_price)

# (*), except if you try to assert:
assert try_parse_row(["MSFT0", "2020-12-14", "106.03"]) is None # !!!!!

# Manipulating Data

In [None]:
date = [
    StockPrice(symbol = 'MSFT',
               date = datetime.date(2018, 12, 24),
               closing_price = 106.03),
    # ... and so on, including all of Apple's, and all entries generally.

# Let's manipulate, starting with an example of grabbing Apple's highest-ever
# closing price. Think about this logically, like you were a program.
# 1.) Restrict yourself to Apple rows.
# 2.) Grab "closing prices" from each row.
# 3.) Grab the "max" from each row.

max_appl_price = max(stock_price.closing_price
                     for stock_price in date
                     if stock_price.symbol == "APPL") # ...easy enough.

# But what if we wished for the highest closing price of each stock? Logic:
# 1.) Create a dict to keep track of the higest values, and their stocks.
# 2.) Iterate over your data, updating it every day:

from collections import deafultdict

max_prices: Dict[str, float] = defaultdict(lamba: float('inf'))

# for stockprice
for sp in data:
    symbol, closing_price = sp.symbol, sp.closing_price
    if closing_price > max_prices[symbol]:
        max_prices[symbol] = closing_price

# But what are the largest and smallest one-day percent-shifts in all stocks?
# 1.) Order the prices by date.
# 2.) Zip together pairs (previous, current).
# 3.) Convert zipped pairs into "percent change" rows.

from typing import List
from collections import defaultdict

# 0.) Group the prices by symbol.
prices: Dict[str, List[StockPrice]] = defaultdict(list)

# Add stock price to the symbol, creating the new variable symbol_prices.
for sp in data:
    prices[sp.symbol].append(sp)

# Now that we have a bunch of prices with same symbol, sort them by date
# instead, so they're sequential.

prices = {symbol: sorted(symbol_prices)
          for symbol, symbol_prices in prices.items()}

# Now define pct-change:

def pct_change(yesterday: StockPrice, Today: StockPrice) -> float:
    return today.closing_price / yesterday.closing_price - 1 # The equation.

class DailyChange(NamedTuple):
    symbol: str
    date: datetime.date
    pct_change: float

def day_over_day_changes(prices: List[StockPrice]) -> List[DailyChange]:
    """Assumes prices are for one stock and are in order."""
    return [DailyChange(symbol = today.symbol,
                        date = today.date,
                        pct_change = pct_change(yesterday, today))]
    for yesterday, today in zip(prices, prices[1:])]

# Now collect the pct-changes created by your function!

all_changes = [change
               for symbol_prices in prices.values()
               for change in day_over_day_changes(symbol_prices)]

# Now find your max change for a given day:
max_changes = max(all_changes, key = lambda change: change.pct_change)
assert max_change.symbol == 'APPL'
assert max_change.date == date.date(1997, 8, 6)
assert 0.33 < max_change.pct_change < 0.34      # Assertions just in case!

# Rescaling 

"TQDM"s, or "progress" bars, show how long a computation is taking.
They don't even need their own section, so just reference the book's
"An Aside: tqdm" section if you'd like to include them in your work.

# Dimensionality Reduction:

In [None]:
# Sometimes, the dimensions of a dataset are all wack because they're not
# actually on the y/x-axis, beginning with 0, 0. Fear not, here's how we
# go about fixing this / interpreting it:
# 1.) Translate data so each dimensions has a mean of 0. 

from scratch.linear_algebra import subtract
def de_mean(data: List[Vector]) -> List[Vector]:
    """Re-centers the data so it has a mean of 0 in every direction,
    effectively a 'properly'-square graph!"""
    mean = vector_mean(data)
    return [subtract(vector, mean) for vector in data]

# 2.) "Computer, which direction captures the greatest variance in data?"
# (Specifically, given a direction "d" = magnitude of 1, every point is
# labeled as a vector "w" with magnitude:

from scratch.linear_algebra import magnitude
def direction(w: Vector) -> Vector:
    mag = magnitude(w)
    return [w_i / mag for w_i in w]

# Now use all those mini-w's to compute a general-graph variance
# (with a big funny arrow):

from scratch.linear_algebra import dot
def directional_variance(data: List[Vector], w: Vector) -> float:
    """Returns the variance of x in the direction of w."""
    w_dir = direction(w)
    return sum(dot(v, w_dir ** 2 for v in data))

# Just read the rest of this section! It's mostly coding! You get a
# big arrow at the end! I'm almost certainly missing a key element
# called the "principal component", but I most certainly don't care.

# CHAPTER 11:

Models = Formulae (take in inputs, produce outputs).

Machine Learning = Models based on data, that might be able to predict new data.

Supervised Models = Computer can check predictions against actually correct answers.

Unsupervised = None supplied.

Semisupervised = Some data labeled as correct.

Online = Model continuously adjusts to new, incoming data.

Reinforcemnet = After model makes certain number of predictions, gets score.

Because you could theoretically use any model for any dataset (like you could use any graph for any dataset), we'll make an assumption that one (eg; decision tree, linear function) describes it best, then make the best version of one of those.

Underfitting = Not a good model

Overfitting = Only good for one set of data, bad for others (eg; uses inputs from that dataset for model, rather than predicting)

In [None]:
# To avoid this, you split up your data into thirds, 2/3 used to train the
# data, the remaining 1/3 tested upon:

import random
from typing import TypeVar, List, Tuple
X = TypeVar('X') # Generic type to rep data point (input)

def split_data(data):
    """Split data into fractions [prob, 1 - prob]"""
    data = data[:] # Make a copy!
    random.shuffle(data) # ...because shuffle modifies ur list
    cut = int(len(data) * prob) # Use prob to find a cutoff
    return data[:cut], data[cut:] # And split the list there

data = [n for n in range(1000)]
train, test = split_data(data, 0.75)

# Proportions should be correct:
assert len(train) == 750
assert len(test) ==250

# And o.g data should be preserved somehow
assert sorted(train + test) == data

# And you'll likely have input variables which correlate with outputs.
# So you'll need to put them together in either the training or test:

Y = TypeVar('Y') # Generic type to rep data point (input)

def train_test_split(xs, ys, test_pct):
    # Generate indices and split them
    idxs = [i for i in range(len(xs))]
    train_idxs, test_idxs = split_Date(idxs, 1 - test_pct)

    return ([xs[i]] for i in train idxs, # x_train
            .....) # You've essentially "tagged" each x with its y.

# After which you can do something like:

model = SomeKindOfModel()
x_train, x_test, y_train, y_test = train_test_split(xs, ys, 0.33)
model.train(x_train, y_train)
performance = model.test(x_test, y_test)

# This is pretty good, and if a model performs well on the training set
# there's a good chance it works on the test set.

# However, you could still run into the problem of the model simply
# IDENTIFYING variables in this set, rather than discovering relationships
# between ATTRIBUTES... especially if you have recurring variables (say,
# users' daily data usage).

# More problematically, you might've just found a model that works well
# on both, and have effectively just run two training sessions, as opposed
# to how it would perform on any other larger dataset.

Correctness:

- True positive = "The message is spam, and we correctly predicted spam."
- False positive = "The message is not spam, but we predicted spam."
- True negative = "The message is not spam, and we correctly predicted not spam."
- False negative = "The message is spam, but we predicted not spam." 

Take a hypothesis that kids named "Luke" will have Leukimia. You can calculate the accuracy (correct/total) of this hypothesis with a TP/FP/TN/FN table, and:

In [2]:
def accuracy(tp, fp, fn, tn):
    correct = tp + tn
    total = tp + fp + fn + tn
    return correct / total # Returns 0.98114... very impressive.

Precision = How accurate positive predictions are:

In [4]:
def precision(tp, fp, fn, tn):
    return tp / (tp + fp) # Returns 0.014... less impressive

Recall = How many positives did we identify:

In [3]:
def recall(tp, fp, fn, tn):
    return tp / (tp + fn) # Returns 0.005... even worse

F1 Score = Precision plus recall:

In [5]:
def f1_score(tp, fp, fn, tn):
    p = precision(tp, fp, fn, tn)
    r = recall(tp, fp, fn, tn)
    return 2 * p * r / (p + r)

This is otherwise known as the "Harmonic mean" of precision and recall, lying between them. Usually, all models involve tradeoff. A model that predicts “yes” when it’s even a little bit confident will probably have a high recall but a low precision; a model that predicts “yes” only when it’s extremely confident is likely to have a low recall and a high precision.

This applies to leukemia, bc if you are less confident more people will not be included in doctor's confirmation of having leukemia.

The Bias-Variance Tradeoff:

High Bias / Low Variance = Bad model for your dataset, but would be equally bad for all datasets (...underfitting)
- Add more features!

Low Bias / High Variance = Perfect model for your dataset, but would completely change for others (...overfitting)
- Remove features
- Give it more data to work through!

Features = Inputs to the model (ex; spam detector)
- Can be yes/no (eg; does the email contain word 'viagra')
- Can be quantative (eg; how many times does 'd' appear)
- Can be qualatative (eg; what was domain of the sender)

- ...which follow these categories:
    - Naives Bayes classifier (yes or no)
    - Regression models (use numbers, dummy variables, 0s and 1s)
    - Decision trees (numerical AND categorical)

# CHAPTER 12:

Nearest neighbors = Looking at closest resembling data in a dataset to make assumptions about a data point (eg; you leave in Northbrook, your literal closest neighbors are Dems, you are assumed to be a dem.)

In [None]:
# In real data, you'd have labels which categorize data, helpful
# for our purposes! If you wanted to create a nearest neighbor,
# you'd pick a number "K", find "K" nearest points in one single
# label, then produce new data point with label in the "middle":

from typing import List
from collections import Counter

def raw_majority_vote(labels):
    votes = Counter(labels)
    winner, _ = votes.most_common(1)[0]
    return winner

# In data set of ['a', 'b', 'c', 'b'], would return 'b'.

# What if you have tie?:

def majority_vote(labels):
    """Assumed that labels ordered nearest --> farthest."""
    vote_counts = Counter(labels)
    winner, winner_count = vote_counts.most_common(1)[0]
    num_winners = len([count
                       for count in vote_counts.values()
                       if count == winner_count])
    if num_winners == 1:
        return winner # If there's only one most common
    else:
        return majority_vote(labels[:-1]) # Try again w/out farthest
    
# In data set of ['a', 'b', 'c', 'b', 'a'], would return 'b'.

# You can create a classifier so the data IS, in fact, ordered:

from typing import NamedTuple
from scratch.linear_algebra import Vector, distance

class LabeledPoint(NamedTuple):
    point: Vector
    label: str

def knn_classify(k, labeled_points, new_point):
    # Order labeled points nearest --> farthest
    by_distance = sorted(labeled_points,
                         key = lambda lp: distance(lp.point, new_point))
    # Find labels for "K" closest
    k_nearest_labels = [lp.label for lp in by_distance[:k]]
    # Let those "K" closest "vote" on nearest neighbor
    return majority_vote(k_nearest_labels)

Ex; "The Iris Dataset"

In [None]:
# sepal_length, sepal_width, petal_length, petal_width, class
# ex; (5.1, 3.5, 1.4, 0.2, Iris-setosa)

# Turn all the rows into LabeledPoints:

def typing import Dict
import csv # (flower file)
from collections import defaultdict

def parse_iris_row(row):
    measurements = [float(value) for value in row[:-1]]
    # Class is "Iris-setosa, we only want Setosa"
    label = row[-1].split("-")[-1]
    return LabeledPoint(measurements, label)

with open('iris.data') as f:
    reader = csv.reader(f)
    iris_data = [parse_iris_row(row) for row in reader]

points_by_species: Dict = defaultdict
for iris in iris_data:
    points_by_species[iris.label].append(iris.point)

# He then proceeds to make a bunch of cool plots that I don't
# want to type out.

The Curse of Dimensionality:

In [None]:
# In higher dimensions (eg; with more variables to describe
# data), your data's gonna space out more, and won't be as useful
# for finding nearest neighbors.

# Just as good practice, let's generate some points and calculate
# the average distances between them:

def random_point(dim):
    return [random.random() for _ in range(dim)]

def random_distances(dim, num_pairs):
    return [distance(random_point(dim), random_point(dim))
            for _ in range(num_pairs)]

import tqdm
dimensions = range(1, 101) # Range 1 --> 100 (remember takes one off)

avg_distance = []
min_distance = []

random.seed(0)
for dim in tqdm.tqdm(dimensions, desc = "Curse of Dimensionality"):
    distances = random_distances(dim, 10000) # 10000 random pairs
    avg_distance.append(sum(distances) / 10000) # Track the avg.
    min_distance.append(min(distances)) # Track the min

# As the num dimensions incr, the avg distance b/t points incr.
# Closest pts aren’t much closer than average, so 2 pts being close
# don't mean shit.

# CHAPTER 14:

So let's say you've tested and come up with a good linear model to show relationship between x and y... you'll need to determine WHY it is that way:

y = Beta(x) + alpha + funky lookin' e

y = predicted output

Beta = constant multiplied by input

Alpha = y-intercept

The "e" = Margin of error

In [None]:
def predict (alpha, beta):
    return beta * x_i * alpha

def error(alpha, beta, x_i, y_i):
    """
    The error from predicting beta * x_i + alpha,
    when actual value is y_i
    """
    return predict(alpha, beta, x) - y

# But you'd like to know error over entire dataset, so you'll
# add up ALL squared errors:

from scratch.linear_algebra import Vector

def sum_of_sqerrors(alpha, beta, x, y):
    return sum(error(alpha, beta, x_i, y_i) ** 2
               for x_i, y_i in zip(x, y))

# The "least squares solution" is that bearing the lowest
# sum_of_sqerrors, its error-minimizing alphas and betas can
# be calculated as such:

from typing import Tuple
from scratch.linear_algebra import Vector
from scratch.statistics import correlation, standard_deviation, mean

def least_squares_fit(x, y):
    """Given two vectors x and y, find least-squares alpha/beta."""
    beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    alpha = mean(y) - beta * mean(x)
    return alpha, beta

# Trust the math.

How to do R-squared, the beefier cousin of above:

In [None]:
def scratch.statistics import de_mean

def total_sum_of_squares(y):
    """The total squared variations of y_i's from their means"""
    return sum(v ** 2 for v in de_mean(y))

def r_squared(alpha, beta, x, y):
    """
    The fraction of variation in y captured by the model,
    equaling 1 - the fraction of variation in y NOT captured
    by the model
    """
    return 1.0 - (sum_of_sqerrors(alpha, beta, x, y) /
                  total_sum_of_squares(y))

Gradient Descent Strikes Back:

In [None]:
# If we know theta = [alpha, beta], we could also compute
# the model fit using gradient descent:

import random
import tqdm
from scratch.gradient_descent import gradient_step

num_epochs = 10000
random.seed(0)

guess = [random.random(), random.random()] # Choose rand
                                           # value to start.
learning_rate = 0.00001 # Gradient step

with tqdm.trange(num_epochs) as t:
    for _ in t:
        alpha, beta = guess

        # Partial derivative of loss with respect to alpha
        grad_a = sum(2 * error(alpha, beta, x_i, y_i)
                     for x_i, y_i in zip(num_friends_good
                                         daily_minutes_good))
        # This is in the context of a problem correlating
        # minutes spent on a site with friends.

        # Partial derivative of loss with respect to beta
        grad_b = sum(2 * error(alpha, beta, x_i, y_i)
                     for x_i, y_i in zip(num_friends_good
                                         daily_minutes_good))
        
        # Compute los to plug into the tqdm description
        loss = sum_of_sqerrors(alpha, beta,
                               num_friends_good, daily_minutes_good)
        t.set_description(f"loss: {loss:.3f}")

        # Update the guess after computation:
        guess = gradient_step(guess, [grad_a, grad_b], -learning_rate)

# Which should produce pretty much the same R^2 !! For more work :D

Maximum Likelihood Estimation:

Imagine, instead of computing the probability of distribution based on an unknown theta, you computed the likelihood of that distribution based on a known theta. The most likely theta would then become that which maximizes the likelihood of your data spread occuring exactly as it does.

- Another thing that exists is the "likelihood based on an entire dataset is the product of the individual likelihood" ... okay!

# CHAPTER 25:

I'm boutta show you the best way to process big data:

1. Use a "mapper" to turn each item into a key/value pair
2. Collect all pairs with identical keys
3. Use a "reducer" on each collection of ^ assoc value, produce output values for each key

In [None]:
# Word Count:

# Here, the keys are words, and for each word we'll 
from typing import Iterator, Tuple
def wc_mapper(document):
    """For each word in the document, produce (word, 1)"""
    for word in tokenize(document):
        yield (word, 1)

from typing import Iterable
def wc_reducer(word, counts):
    yield (word, sum(counts))

from collections import defaultdict
def word_count(documents):
    """Count the words in the input documents using MapReduce"""
    collector = defaultdict(list) # To store grouped values.

    for document in documents:
        for word, count in wc_mapper(document):
            collector[word].append(count)

    return [output 
            for word, counts in collector.items()
            for output in wc_reducer(word, counts)]

# If we had three documents "data science", "big data", "science fiction",
# the collector would contain

{"data" : [1, 1],
 "science" : [1, 1],
 "big" : [1],
 "fiction" : [1]}

# And wc_reducer would produce the counts for each word:

[("data", 2), ("science", 2), ("big", 1), ("fiction", 1)]

It good because it fast, trust.

In [None]:
# To put this simply in generic terms:

from typing import Callable, Iterable, Any, Tuple

# A key/value pair is just a tuple:
KV = Tuple[Any, Any]

# A Mapper is a Callable which "activates" the iterable for a specific key:
Mapper = Callable[..., Iterable[KV]]

# A Reducer is a function that takes a key and an interable of values
# and returns a key/value pair:
Reducer = Callable[[Any, Iterable], KV]

# 1.) Mapper gives any input a key
# 2.) Reducer combines all inputs with that key into a single clump

ex;

In [None]:
def data_science_day_mapper(status_update):
    """Yields (day_of_week, 1) if status_update contains "data science" """
    if "data science" in status_update["text"].lower():
        day_of_week = status_update["created_at"].weekday()
        yield (day_of_week, 1)

data_science_days = map_reduce(status_updates,
                               data_science_day_mapper,
                               sum_reducer)

ex;

In [None]:
def words_per_user_mapper(status_update):
    user = status_update["username"]
    for word in tokenize(status_update["text"]):
        yield (user, (word, 1))

def most_popular_word_reducer(user):
    """
    Given a sequence of (word, count) pairs,
    return the word with the highest total count.
    """
    word_counts = Counter()
    for word, count in words_and_counts:
        word_counts[word] += count

    word, count = word_counts.most_common(1)[0]

    yield (user, (word, count))

user_words = map_reduce(status_updates,
                        words_per_user_mapper,
                        most_popular_word_reducer)

Matrix Multiplication:

I am too tired for this I'm so sorry if this comes up on a test. Just read it.

Combiners:

- The reason you don't use (word, None) and then just take the length of all (as opposed to what we do) is because sometimes you have one machine designated for mapping, and another for reduction.
- Trust :)