# Visual studio code setup
Install jupyter : https://jupyter.org/install

add jupyter extensions

# This is a markdown cell. 
We can press shift+enter to "run" it to make it into pretty text. 

Some common tools:
# Headings large
## Headings medium
### Headings small

``` foo(x,y) ```

code snippets 

*Italics* 
**bold**

Note the lack of a space after the "*" above. With a space after the "*" we can make bulleted lists:
* Item 1
* Item 2 
* Item 3

____ 

^ above makes a horizontal line. 

www.markdownguide.org/cheat-sheet/ to learn more about markdown

In [None]:
# Python cell
# This is a Python (code) cell. Here a leading "#" does not make a heading; it starts a comment.
# We can do computations in one cell and keep the result in a variable.

y = 4 * ( 5 - 3)
y

In [None]:
# ...and the result is stored throughout the notebook, so later cells can use y
x = y - 8 
x 

In [None]:
# If we redefine y, x does not update by itself: it keeps its old value
# until the cell that computes x is rerun.
y = 2 
x

# More python basics

In [None]:
## Words and characters are stored in strings
school = "University of Minnesota"
age = "27"  # the quotes make this a string, not a number
print(len(school)) ## len() gives the number of characters (spaces included)

In [None]:
school.find("Minnesota") # we can search for specific words. "Minnesota" starts at position 14
## Look carefully: positions are counted from 0, so the first character "U" is position 0.

In [None]:
type(school)  # school was written in quotes, so it is a string (str)

In [None]:
type(age) # also a string, because "27" was written in quotes. How do we change this?

In [None]:
age = int(age) # convert to an integer
# Alternatively:
age = float(age) # convert to a float if we care about decimal places (this overwrites the integer)

# Lists, sets
1. List: an ordered collection of items, written with square brackets

In [None]:
words = school.split(" ") # split school into pieces, using each space " " as a separator
print(words)

In [None]:
type(words)  # split() gives back a list

Or we can make our own

In [None]:
spotify_history = ["Espresso", "Baboon", "Espresso", "Red Wine Supernova", "Espresso", "Red Wine Supernova"]

## How many songs did I listen to?
print(len(spotify_history))

## Did I listen to "Espresso"?
print("Espresso" in spotify_history)


# When was the first time I listened to it? (index of the first match)
print(spotify_history.index("Espresso"))

# How often?
print(spotify_history.count("Espresso"))


# Indexing
## What was the 2nd song I listened to?
print(spotify_history[1]) # not 2. Counting starts at 0

## What were the 1st through 4th songs I heard? (the end index 4 is not included)
print(spotify_history[0:4])

## We can add elements to the end
spotify_history.append("Birds of a feather")
print(spotify_history)

## Or remove the element at a specific spot
spotify_history.pop(1)
## .pop() with no argument removes the LAST element, i.e. it is the same as .pop(-1), not .pop(0)
print(spotify_history)

2. Sets: unordered. Good if we want to forget duplicates and do not care about order, but do care about membership. Uses curly brackets and is similar to mathematical sets, with intersection and union

In [None]:
# list.count() needs the value to look for; calling it with no argument raises a TypeError.
# Count how many times "Espresso" appears in the list, duplicates included.
spotify_history.count("Espresso")

In [None]:
spotify_history_set = set(spotify_history) # turn the list into a set; duplicates are dropped


print(spotify_history_set) # only the distinct songs remain (three, if the cells above were run once in order)

# Sets make it easier to compare two collections
spotify_friend_history_set = {"Espresso", "JUMP", "Golden" } # inline declaration with curly brackets

print(spotify_history_set.union(spotify_friend_history_set)) # what songs did either of us listen to?

print(spotify_history_set.intersection(spotify_friend_history_set) ) ## what songs did we both listen to?

# we can add a song 
spotify_friend_history_set.add("The Dead Dance")
print(spotify_friend_history_set)

# we can also remove an arbitrary element from the set (we do not get to choose which one)
# Makes sense if we want to process items one at a time but order doesn't matter
print(spotify_friend_history_set.pop())
# note that pop() gives us back the removed element

print(spotify_friend_history_set)

## Using loops with data structures.
The main point of putting elements into a data structure is to process them later. 
For loops are a popular way to do this.

In [None]:
# Walk through the songs in listening order, showing each title next to its length
for song in spotify_history:
    print(f"{song} {len(song)}")


In [None]:
# Find the song with the longest title (on a tie, the earliest one is kept)
max_song_length = 0
max_song_index = -1
for i, title in enumerate(spotify_history):
    title_length = len(title)
    if title_length > max_song_length:
        max_song_length, max_song_index = title_length, i
print(max_song_index, spotify_history[max_song_index])

# Functions
What we have been using so far (like `len` and `print`) are called functions, but we can also write our own. 

``` 
    def function_name(param1, param2):
        do stuff 
        do stuff
        do stuff
        return result 
```

In [None]:
def circle_area(radius):
    """Return the area of a circle with the given radius.

    Uses math.pi instead of a hand-typed approximation of pi, so the result
    is accurate to full floating-point precision.
    """
    import math  # local import keeps this cell runnable on its own

    return math.pi * radius ** 2
    # or return math.pi * radius * radius

In [None]:
circle_area(4)  # call our function with radius 4

# Installing packages
In the terminal, we can do 

``` conda install numpy scikit-learn matplotlib pandas ```

Then the following should work 

In [None]:
import numpy  as np 
import matplotlib.pyplot as plt 
import pandas as pd 
import sklearn 

In [None]:
import statsmodels  # I will show how to install this one live.

# Importing data

Pandas is the default tool for loading and managing data. It works with "dataframes", which are something like an Excel sheet, i.e. data in rows and columns with named columns. 

In [None]:
# Load the CSV file into a DataFrame (the relative path means Iris.csv is looked up in the working directory)
flower_data_raw = pd.read_csv("Iris.csv") 

In [None]:
flower_data_raw  # display the DataFrame

Here are some useful things to do in pandas. 

Getting just one column:

In [None]:
flower_data_raw["Id"]  # a single column name in [] selects that one column

Select multiple columns

In [None]:
flower_data_raw[ ["Id", "SepalLengthCm"]]  # a list of column names in [] selects several columns

Basic statistics over segments of the data

In [None]:
flower_data_raw.describe()  # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

#  Plotting

In [None]:
plt.plot(flower_data_raw["SepalWidthCm"])  # line plot of sepal width against the row index

# Adding more

In [None]:
# Same plot, now with axis labels, a title and a legend
plt.plot(flower_data_raw["SepalWidthCm"], label="SepalWidth")
plt.xlabel("Sample Number")
plt.ylabel("Width")
plt.title("Flower Sepal Width")
plt.legend()  # without this call the label= text is never drawn

But in this case the x axis doesn't really make sense.
Option 1: sort the data and plot that to get an idea of the distribution

In [None]:
# Plot the widths in increasing order to get a feel for their distribution
plt.plot(sorted(flower_data_raw["SepalWidthCm"]), label="SepalWidth")
plt.xlabel("Index")
plt.ylabel("Width")
plt.title("Flower Sepal Width")
plt.legend()  # without this call the label= text is never drawn

In [None]:
# Option 2: a histogram, which counts how many samples fall into each of 10 width ranges
plt.hist(flower_data_raw["SepalWidthCm"], bins=10, label="SepalWidth")
plt.xlabel("Width")
plt.ylabel("Frequency")
plt.title("Flower Sepal Width")
plt.legend()  # without this call the label= text is never drawn

In [None]:
# Or we could plot against something else

In [None]:
# Scatter plot of two measurements against each other: sepal length (x) vs. sepal width (y)
plt.scatter(flower_data_raw["SepalLengthCm"], flower_data_raw["SepalWidthCm"])
plt.xlabel("Sepal Length")
plt.ylabel("Sepal Width")
plt.title("Flower Sepal Width vs. Sepal Length")

In [None]:
import statsmodels # I don't have this one. Oops 

# Numpy 
Numpy is similar to Pandas but better for numerical operations

In [None]:
# Make a 4x4 numpy array and fill it entry by entry
A = np.zeros((4,4))
for i in range(4):
    for j in range(4):
        # i + j alone gives a singular (rank-2) matrix, because every row differs
        # from the previous one by the all-ones vector. Adding 1 on the diagonal
        # makes A invertible, so a linear system A x = v has a unique solution.
        A[i, j] = i + j + (i == j)

# Column vector of 4 random draws from a normal distribution (mean 4, standard deviation 2.3)
v = np.random.normal(4, 2.3, size=(4, 1))

# Two separate statements: "print(A), print(v)" would also build a (None, None)
# tuple that the notebook then shows as the cell's output.
print(A)
print(v)

In [None]:
# special operations 
A @ v   # @ makes this matrix multiplication: (4x4) times (4x1) gives a 4x1 column

In [None]:
np.linalg.solve(A, v)  # solve the linear system A x = v for x

In [None]:
np.linalg.inv(A) @ v # equivalent result, but np.linalg.solve(A, v) is preferred for numerical reasons

In [None]:
np.linalg.eigvals(A)  # eigenvalues of A

# Sklearn

Machine learning package for Python. Popular and very well documented. 
Let's use sklearn to figure out whether there is a strong relationship between the different flower measurements

In [None]:
from sklearn import linear_model
reg = linear_model.LinearRegression()
# Fit PetalLengthCm (the target) as a linear function of SepalLengthCm (the feature).
# The double brackets pass the feature as a one-column table rather than a single column.
reg.fit(flower_data_raw[["SepalLengthCm"]], flower_data_raw["PetalLengthCm"] )

In [None]:
reg.coef_  # fitted slope(s): one coefficient per feature column

In [None]:
reg.intercept_  # fitted intercept

In [None]:
# Predicted petal length for every row. For this one-feature model it is the same as
# reg.intercept_ + reg.coef_[0] * SepalLengthCm, computed for all rows at once
# and returned as one flat array.
predictions = reg.predict(flower_data_raw[["SepalLengthCm"]])

In [None]:
# Scatter the measured data, then overlay the fitted regression line on the same axes
plt.scatter(flower_data_raw["SepalLengthCm"], flower_data_raw["PetalLengthCm"], label="Actual Data")
plt.plot(flower_data_raw["SepalLengthCm"], predictions, color="black", ls="dashdot", label="prediction" )
plt.legend()  # show the label= text of both series
plt.xlabel("Sepal Length")
plt.ylabel("Petal Length")
plt.title("Flower Petal Length")

# Statsmodels
The most "R"-like package

In [None]:
import statsmodels.formula.api as smf

In [None]:
# R-style formula "response ~ predictor": model PetalLengthCm as a function of SepalLengthCm, then fit it
results = smf.ols("PetalLengthCm ~ SepalLengthCm", data= flower_data_raw).fit()

In [None]:
print(results.summary())  # print the summary table of the fitted model