In [23]:
import numpy as np
import pandas as pd

## Creating DataFrames
- From scratch
- From other collections
- From files
- From SQL queries

In [56]:
# DataFrame from scratch: pd.DataFrame() with no arguments creates an empty
# frame that columns can be added to afterwards.
fruits = pd.DataFrame()
# Print the type to confirm we are holding a pandas DataFrame object.
print(type(fruits))
# A bare trailing expression is what the notebook renders as the cell output.
fruits

<class 'pandas.core.frame.DataFrame'>


In [58]:
# Columns can be accessed with dot notation (e.g. fruits.quantity) or with
# bracket notation (e.g. fruits["quantity"]).
# When we create a new column, we'll use bracket notation.
# Each list supplies one value per row, so both lists have the same length.
fruits["fruits"] = ["kiwi", "mango", "guava"]
fruits["quantity"] = [12, 4, 2]
fruits

Unnamed: 0,fruits,quantity
0,kiwi,12
1,mango,4
2,guava,2


In [59]:
# Setting a column to be the index: the "fruits" column becomes the row labels
# and "quantity" is the only remaining data column.
# NOTE(review): `fruits` is rebound to the re-indexed frame, so re-running this
# cell on its own presumably fails because "fruits" is no longer a column - confirm.
fruits = fruits.set_index("fruits")
fruits

Unnamed: 0_level_0,quantity
fruits,Unnamed: 1_level_1
kiwi,12
mango,4
guava,2


In [60]:
# Go back to the default numeric index; the current index ("fruits") is turned
# back into a regular column.
fruits = fruits.reset_index()
fruits

Unnamed: 0,fruits,quantity
0,kiwi,12
1,mango,4
2,guava,2


In [51]:
# DataFrame from a list of lists (or array of arrays): every inner list becomes
# one row, and rows/columns get default 0-based integer labels.
tic_tac_toe = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
df = pd.DataFrame(data=tic_tac_toe)
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [53]:
# Replace the default integer labels with custom ones by assigning to .index
# (row labels) and .columns (column labels); one label is given per row/column.
df.index = ["a", "b", "c"]
df.columns = ["X", "Y", "Z"]
df

Unnamed: 0,X,Y,Z
a,1,2,3
b,4,5,6
c,7,8,9


In [55]:
# Relabel the rows 1..n instead of the default 0-based labels. The stop value is
# derived from the frame's length rather than hard-coded, so the cell keeps
# working if the number of rows changes.
df.index = range(1, len(df) + 1)
df

Unnamed: 0,X,Y,Z
1,1,2,3
2,4,5,6
3,7,8,9


In [62]:
# DataFrame from a list of dictionaries: each dictionary is one row and its
# keys become the column names.
ingredients = [
    {"name": "sweet onion", "quantity": 1, "price": 0.89},
    {"name": "avocado", "quantity": 5, "price": 1},
    {"name": "serrano peppers", "quantity": 2, "price": .25},
    {"name": "lime", "quantity": 2, "price": .10},
]

guacamole = pd.DataFrame(ingredients)
guacamole

Unnamed: 0,name,quantity,price
0,sweet onion,1,0.89
1,avocado,5,1.0
2,serrano peppers,2,0.25
3,lime,2,0.1


In [65]:
## Cost of the recipe: first price out each ingredient line
## (units needed times price per unit) ...
guacamole["item_total"] = guacamole["quantity"] * guacamole["price"]

## ... then add up every line to get the total for the whole recipe
guacamole["item_total"].sum()

6.59

In [70]:
# DataFrame from a dictionary of lists: each key becomes a column name and the
# list under it supplies that column's values, one per row.
# The raw dict gets its own name so `scales` only ever refers to the DataFrame.
scale_notes = {
    "note_number": [1, 2, 3, 4, 5, 6, 7, 8],
    "c_major_scale": ["c", "d", "e", "f", "g", "a", "b", "c"],
    "d_major_scale": ["D", "E", "F♯", "G", "A", "B", "C♯", "D"],
    "b_flat_major_scale": ["B♭", "C", "D", "E♭", "F", "G", "A", "B♭"]
}

scales = pd.DataFrame(scale_notes)

# The C major notes were entered in lowercase; upper-case them so they match the
# other scales. Bracket notation is used for the assignment, consistent with how
# columns are assigned elsewhere in this notebook.
scales["c_major_scale"] = scales["c_major_scale"].str.upper()
scales

Unnamed: 0,note_number,c_major_scale,d_major_scale,b_flat_major_scale
0,1,C,D,B♭
1,2,D,E,C
2,3,E,F♯,D
3,4,F,G,E♭
4,5,G,A,F
5,6,A,B,G
6,7,B,C♯,A
7,8,C,D,B♭


In [71]:
# DataFrame from a .csv file: pd.read_csv parses the file into a DataFrame.
# "quotes.csv" is a relative path, so the file has to sit in the notebook's
# working directory.
quotes = pd.read_csv("quotes.csv")
quotes

Unnamed: 0,quote,author
0,"To go fast, go alone. To go far, go together",African Proverb
1,"In fact, the only way to manage stress is to b...",anomymous
2,Predispose yourself to practice,anonymous
3,The secret to building great products is not c...,Kathy Sierra
4,Writing is Nature's way of exposing how sloppy...,Guindon
5,The biggest issue on software teams is making ...,Martin Fowler
6,Promise me you'll always remember: you're brav...,Winnie the Pooh
7,The first rule of style is to have something t...,George Pólya
8,It's more fun to talk with someone who doesn't...,Winnie the Pooh
9,You don't start out writing good stuff. You st...,Octavia Butler


In [16]:
# DataFrame from a .json file: pd.read_json parses the file into a DataFrame.
# "more_quotes.json" is a relative path, so the file has to sit in the
# notebook's working directory.

more_quotes = pd.read_json("more_quotes.json")
more_quotes

Unnamed: 0,quote,author
0,"In many cases, the more you try to compete, th...",Kathy Sierra
1,Median is safe when you could use mean. Median...,Maggie Giust
2,"If you get, give. If you learn, teach",Maya Angelou
3,"It's tough to make predictions, especially abo...",Yogi Berra


In [78]:
## Create a DataFrame from a SQL Query

# Steps for having pandas return SQL results as a DataFrame:
# 1st step = all the right installs w/ python environment
# 2nd step = author syntactically correct SQL
# 3rd step = create a connection string for pandas (contains your hostname, username, password)
# 4th step = put everything together with pd.read_sql() and we get a dataframe

# Connection string shape: mysql+pymysql://<user>:<password>@<host>/<database>
# Example for the password "p@assw0rd" (its "@" has to be percent-encoded as %40,
# otherwise it is indistinguishable from the "@" that separates the host):
# mysql+pymysql://codeup:p%40assw0rd@123.123.123.123/some_db

from urllib.parse import quote

# Credentials live in a local env.py so they never appear in the notebook itself.
from env import host, user, password

# Percent-encode the credentials so reserved URL characters ("@", ":", "/", "%", ...)
# inside them cannot corrupt the connection string. Plain alphanumeric values
# pass through unchanged.
url = f'mysql+pymysql://{quote(user, safe="")}:{quote(password, safe="")}@{host}/employees'

In [79]:
# The ten highest salary records, largest first, fetched through the
# connection string built in the previous cell.
query = "SELECT * FROM salaries ORDER BY salary DESC LIMIT 10"
df = pd.read_sql(sql=query, con=url)
df

Unnamed: 0,emp_no,salary,from_date,to_date
0,43624,158220,2002-03-22,9999-01-01
1,43624,157821,2001-03-22,2002-03-22
2,254466,156286,2001-08-04,9999-01-01
3,47978,155709,2002-07-14,9999-01-01
4,253939,155513,2002-04-11,9999-01-01
5,109334,155377,2000-02-12,2001-02-11
6,109334,155190,2002-02-11,9999-01-01
7,109334,154888,2001-02-11,2002-02-11
8,109334,154885,1999-02-12,2000-02-12
9,80823,154459,2002-02-22,9999-01-01


## Aggregating DataFrame Values



## Grouping with `.groupby`

## Concatenating DataFrames

## Merging DataFrames