# Comparison with SQL

In [2]:
 import pandas as pd
 import numpy as np

In [4]:
 # Load the tips dataset from the CSV file kept in the pandas repository.
 # NOTE(review): the URL points at the `master` branch — confirm it still resolves.
 url = (
     "https://raw.github.com/pandas-dev"
     "/pandas/master/pandas/tests/io/data/csv/tips.csv"
 )
 tips = pd.read_csv(url)
 tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## SELECT

In SQL, selection is done using a comma-separated list of columns you’d like to select.

```sql
SELECT total_bill, tip, smoker, time
FROM tips
LIMIT 5;
```

In [6]:
 # pandas counterpart of the SELECT above: pass a list of column names.
 wanted_columns = ["total_bill", "tip", "smoker", "time"]
 tips[wanted_columns].head(5)

Unnamed: 0,total_bill,tip,smoker,time
0,16.99,1.01,No,Dinner
1,10.34,1.66,No,Dinner
2,21.01,3.5,No,Dinner
3,23.68,3.31,No,Dinner
4,24.59,3.61,No,Dinner


**In SQL, you can add a calculated column:**

```sql
SELECT *, tip/total_bill AS tip_rate
FROM tips
LIMIT 5;
```

**With pandas, you can use the DataFrame.assign() method to append a new column:**

In [7]:
# Add a computed tip_rate column (tip divided by total bill) and show the first rows.
tips.assign(tip_rate=tips["tip"] / tips["total_bill"]).head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_rate
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


## WHERE

Filtering in SQL is done via a WHERE clause.

```sql
SELECT *
FROM tips
WHERE time = 'Dinner'
LIMIT 5;
```

**DataFrames can be filtered in multiple ways, the most intuitive of which is boolean indexing:**

In [9]:
# Boolean indexing: keep only the rows whose `time` equals "Dinner".
tips.loc[tips["time"] == "Dinner"].head(5)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
# Build the boolean mask on its own so it can be inspected and reused.
is_dinner = tips["time"].eq("Dinner")
is_dinner.value_counts()

True     176
False     68
Name: time, dtype: int64

In [11]:
# Index with the precomputed mask: rows where it is True are returned.
tips.loc[is_dinner].head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


**Just like SQL’s OR and AND, multiple conditions can be passed to a DataFrame using | (OR) and & (AND).**

```sql
SELECT *
FROM tips
WHERE time = 'Dinner' AND tip > 5.00;
```

In [16]:
# tips of more than $5.00 at Dinner meals
dinner_rows = tips["time"] == "Dinner"
over_five = tips["tip"] > 5.00
tips[dinner_rows & over_five]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
23,39.42,7.58,Male,No,Sat,Dinner,4
44,30.4,5.6,Male,No,Sun,Dinner,4
47,32.4,6.0,Male,No,Sun,Dinner,4
52,34.81,5.2,Female,No,Sun,Dinner,4
59,48.27,6.73,Male,No,Sat,Dinner,4
116,29.93,5.07,Male,No,Sun,Dinner,4
155,29.85,5.14,Female,No,Sun,Dinner,5
170,50.81,10.0,Male,Yes,Sat,Dinner,3
172,7.25,5.15,Male,Yes,Sun,Dinner,2
181,23.33,5.65,Male,Yes,Sun,Dinner,2


```sql
-- tips by parties of at least 5 diners OR bill total was more than $45
SELECT *
FROM tips
WHERE size >= 5 OR total_bill > 45;
```

In [17]:
# tips by parties of at least 5 diners OR bill total was more than $45
large_party = tips["size"] >= 5
large_bill = tips["total_bill"] > 45
tips[large_party | large_bill]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
59,48.27,6.73,Male,No,Sat,Dinner,4
125,29.8,4.2,Female,No,Thur,Lunch,6
141,34.3,6.7,Male,No,Thur,Lunch,6
142,41.19,5.0,Male,No,Thur,Lunch,5
143,27.05,5.0,Female,No,Thur,Lunch,6
155,29.85,5.14,Female,No,Sun,Dinner,5
156,48.17,5.0,Male,No,Sun,Dinner,6
170,50.81,10.0,Male,Yes,Sat,Dinner,3
182,45.35,3.5,Male,Yes,Sun,Dinner,3
185,20.69,5.0,Male,No,Sun,Dinner,5


**NULL checking is done using the notna() and isna() methods.**

In [None]:
# Small frame with one missing value per column, used for the NULL-check examples.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
frame = pd.DataFrame(
    {
        "col1": ["A", "B", np.nan, "C", "D"],
        "col2": ["F", np.nan, "G", "H", "I"],
    }
)
   

In [21]:
# Display the frame built above.
frame

Unnamed: 0,col1,col2
0,A,F
1,B,
2,,G
3,C,H
4,D,I


**Assume we have a table of the same structure as our DataFrame above. We can see only the records where col2 IS NULL with the following query:**

```sql
SELECT *
FROM frame
WHERE col2 IS NULL;
```

In [22]:
# pandas counterpart of `WHERE col2 IS NULL`.
frame.loc[frame["col2"].isna()]

Unnamed: 0,col1,col2
1,B,


**Getting items where col1 IS NOT NULL can be done with notna().**

```sql
SELECT *
FROM frame
WHERE col1 IS NOT NULL;
```

In [23]:
# pandas counterpart of `WHERE col1 IS NOT NULL`.
frame.loc[frame["col1"].notna()]

Unnamed: 0,col1,col2
0,A,F
1,B,
3,C,H
4,D,I


## GROUP BY

```sql
SELECT sex, count(*)
FROM tips
GROUP BY sex;
/*
Female     87
Male      157
*/
```


In [24]:
 # size() gives the number of rows in each group, matching COUNT(*).
 tips.groupby(by="sex").size()

sex
Female     87
Male      157
dtype: int64

**Notice that in the pandas code we used size() and not count(). This is because count() applies the function to each column, returning the number of non-null records within each.**

In [25]:
# count() is applied column by column, so every remaining column gets its own tally.
by_sex = tips.groupby("sex")
by_sex.count()

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Female,87,87,87,87,87,87
Male,157,157,157,157,157,157


In [26]:
# Alternatively, count() can be applied to a single column of the grouped data.
bills_by_sex = tips.groupby("sex")["total_bill"]
bills_by_sex.count()

sex
Female     87
Male      157
Name: total_bill, dtype: int64

```sql
SELECT day, AVG(tip), COUNT(*)
FROM tips
GROUP BY day;
/*
Fri   2.734737   19
Sat   2.993103   87
Sun   3.255132   76
Thur  2.771452   62
*/
```

In [27]:
# Mean tip and row count per day. The string aliases "mean" and "size" are used
# because passing NumPy callables (np.mean, np.size) to agg() is deprecated in
# recent pandas releases.
tips.groupby("day").agg({"tip": "mean", "day": "size"})

Unnamed: 0_level_0,tip,day
day,Unnamed: 1_level_1,Unnamed: 2_level_1
Fri,2.734737,19
Sat,2.993103,87
Sun,3.255132,76
Thur,2.771452,62


Grouping by more than one column is done by passing a list of columns to the groupby() method.

```sql
SELECT smoker, day, COUNT(*), AVG(tip)
FROM tips
GROUP BY smoker, day;
/*
smoker day
No     Fri      4  2.812500
       Sat     45  3.102889
       Sun     57  3.167895
       Thur    45  2.673778
Yes    Fri     15  2.714000
       Sat     42  2.875476
       Sun     19  3.516842
       Thur    17  3.030000
*/
```

In [28]:
# Row count and mean tip for each (smoker, day) pair. The string aliases "size"
# and "mean" are used because passing NumPy callables (np.size, np.mean) to
# agg() is deprecated in recent pandas releases.
tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean
smoker,day,Unnamed: 2_level_2,Unnamed: 3_level_2
No,Fri,4.0,2.8125
No,Sat,45.0,3.102889
No,Sun,57.0,3.167895
No,Thur,45.0,2.673778
Yes,Fri,15.0,2.714
Yes,Sat,42.0,2.875476
Yes,Sun,19.0,3.516842
Yes,Thur,17.0,3.03
