# Pandas

- Analyze 2D or multi dimensional data
- Table like data
- Internally they use numpy

In [None]:
import pandas as pd

In [6]:
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 34, 29, 32],
    'City': ['New York', 'Paris', 'Berlin', 'London']
}

data

{'Name': ['John', 'Anna', 'Peter', 'Linda'],
 'Age': [28, 34, 29, 32],
 'City': ['New York', 'Paris', 'Berlin', 'London']}

In [8]:
df = pd.DataFrame(data)

df

Unnamed: 0,Name,Age,City
0,John,28,New York
1,Anna,34,Paris
2,Peter,29,Berlin
3,Linda,32,London


### Filtering columns

```sql
SELECT Name FROM df
```

In [9]:
df['Name']

0     John
1     Anna
2    Peter
3    Linda
Name: Name, dtype: object

### First two rows

```sql
SELECT *
FROM df
LIMIT 2
```

In [12]:
df.head(2)

Unnamed: 0,Name,Age,City
0,John,28,New York
1,Anna,34,Paris


### Get a feel for the data
- Columns
- Data type
- Space occupied
- Null

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes


In [27]:
df.describe()

Unnamed: 0,Age
count,4.0
mean,30.75
std,2.753785
min,28.0
25%,28.75
50%,30.5
75%,32.5
max,34.0


In [26]:
df.describe(include="all")

Unnamed: 0,Name,Age,City
count,4,4.0,4
unique,4,,4
top,John,,New York
freq,1,,1
mean,,30.75,
std,,2.753785,
min,,28.0,
25%,,28.75,
50%,,30.5,
75%,,32.5,


In [17]:
df.columns

Index(['Name', 'Age', 'City'], dtype='object')

In [18]:
df.dtypes

Name    object
Age      int64
City    object
dtype: object

## Pandas Tasks

1. Load CSV as dataframe
2. Get first 5 rows
3. Get a feel for the dataframe
4. Get a feel for the data (Preliminary summarized info)

In [49]:
# Task 1.1

tips_df = pd.read_csv('./tips.csv')

In [None]:
# Task 1.2

tips_df.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [39]:
# Task 1.3

tips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   total_bill  244 non-null    float64
 1   tip         244 non-null    float64
 2   sex         244 non-null    object 
 3   smoker      244 non-null    object 
 4   day         244 non-null    object 
 5   time        244 non-null    object 
 6   size        244 non-null    int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 13.5+ KB


In [None]:
# Task 1.4

tips_df.describe(include="all")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
count,244.0,244.0,244,244,244,244,244.0
unique,,,2,2,4,2,
top,,,Male,No,Sat,Dinner,
freq,,,157,151,87,176,
mean,19.785943,2.998279,,,,,2.569672
std,8.902412,1.383638,,,,,0.9511
min,3.07,1.0,,,,,1.0
25%,13.3475,2.0,,,,,2.0
50%,17.795,2.9,,,,,2.0
75%,24.1275,3.5625,,,,,3.0


```sql
SELECT tip, day, time
FROM tips_df
LIMIT 10
```

In [48]:
tips_df[["tip", "day", "time"]].head(10)

Unnamed: 0,tip,day,time
0,1.01,Sun,Dinner
1,1.66,Sun,Dinner
2,3.5,Sun,Dinner
3,3.31,Sun,Dinner
4,3.61,Sun,Dinner
5,4.71,Sun,Dinner
6,2.0,Sun,Dinner
7,3.12,Sun,Dinner
8,1.96,Sun,Dinner
9,3.23,Sun,Dinner


## Add another column

In [None]:
avg_tip = round(tips_df['tip'] / tips_df['size'] * 100, 2)

avg_tip

0       50.50
1       55.33
2      116.67
3      165.50
4       90.25
        ...  
239    197.33
240    100.00
241    100.00
242     87.50
243    150.00
Length: 244, dtype: float64

In [57]:
tips_df['tips_per_person'] = avg_tip

tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_per_person
0,16.99,1.01,Female,No,Sun,Dinner,2,50.5
1,10.34,1.66,Male,No,Sun,Dinner,3,55.33
2,21.01,3.5,Male,No,Sun,Dinner,3,116.67
3,23.68,3.31,Male,No,Sun,Dinner,2,165.5
4,24.59,3.61,Female,No,Sun,Dinner,4,90.25


In [73]:
tips_df["tip_percent"] = round(tips_df['tip'] / tips_df['total_bill'] * 100, 2)

tips_df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_per_person,tip_percent
0,16.99,1.01,Female,No,Sun,Dinner,2,50.5,5.94
1,10.34,1.66,Male,No,Sun,Dinner,3,55.33,16.05
2,21.01,3.5,Male,No,Sun,Dinner,3,116.67,16.66
3,23.68,3.31,Male,No,Sun,Dinner,2,165.5,13.98
4,24.59,3.61,Female,No,Sun,Dinner,4,90.25,14.68


In [89]:
tips_df.sort_values(by="tip_percent", ascending=False)
tips_df.sort_values(by="total_bill", ascending=False)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_per_person,tip_percent
170,50.81,10.00,Male,Yes,Sat,Dinner,3,333.33,19.68
212,48.33,9.00,Male,No,Sat,Dinner,4,225.00,18.62
59,48.27,6.73,Male,No,Sat,Dinner,4,168.25,13.94
156,48.17,5.00,Male,No,Sun,Dinner,6,83.33,10.38
182,45.35,3.50,Male,Yes,Sun,Dinner,3,116.67,7.72
...,...,...,...,...,...,...,...,...,...
149,7.51,2.00,Male,No,Thur,Lunch,2,100.00,26.63
111,7.25,1.00,Female,No,Sat,Dinner,1,100.00,13.79
172,7.25,5.15,Male,Yes,Sun,Dinner,2,257.50,71.03
92,5.75,1.00,Female,Yes,Fri,Dinner,2,50.00,17.39


In [86]:
# Task 1

tips_df[tips_df['day'] == 'Sun']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_per_person,tip_percent
0,16.99,1.01,Female,No,Sun,Dinner,2,50.50,5.94
1,10.34,1.66,Male,No,Sun,Dinner,3,55.33,16.05
2,21.01,3.50,Male,No,Sun,Dinner,3,116.67,16.66
3,23.68,3.31,Male,No,Sun,Dinner,2,165.50,13.98
4,24.59,3.61,Female,No,Sun,Dinner,4,90.25,14.68
...,...,...,...,...,...,...,...,...,...
186,20.90,3.50,Female,Yes,Sun,Dinner,3,116.67,16.75
187,30.46,2.00,Male,Yes,Sun,Dinner,5,40.00,6.57
188,18.15,3.50,Female,Yes,Sun,Dinner,3,116.67,19.28
189,23.10,4.00,Male,Yes,Sun,Dinner,3,133.33,17.32


In [85]:
# Task 2

tips_df[tips_df["day"] == "Fri"][tips_df["tip"] > 3]

# Without warning:

tips_df_fri = tips_df[tips_df["day"] == 'Fri']

tips_df_fri[tips_df_fri["tip"] > 3]


  tips_df[tips_df["day"] == "Fri"][tips_df["tip"] > 3]


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_per_person,tip_percent
91,22.49,3.5,Male,No,Fri,Dinner,2,175.0,15.56
93,16.32,4.3,Female,Yes,Fri,Dinner,2,215.0,26.35
94,22.75,3.25,Female,No,Fri,Dinner,2,162.5,14.29
95,40.17,4.73,Male,Yes,Fri,Dinner,4,118.25,11.77
96,27.28,4.0,Male,Yes,Fri,Dinner,2,200.0,14.66
221,13.42,3.48,Female,Yes,Fri,Lunch,2,174.0,25.93


In [None]:
# Task 3

tips_df.sort_values(by="tip", ascending=False).head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tips_per_person,tip_percent
170,50.81,10.00,Male,Yes,Sat,Dinner,3,333.33,19.68
212,48.33,9.00,Male,No,Sat,Dinner,4,225.00,18.62
23,39.42,7.58,Male,No,Sat,Dinner,4,189.50,19.23
59,48.27,6.73,Male,No,Sat,Dinner,4,168.25,13.94
141,34.30,6.70,Male,No,Thur,Lunch,6,111.67,19.53
...,...,...,...,...,...,...,...,...,...
0,16.99,1.01,Female,No,Sun,Dinner,2,50.50,5.94
111,7.25,1.00,Female,No,Sat,Dinner,1,100.00,13.79
92,5.75,1.00,Female,Yes,Fri,Dinner,2,50.00,17.39
67,3.07,1.00,Female,Yes,Sat,Dinner,1,100.00,32.57
