In [2]:
import pandas as pd
import numpy as np

In [3]:
# Synthetic gaming-usage records: every column holds one value per person.
player_names = ["Tal", "Hamad", "Angie", "Avi", "Noy", "Bar", "Angela"]

data = dict(
    name=player_names,
    gender=["Male", "Male", "Female", "Male", "Female", "Female", "Female"],
    usage=[5, 7, 4, 2, 1, 10, 8],
    device=["PS5", "XBox", "PC", "PS5", "XBox", "TV", "XBox"],
    # One random integer per person drawn from [100, 300); unseeded, so it varies per run.
    expense=np.random.randint(100, 300, size=len(player_names)),
)

data_frame = pd.DataFrame(data)
data_frame

Unnamed: 0,name,gender,usage,device,expense
0,Tal,Male,5,PS5,145
1,Hamad,Male,7,XBox,133
2,Angie,Female,4,PC,176
3,Avi,Male,2,PS5,255
4,Noy,Female,1,XBox,268
5,Bar,Female,10,TV,179
6,Angela,Female,8,XBox,216


In [4]:
# Print a summary of the frame: index range, column dtypes and non-null counts.
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     7 non-null      object
 1   gender   7 non-null      object
 2   usage    7 non-null      int64 
 3   device   7 non-null      object
 4   expense  7 non-null      int64 
dtypes: int64(2), object(3)
memory usage: 408.0+ bytes


#### Categorical data
1. Columns whose values fall into a fixed set of categories rather than continuous or arbitrary values.

In [6]:
# Count how many rows fall into each gender category.
gender_column = data_frame["gender"]
gender_column.value_counts()

Female    4
Male      3
Name: gender, dtype: int64

In [7]:
# Count how many rows use each device.
device_column = data_frame["device"]
device_column.value_counts()

XBox    3
PS5     2
PC      1
TV      1
Name: device, dtype: int64

In [8]:
# Keep only the rows where the gender is "Male" and the device is "XBox".
is_male = data_frame["gender"] == "Male"
uses_xbox = data_frame["device"] == "XBox"
data_frame[is_male & uses_xbox]

Unnamed: 0,name,gender,usage,device,expense
1,Hamad,Male,7,XBox,133


In [9]:
# Keep only the rows where the expense is at least 200 and the gender is "Female".
has_high_expense = data_frame["expense"] >= 200
is_female = data_frame["gender"] == "Female"
data_frame[has_high_expense & is_female]


Unnamed: 0,name,gender,usage,device,expense
4,Noy,Female,1,XBox,268
6,Angela,Female,8,XBox,216


##### Findings 1
* Focus more on advertising XBox products to the female audience.

In [10]:
# Add a random play count per row, drawn from [1, 10).
# The sample size follows the frame's length instead of a hard-coded 7, so the
# assignment keeps working if rows are added to or removed from the frame.
data_frame["times played"] = np.random.randint(low=1, high=10, size=len(data_frame))
data_frame

Unnamed: 0,name,gender,usage,device,expense,times played
0,Tal,Male,5,PS5,145,4
1,Hamad,Male,7,XBox,133,9
2,Angie,Female,4,PC,176,6
3,Avi,Male,2,PS5,255,1
4,Noy,Female,1,XBox,268,4
5,Bar,Female,10,TV,179,3
6,Angela,Female,8,XBox,216,1


In [11]:
# Derive "total hours" as usage multiplied by the number of times played.
hours_per_person = data_frame["usage"].mul(data_frame["times played"])
data_frame["total hours"] = hours_per_person
data_frame

Unnamed: 0,name,gender,usage,device,expense,times played,total hours
0,Tal,Male,5,PS5,145,4,20
1,Hamad,Male,7,XBox,133,9,63
2,Angie,Female,4,PC,176,6,24
3,Avi,Male,2,PS5,255,1,2
4,Noy,Female,1,XBox,268,4,4
5,Bar,Female,10,TV,179,3,30
6,Angela,Female,8,XBox,216,1,8


In [15]:
# Drop the "usage" column (columns= targets column labels; index= would target rows).
# Rebinding the result instead of using inplace=True, together with
# errors="ignore", keeps the cell safe to re-run: a second run is a no-op
# rather than a KeyError for the already-removed column.
data_frame = data_frame.drop(columns=["usage"], errors="ignore")
data_frame

Unnamed: 0,name,gender,device,expense,times played,total hours
0,Tal,Male,PS5,145,4,20
1,Hamad,Male,XBox,133,9,63
2,Angie,Female,PC,176,6,24
3,Avi,Male,PS5,255,1,2
4,Noy,Female,XBox,268,4,4
5,Bar,Female,TV,179,3,30
6,Angela,Female,XBox,216,1,8


In [16]:
# Drop the "times played" column now that "total hours" has been derived from it.
# Rebinding the result instead of using inplace=True, together with
# errors="ignore", keeps the cell safe to re-run: a second run is a no-op
# rather than a KeyError for the already-removed column.
data_frame = data_frame.drop(columns=["times played"], errors="ignore")
data_frame

Unnamed: 0,name,gender,device,expense,total hours
0,Tal,Male,PS5,145,20
1,Hamad,Male,XBox,133,63
2,Angie,Female,PC,176,24
3,Avi,Male,PS5,255,2
4,Noy,Female,XBox,268,4
5,Bar,Female,TV,179,30
6,Angela,Female,XBox,216,8


#### Dataset #2

In [43]:
# Toy restaurant reservations: every column holds one value per guest.
guest_names = ["Bob", "Ahmad", "Tony", "Josh", "Duke"]
meal_times = ["Breakfast", "Dinner", "Dinner", "Lunch", "Breakfast"]
phone_numbers = [702754212, 344748992, 3944299, 200488323, 101983883]

reservations = {
    "name": guest_names,
    "time": meal_times,
    # One random bill per guest drawn from [10, 50); unseeded, so it varies per run.
    "total bill": np.random.randint(10, 50, size=len(guest_names)),
    "phone": phone_numbers,
}

df = pd.DataFrame(reservations)
df

Unnamed: 0,name,time,total bill,phone
0,Bob,Breakfast,21,702754212
1,Ahmad,Dinner,43,344748992
2,Tony,Dinner,10,3944299
3,Josh,Lunch,32,200488323
4,Duke,Breakfast,46,101983883


In [44]:
# Print a summary of the reservations frame: index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        5 non-null      object
 1   time        5 non-null      object
 2   total bill  5 non-null      int64 
 3   phone       5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 288.0+ bytes


In [45]:
def mask(val):
    """Return ``val`` as text with all but its last four characters hidden.

    The value is converted with ``str``; the result is always five asterisks
    followed by at most the final four characters of that text.
    """
    visible_tail = str(val)[-4:]
    return f"*****{visible_tail}"

In [46]:
# Quick check of mask on a sample number: only the last four digits stay visible.
mask(1294858594)

'*****8594'

In [47]:
# Replace every phone number with its masked form.
masked_phones = df["phone"].apply(mask)
df["phone"] = masked_phones
df

Unnamed: 0,name,time,total bill,phone
0,Bob,Breakfast,21,*****4212
1,Ahmad,Dinner,43,*****8992
2,Tony,Dinner,10,*****4299
3,Josh,Lunch,32,*****8323
4,Duke,Breakfast,46,*****3883


In [48]:
# Count how many reservations fall into each meal time.
time_column = df["time"]
time_column.value_counts()

Breakfast    2
Dinner       2
Lunch        1
Name: time, dtype: int64

* Focus more on improving the lunch hours, and make sure to keep breakfast and dinner as they are.

In [49]:
def generate_price_level(val):
    """Bucket a bill amount into a price tier.

    Returns "$" for amounts below 15, "$$" for amounts from 15 up to (but not
    including) 30, and "$$$" for amounts of 30 and above.
    """
    if val < 15:
        return "$"

    # The lower bound is already implied by the branch above. The previous
    # test `val > 15` excluded exactly 15, which then fell through to "$$$".
    if val < 30:
        return "$$"

    return "$$$"

In [50]:
# Derive a price tier for every bill with generate_price_level.
price_levels = df["total bill"].apply(generate_price_level)
df["price level"] = price_levels
df

Unnamed: 0,name,time,total bill,phone,price level
0,Bob,Breakfast,21,*****4212,$$
1,Ahmad,Dinner,43,*****8992,$$$
2,Tony,Dinner,10,*****4299,$
3,Josh,Lunch,32,*****8323,$$$
4,Duke,Breakfast,46,*****3883,$$$


* Breakfast is really good (obviously it's expensive, so you guys must be using some magical cooking supplies). Please improve lunch again (add Chinese).

In [51]:
# Remove the identifying columns in a single call.
# Rebinding the result instead of using inplace=True, together with
# errors="ignore", keeps the cell safe to re-run: a second run is a no-op
# rather than a KeyError for the already-removed columns.
df = df.drop(columns=["name", "phone"], errors="ignore")

In [52]:
# Display the reservations frame as it currently stands.
df

Unnamed: 0,time,total bill,price level
0,Breakfast,21,$$
1,Dinner,43,$$$
2,Dinner,10,$
3,Lunch,32,$$$
4,Breakfast,46,$$$


In [53]:
# Integer code for each meal-time label.
my_map = dict(Breakfast=0, Lunch=1, Dinner=2)

In [54]:
# Encode the meal-time labels as their integer codes from my_map.
# Note: this overwrites the column, so running the cell a second time looks up
# the already-encoded integers, which are not keys of my_map, and yields NaN.
encoded_time = df["time"].map(my_map)
df["time"] = encoded_time
df

Unnamed: 0,time,total bill,price level
0,0,21,$$
1,2,43,$$$
2,2,10,$
3,1,32,$$$
4,0,46,$$$


In [55]:
# Integer code for each price tier, from cheapest to most expensive.
my_map1 = dict(zip(["$", "$$", "$$$"], range(3)))

In [56]:
# Encode the price tiers as their integer codes from my_map1.
# Note: this overwrites the column, so running the cell a second time looks up
# the already-encoded integers, which are not keys of my_map1, and yields NaN.
encoded_price_level = df["price level"].map(my_map1)
df["price level"] = encoded_price_level
df

Unnamed: 0,time,total bill,price level
0,0,21,1
1,2,43,2
2,2,10,0
3,1,32,2
4,0,46,2


### Challenge
1. Practice creating a DataFrame of your own.
2. Use filter, apply, and map, and try to derive new data from the existing columns.