# Useful Methods

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('tips.csv')

In [3]:
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458


## .apply()

This method is use to apply one function to the column

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   total_bill        244 non-null    float64
 1   tip               244 non-null    float64
 2   sex               244 non-null    object 
 3   smoker            244 non-null    object 
 4   day               244 non-null    object 
 5   time              244 non-null    object 
 6   size              244 non-null    int64  
 7   price_per_person  244 non-null    float64
 8   Payer Name        244 non-null    object 
 9   CC Number         244 non-null    int64  
 10  Payment ID        244 non-null    object 
dtypes: float64(3), int64(2), object(6)
memory usage: 21.1+ KB


In [5]:
def first_four(num):
    return str(num)[:4]

In [6]:
df['CC Number'][0]

3560325168603410

In [7]:
first_four(3560325168603410)

'3560'

In [8]:
# apply first four to all cc number and create new column first_four

In [9]:
df['first_four'] = df['CC Number'].apply(first_four)

In [10]:
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3560
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,4478
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,6011


In [11]:
df['price_per_person'].mean()

7.888196721311474

In [12]:
def star(price):
    if price < 5:
        return '*'
    elif price >= 5 and price < 7.88:
        return '**'
    else:
        return '***'

In [13]:
df['Star'] = df['price_per_person'].apply(star)

In [14]:
df.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3560,***
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,4478,*
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,6011,**
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,4676,***
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,4832,**


apply method with lambda

In [15]:
# This is normal function 
def simple(num):
    return num*2

In [16]:
# This is lambda function
lambda num: num*2

<function __main__.<lambda>(num)>

In [17]:
df['total_bill'].apply(lambda bill:bill*0.18)

0      3.0582
1      1.8612
2      3.7818
3      4.2624
4      4.4262
        ...  
239    5.2254
240    4.8924
241    4.0806
242    3.2076
243    3.3804
Name: total_bill, Length: 244, dtype: float64

## apply use for multiple columns

Note, there are several ways to do this:

https://stackoverflow.com/questions/19914937/applying-function-with-multiple-arguments-to-create-a-new-pandas-column


In [18]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3560,***
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,4478,*
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,6011,**
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,4676,***
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,4832,**


In [19]:
def quality(total_bill,tip):
    if tip/total_bill  > 0.25:
        return "Generous"
    else:
        return "Other"

In [20]:
#using lambda we can apply method so many columns
df['Tip Quality'] = df[['total_bill','tip']].apply(lambda df: quality(df['total_bill'],df['tip']),axis=1)

In [21]:
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3560,***,Other
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,4478,*,Other
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,6011,**,Other


In [22]:
df.tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
241,22.67,2.0,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880,6011,***,Other
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17,4375,***,Other
243,18.78,3.0,Female,No,Thur,Dinner,2,9.39,Michelle Hardin,3511451626698139,Thur672,3511,***,Other


In [23]:
# we can also use vectorize
df['Tip Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])

In [24]:
df.tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
241,22.67,2.0,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880,6011,***,Other
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17,4375,***,Other
243,18.78,3.0,Female,No,Thur,Dinner,2,9.39,Michelle Hardin,3511451626698139,Thur672,3511,***,Other


## statistical summaries

In [25]:
df.describe()

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
count,244.0,244.0,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672,7.888197,2563496000000000.0
std,8.902412,1.383638,0.9511,2.914234,2369340000000000.0
min,3.07,1.0,1.0,2.88,60406790000.0
25%,13.3475,2.0,2.0,5.8,30407310000000.0
50%,17.795,2.9,2.0,7.255,3525318000000000.0
75%,24.1275,3.5625,3.0,9.39,4553675000000000.0
max,50.81,10.0,6.0,20.27,6596454000000000.0


In [26]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_bill,244.0,19.78594,8.902412,3.07,13.3475,17.795,24.1275,50.81
tip,244.0,2.998279,1.383638,1.0,2.0,2.9,3.5625,10.0
size,244.0,2.569672,0.9510998,1.0,2.0,2.0,3.0,6.0
price_per_person,244.0,7.888197,2.914234,2.88,5.8,7.255,9.39,20.27
CC Number,244.0,2563496000000000.0,2369340000000000.0,60406790000.0,30407310000000.0,3525318000000000.0,4553675000000000.0,6596454000000000.0


## sort_values()

In [27]:
df.sort_values('price_per_person').head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
92,5.75,1.0,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780,3508,*,Other
67,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455,4359,*,Generous
16,10.33,1.67,Female,No,Sun,Dinner,3,3.44,Elizabeth Foster,4240025044626033,Sun9715,4240,*,Other


In [28]:
df.sort_values(['price_per_person','tip']).tail(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954,5473,***,Other
179,34.63,3.55,Male,Yes,Sun,Dinner,2,17.32,Brian Bailey,346656312114848,Sun9851,3466,***,Other
184,40.55,3.0,Male,Yes,Sun,Dinner,2,20.27,Stephen Cox,3547798222044029,Sun5140,3547,***,Other


## Correlation

In [29]:
df.corr()

Unnamed: 0,total_bill,tip,size,price_per_person,CC Number
total_bill,1.0,0.675734,0.598315,0.647554,0.104576
tip,0.675734,1.0,0.489299,0.347405,0.110857
size,0.598315,0.489299,1.0,-0.175359,-0.030239
price_per_person,0.647554,0.347405,-0.175359,1.0,0.13524
CC Number,0.104576,0.110857,-0.030239,0.13524,1.0


In [30]:
df[['price_per_person','tip']].corr()

Unnamed: 0,price_per_person,tip
price_per_person,1.0,0.347405
tip,0.347405,1.0


## idxmin and idxmax

idxmax = find the location of max number

idxmin = find the loction of min number

In [31]:
df.head(3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,3560,***,Other
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,4478,*,Other
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458,6011,**,Other


In [32]:
df['total_bill'].min()

3.07

In [33]:
df['total_bill'].max()

50.81

In [34]:
df['total_bill'].idxmin()

67

In [35]:
df['total_bill'].idxmax()

170

## value_counts

Get a count per category

In [36]:
df['sex'].value_counts()

Male      157
Female     87
Name: sex, dtype: int64

## replace

In [37]:
#Quickly replace values with another one.
dff =df['Tip Quality'].replace(to_replace='Other',value='Ok')

In [38]:
dff.head(3)

0    Ok
1    Ok
2    Ok
Name: Tip Quality, dtype: object

## unique

Find the unique number

In [39]:
df['size'].unique()

array([2, 3, 4, 1, 6, 5], dtype=int64)

In [40]:
df['size'].nunique()

6

## map

map the value with dif name

In [41]:
my_map = {'Dinner':'D','Lunch':'L'}

In [42]:
df['time'].map(my_map).tail(3)

241    D
242    D
243    D
Name: time, dtype: object

## Duplicates

Return the true for the second time value occurred

In [43]:
df.duplicated().tail(3)

241    False
242    False
243    False
dtype: bool

## between

find between value which we given

In [44]:
df['total_bill'].between(10,20,inclusive=True).head(3)

  df['total_bill'].between(10,20,inclusive=True).head(3)


0     True
1     True
2    False
Name: total_bill, dtype: bool

In [45]:
df[df['total_bill'].between(10,20,inclusive=True)].tail(3)

  df[df['total_bill'].between(10,20,inclusive=True)].tail(3)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
236,12.6,1.0,Male,Yes,Sat,Dinner,2,6.3,Matthew Myers,3543676378973965,Sat5032,3543,**,Other
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17,4375,***,Other
243,18.78,3.0,Female,No,Thur,Dinner,2,9.39,Michelle Hardin,3511451626698139,Thur672,3511,***,Other


## sample

Find any random value of sample

In [46]:
df.sample(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
132,11.17,1.5,Female,No,Thur,Lunch,2,5.58,Taylor Gonzalez,6011990685390011,Thur7783,6011,**,Other
90,28.97,3.0,Male,Yes,Fri,Dinner,2,14.48,Daniel Mason,3597456900644078,Fri4175,3597,***,Other
141,34.3,6.7,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Thur1025,3526,**,Other
44,30.4,5.6,Male,No,Sun,Dinner,4,7.6,Todd Cooper,503846761263,Sun2274,5038,**,Other
20,17.92,4.08,Male,No,Sat,Dinner,2,8.96,Thomas Rice,4403296224639756,Sat1709,4403,***,Other


In [47]:
#find any frac% value from dataset
df.sample(frac=0.1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
54,25.56,4.34,Male,No,Sun,Dinner,4,6.39,Ronald Owens,6569607991983380,Sun9470,6569,**,Other
77,27.2,4.0,Male,No,Thur,Lunch,4,6.8,John Davis,30344778738589,Thur4924,3034,**,Other
141,34.3,6.7,Male,No,Thur,Lunch,6,5.72,Steven Carlson,3526515703718508,Thur1025,3526,**,Other
59,48.27,6.73,Male,No,Sat,Dinner,4,12.07,Brian Ortiz,6596453823950595,Sat8139,6596,***,Other
153,24.55,2.0,Male,No,Sun,Dinner,4,6.14,Todd Patterson,4416804908942159,Sun8670,4416,**,Other
164,17.51,3.0,Female,Yes,Sun,Dinner,2,8.76,Audrey Griffin,3500853929693258,Sun444,3500,***,Other
46,22.23,5.0,Male,No,Sun,Dinner,2,11.12,Joshua Gilmore,4292072734899,Sun7097,4292,***,Other
146,18.64,1.36,Female,No,Thur,Lunch,3,6.21,Kelly Estrada,60463302327,Thur3941,6046,**,Other
212,48.33,9.0,Male,No,Sat,Dinner,4,12.08,Alex Williamson,676218815212,Sat4590,6762,***,Other
188,18.15,3.5,Female,Yes,Sun,Dinner,3,6.05,Glenda Wiggins,578329325307,Sun430,5783,**,Other


## nlargest and nsmallest

Find give n largest or smallest number

In [48]:
df.nlargest(10,'price_per_person')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
184,40.55,3.0,Male,Yes,Sun,Dinner,2,20.27,Stephen Cox,3547798222044029,Sun5140,3547,***,Other
179,34.63,3.55,Male,Yes,Sun,Dinner,2,17.32,Brian Bailey,346656312114848,Sun9851,3466,***,Other
170,50.81,10.0,Male,Yes,Sat,Dinner,3,16.94,Gregory Clark,5473850968388236,Sat1954,5473,***,Other
175,32.9,3.11,Male,Yes,Sun,Dinner,2,16.45,Nathan Reynolds,370307040837149,Sun5109,3703,***,Other
237,32.83,1.17,Male,Yes,Sat,Dinner,2,16.42,Thomas Brown,4284722681265508,Sat2929,4284,***,Other
83,32.68,5.0,Male,Yes,Thur,Lunch,2,16.34,Daniel Murphy,5356177501009133,Thur8801,5356,***,Other
173,31.85,3.18,Male,Yes,Sun,Dinner,2,15.92,Scott Perez,3577115550328507,Sun9335,3577,***,Other
182,45.35,3.5,Male,Yes,Sun,Dinner,3,15.12,Jose Parsons,4112207559459910,Sun2337,4112,***,Other
102,44.3,2.5,Female,Yes,Sat,Dinner,3,14.77,Heather Cohen,379771118886604,Sat6240,3797,***,Other
90,28.97,3.0,Male,Yes,Fri,Dinner,2,14.48,Daniel Mason,3597456900644078,Fri4175,3597,***,Other


In [49]:
df.nsmallest(10,'price_per_person')

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,first_four,Star,Tip Quality
92,5.75,1.0,Female,Yes,Fri,Dinner,2,2.88,Leah Ramirez,3508911676966392,Fri3780,3508,*,Other
67,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455,4359,*,Generous
16,10.33,1.67,Female,No,Sun,Dinner,3,3.44,Elizabeth Foster,4240025044626033,Sun9715,4240,*,Other
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,4478,*,Other
172,7.25,5.15,Male,Yes,Sun,Dinner,2,3.62,Larry White,30432617123103,Sun9209,3043,*,Generous
149,7.51,2.0,Male,No,Thur,Lunch,2,3.76,Daniel Robbins,4823139288341889,Thur6321,4823,*,Generous
195,7.56,1.44,Male,No,Thur,Lunch,2,3.78,Michael White,4865390263095532,Thur697,4865,*,Other
218,7.74,1.44,Male,Yes,Sat,Dinner,2,3.87,Nicholas Archer,340517153733524,Sat4772,3405,*,Other
159,16.49,2.0,Male,No,Sun,Dinner,4,4.12,Christopher Soto,30501814271434,Sun1781,3050,*,Other
185,20.69,5.0,Male,No,Sun,Dinner,5,4.14,Joseph Howell,30362407455623,Sun5842,3036,*,Other
