# Basic knowledge about pandas


### 1. Basic operation with Series

In [1]:
from tarfile import data_filter

# creation of Series
import pandas as pd
s= pd.Series([1,2,3,4,5])
print(s)
# self-defined index
s=pd.Series([10,2,3,4,5], index=['A','B','C','D','E'])
print(s)
# define name
s=pd.Series([1,2,3],index=['A','B','C'],name='chart')
print(s)

0    1
1    2
2    3
3    4
4    5
dtype: int64
A    10
B     2
C     3
D     4
E     5
dtype: int64
A    1
B    2
C    3
Name: chart, dtype: int64


In [6]:
# creating Series through dictionaries
s=pd.Series({"a":1,"b":2,"c":3})
print(s)
s1=pd.Series(s,index=["a","c"])
print(s1)

a    1
b    2
c    3
dtype: int64
a    1
c    3
dtype: int64



### (1) Features:

index

values

dtype/dypes

shape

ndim

size

name

loc[]  label-based selection(self-defined)

iloc[] integer-based selection(staring from 0)

at[]   accessor at label(self-defined)

iat[]  integer accessor(starting from 0)

In [17]:
# Series features
s=pd.Series({'a':1,'b':2,'c':3,'d':4,'e':5},name='chart')
print(s)
s.name='test'
print(s.index)
print(s.values)
print(s.shape, s.ndim , s.size)
print(s.dtype)
print(s.name)
print(s.loc['a'])
print(s.loc['a':'b'])
print(s.iloc[0])
print(s.at['a'])
print(s.iat[0])

a    1
b    2
c    3
d    4
e    5
Name: chart, dtype: int64
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
[1 2 3 4 5]
(5,) 1 5
int64
test
1
a    1
b    2
Name: test, dtype: int64
1
1
1


In [21]:
# access data in series
print(s)
# 1. directly
print(s['a'])
# 2. boolean indexing
print(s[s<3])
# 3. check up first/last five rows
s['f']=6
print(s.head())
print(s.tail())

a    1
b    2
c    3
d    4
e    5
Name: test, dtype: int64
1
a    1
b    2
Name: test, dtype: int64
a    1
b    2
c    3
d    4
e    5
Name: test, dtype: int64


### (2) Operations
head() first n rows(auto 5 rows)

tail() last n rows(auto 5 rows)

isin() is in series?

isna() is NaN, or None?

sum()

mean()

min()

max()

var()

std()

median()

mode()  mode

quantile(q) q: 0 to 1

describe()

value_counts()

count()

unique()

drop_duplicates()

sample()

sort_index()

sort_values()

replace()

keys()


In [5]:
import numpy as np
s=pd.Series([10,2,np.nan,None,3,4,5],index=['A','B','C','D','E','F','G'],name='data')
print(s)

A    10.0
B     2.0
C     NaN
D     NaN
E     3.0
F     4.0
G     5.0
Name: data, dtype: float64


In [34]:
print(s.head(3))
print(s.tail(2))
# obtain all descriptive data
s.describe()  # None or NaN will be skipped

A    10.0
B     2.0
C     NaN
Name: data, dtype: float64
F    4.0
G    5.0
Name: data, dtype: float64


count     5.000000
mean      4.800000
std       3.114482
min       2.000000
25%       3.000000
50%       4.000000
75%       5.000000
max      10.000000
Name: data, dtype: float64

In [35]:
# obtain number of valid Series' elements
print(s.count())

5


In [6]:
# obtain index
print(s.keys())    # function call
print(s.index)     # feature

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')


In [7]:
print(s.isna())    # check if every element exits
s.isna

A    False
B    False
C     True
D     True
E    False
F    False
G    False
Name: data, dtype: bool


<bound method Series.isna of A    10.0
B     2.0
C     NaN
D     NaN
E     3.0
F     4.0
G     5.0
Name: data, dtype: float64>

In [15]:
print(s.isin([4,5]))
print(s.isin([6]))

A    False
B    False
C    False
D    False
E    False
F     True
G     True
Name: data, dtype: bool
A    False
B    False
C    False
D    False
E    False
F    False
G    False
Name: data, dtype: bool


In [16]:
s.describe()

count     5.000000
mean      4.800000
std       3.114482
min       2.000000
25%       3.000000
50%       4.000000
75%       5.000000
max      10.000000
Name: data, dtype: float64

In [22]:
print(s.mean())
print(s.sum())
print('%.3f'%s.std())
print(s.var())
print(s.min())
print(s.max())
print(s.median())

4.8
24.0
3.114
9.700000000000001
2.0
10.0
4.0


In [23]:
print(s)

A    10.0
B     2.0
C     NaN
D     NaN
E     3.0
F     4.0
G     5.0
Name: data, dtype: float64


In [27]:
# sorting
print(s.sort_values())
print(s.quantile(0.25))
# calculation of quantile(overall similar to the percentile in numpy) : 1. position=(number of elements-1)*percentile
#         2. value=arr[position integer part]+(arr[position integer part+1]-arr[position integer part])*(position decimal part)

B     2.0
E     3.0
F     4.0
G     5.0
A    10.0
C     NaN
D     NaN
Name: data, dtype: float64
3.0


In [29]:
# mode
s['H']=4
print(s.mode())

0    4.0
Name: data, dtype: float64


In [31]:
# count
print(s.value_counts())

data
4.0     2
10.0    1
2.0     1
3.0     1
5.0     1
Name: count, dtype: int64


In [32]:
# drop duplicates
s.drop_duplicates()

A    10.0
B     2.0
C     NaN
E     3.0
F     4.0
G     5.0
Name: data, dtype: float64

In [34]:
print(s.unique())
print(s.nunique())

[10.  2. nan  3.  4.  5.]
5


In [37]:
# sort methods
print(s.sort_index())
print(s.sort_values())

A    10.0
B     2.0
C     NaN
D     NaN
E     3.0
F     4.0
G     5.0
H     4.0
Name: data, dtype: float64
B     2.0
E     3.0
F     4.0
H     4.0
G     5.0
A    10.0
C     NaN
D     NaN
Name: data, dtype: float64



### 2. Series questions



#### (1)Score data:

Create a Series of scores of 10 students, ranging from 50 to 100 calculate the mean, max, min, and find the number of students that scores higher than average.

Given:
np.random.seed(42)
scores=pd.Series(np.random.randint(50,101,10),index=['Student'+str(i) for i in range(1,11)]

In [None]:
import numpy as np
import pandas as pd
np.random.seed(42)
scores=pd.Series(np.random.randint(50,101,10),index=['Student'+str(i) for i in range(1,11)],name='Scores')
# or: values=np.random.randint(50,101,10)
#     index=[]
#     for i in range(1,11):
#         index.append('Student'+str(i))
#     scores=pd.Series(value,indexes)
print(scores)
print('Mean:',scores.mean())
print('Max:',scores.max())
print('Min:',scores.min())
mean=scores.mean()
print('The number of students score higher than average:',scores[scores>mean].count())
# or len(scores[scores>mean])


#### (2)Temperature

Given the highest temperatures everyday in one week's time, find:

 -the number of days that exceed 30 degrees
 -the sorted temperatures from high to low
 -the two days that has the maximum temperature difference

Given:
temperatures=pd.Series([28,31,29,32,30,27,33],index=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])


In [9]:
temperatures=pd.Series([28,31,29,32,30,27,33],index=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
print('The number of days that exceed 30 degrees:',temperatures[temperatures>30].count())
print('Mean temperature:',temperatures.mean())
print('The temperatures sequenced from high to low:',temperatures.sort_values()[::-1]) # or: temperatures.sort_values(ascending=False)    the temperatures sequence is not eternally changed

# series difference function
t3=temperatures.diff().abs()
print('The two days with max difference:',*(t3.sort_values(ascending=False).index[:2].tolist()))
# tolist convert series to list
# *() is used to delete the [''] of the output

The number of days that exceed 30 degrees: 3
Mean temperature: 30.0
The temperatures sequenced from high to low: Sunday       33
Thursday     32
Tuesday      31
Friday       30
Wednesday    29
Monday       28
Saturday     27
dtype: int64
The two days with max difference: Sunday Tuesday


#### (3)Stack analysis
Calculate the daily return (today's closing price / previous day's closing price - 1)

Find the dates with the highest and lowest returns

Calculate the volatility (standard deviation of returns)

Given: prices=pd.Series([102.3,103.5,105.1,104.8,106.2,107.0,106.5,108.1,109.3,110.2],index=pd.date_range('2023-01-01',periods=10))

In [14]:
import pandas as pd
import numpy as np
prices=pd.Series([102.3,103.5,105.1,104.8,106.2,107.0,106.5,108.1,109.3,110.2],index=pd.date_range('2023-01-01',periods=10))
prices

2023-01-01    102.3
2023-01-02    103.5
2023-01-03    105.1
2023-01-04    104.8
2023-01-05    106.2
2023-01-06    107.0
2023-01-07    106.5
2023-01-08    108.1
2023-01-09    109.3
2023-01-10    110.2
Freq: D, dtype: float64

In [21]:
# daily return rate
return_rate=prices.pct_change()     # pct->percent
return_rate

2023-01-01         NaN
2023-01-02    0.011730
2023-01-03    0.015459
2023-01-04   -0.002854
2023-01-05    0.013359
2023-01-06    0.007533
2023-01-07   -0.004673
2023-01-08    0.015023
2023-01-09    0.011101
2023-01-10    0.008234
Freq: D, dtype: float64

In [19]:
# max and min date
print('The day that has the max return rate:',return_rate.idxmax())
print('The day that has the min return rate:', return_rate.idxmin())

The day that has the max return rate: 2023-01-03 00:00:00
The day that has the min return rate: 2023-01-07 00:00:00


In [22]:
# deviation
print('The standard deviation is:',return_rate.std())

The standard deviation is: 0.007373623845361105


#### (4)Sales analysis
Calculate the average sales each season(3 months 1 season)

Find the month that has the max sales

Calculate the monthly increasing rate

Find the months that has more than 2 succeeding months' sales increasing

Given: sales=pd.Series([120,135,145,160,155,170,180,175,190,200,210,220],index=pd.date_range('2022-01-01',periods=12,freq='MS'))

In [27]:
sales=pd.Series([120,135,145,160,155,170,180,175,190,200,210,220],index=pd.date_range('2022-01-01',periods=12,freq='MS'))
sales

2022-01-01    120
2022-02-01    135
2022-03-01    145
2022-04-01    160
2022-05-01    155
2022-06-01    170
2022-07-01    180
2022-08-01    175
2022-09-01    190
2022-10-01    200
2022-11-01    210
2022-12-01    220
Freq: MS, dtype: int64

In [30]:
print('The average sales by seasons:')
sales.resample('QS').mean()

The average sales by seasons:


2022-01-01    133.333333
2022-04-01    161.666667
2022-07-01    181.666667
2022-10-01    210.000000
Freq: QS-JAN, dtype: float64

In [31]:
print('The month that has the maximum sales is',sales.idxmax())

The month that has the maximum sales is 2022-12-01 00:00:00


In [26]:
print('The increase rates of sales by month are:',sales.pct_change())

The increase rates of sales by month are: 2022-01-31         NaN
2022-02-28    0.125000
2022-03-31    0.074074
2022-04-30    0.103448
2022-05-31   -0.031250
2022-06-30    0.096774
2022-07-31    0.058824
2022-08-31   -0.027778
2022-09-30    0.085714
2022-10-31    0.052632
2022-11-30    0.050000
2022-12-31    0.047619
Freq: ME, dtype: float64


In [36]:
# find the months that have more than two month increasing succeeding
increase=sales.pct_change()
assess=increase>0
assess   # assess.rolling(3)->use rolling window with length 3
assess[assess.rolling(3).sum()==3].keys().tolist()  # True=1,False=0, 3 succeeding month True->sum=3

[Timestamp('2022-04-01 00:00:00'),
 Timestamp('2022-11-01 00:00:00'),
 Timestamp('2022-12-01 00:00:00')]

#### (5)Sales by hour analysis

Calculate the sum of sales per day

Calculate the ratio of sales during working hours(8:00-22:00) against none-working hours

Find the 3 hours that has the maximum sales

Given: np.random.seed(42)
hourly_sales=pd.Series(np.random.randint(0,100,24),index=pd.date_range('2025-01-01',periods=24,freq='h'))

In [38]:
np.random.seed(42)
hourly_sales=pd.Series(np.random.randint(0,100,24),index=pd.date_range('2025-01-01',periods=24,freq='h'))
hourly_sales

2025-01-01 00:00:00    51
2025-01-01 01:00:00    92
2025-01-01 02:00:00    14
2025-01-01 03:00:00    71
2025-01-01 04:00:00    60
2025-01-01 05:00:00    20
2025-01-01 06:00:00    82
2025-01-01 07:00:00    86
2025-01-01 08:00:00    74
2025-01-01 09:00:00    74
2025-01-01 10:00:00    87
2025-01-01 11:00:00    99
2025-01-01 12:00:00    23
2025-01-01 13:00:00     2
2025-01-01 14:00:00    21
2025-01-01 15:00:00    52
2025-01-01 16:00:00     1
2025-01-01 17:00:00    87
2025-01-01 18:00:00    29
2025-01-01 19:00:00    37
2025-01-01 20:00:00     1
2025-01-01 21:00:00    63
2025-01-01 22:00:00    59
2025-01-01 23:00:00    20
Freq: h, dtype: int32

In [43]:
# sum
print('The daily sum of sales is:')
ds=hourly_sales.resample('D').sum()
ds
# or: hourly_sales.sum()

The daily sum of sales is:


2025-01-01    1205
Freq: D, dtype: int32

In [48]:
#ratio
wh=hourly_sales.between_time('8:00','22:00').sum()
#or: wh=hourly_sales[(hourly_sales.index.hour>=8)&(hourly_sales.index.hour<=22)]
print('Ratio:',wh/(ds-wh))
#or: nwh=hours_sales_drop(wh.index).sum()
#    print('Ratio:',wh/nwh)
#or: use boolean index

Ratio: 2025-01-01    1.429435
Freq: D, dtype: float64


In [51]:
# largest 3
print(hourly_sales.nlargest(3).keys())

DatetimeIndex(['2025-01-01 11:00:00', '2025-01-01 01:00:00',
               '2025-01-01 10:00:00'],
              dtype='datetime64[ns]', freq=None)



## 3. Basic operations with DataFrame

###  (1) create DataFrame

In [5]:
# create DataFrame
# 1. create by Series
import numpy as np
import pandas as pd
s1=pd.Series([1,2,3,4,5])
s2=pd.Series([6,7,8,9,10])
df=pd.DataFrame({"First column":s1,"Second column":s2})
print(df)
print(type(df))
print(type(df["First column"]))

   First column  Second column
0             1              6
1             2              7
2             3              8
3             4              9
4             5             10
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [12]:
# 2. create by dictionary
df=pd.DataFrame({
    "id":[10,2,23,4,15],
    "name":['tom','jack','alice','bob','allen'],
    "age":[15,17,20,26,30],
    "score":[60.5,80,30.6,70,83.5]
},index=['student' f'{i}' for i in range(1,6)],
    columns=['id','name','age','score']
)
df

Unnamed: 0,id,name,age,score
student1,10,tom,15,60.5
student2,2,jack,17,80.0
student3,23,alice,20,30.6
student4,4,bob,26,70.0
student5,15,allen,30,83.5


### (2) DataFrame features

index   (row index of DataFrame)

values

dtypes  (return dtype of every column)

shape

ndim

size

columns  (label of every column)

loc[]

iloc[]

at[]

iat

T

In [16]:
# features of DataFrame
print(df)
print('Row index:',df.index)
print('Column labels:',df.columns)
print('Values:\n',df.values)

          id   name  age  score
student1  10    tom   15   60.5
student2   2   jack   17   80.0
student3  23  alice   20   30.6
student4   4    bob   26   70.0
student5  15  allen   30   83.5
Row index: Index(['student1', 'student2', 'student3', 'student4', 'student5'], dtype='object')
Column labels: Index(['id', 'name', 'age', 'score'], dtype='object')
Values:
 [[10 'tom' 15 60.5]
 [2 'jack' 17 80.0]
 [23 'alice' 20 30.6]
 [4 'bob' 26 70.0]
 [15 'allen' 30 83.5]]


In [19]:
print('Dimension:',df.ndim)
print('Data type:\n',df.dtypes)
print('Shape:',df.shape)
print('Number of elements:',df.size)

Dimension: 2
Data type:
 id         int64
name      object
age        int64
score    float64
dtype: object
Shape: (5, 4)
Number of elements: 20


In [20]:
# Transpose
print('Transpose',df.T)

      student1 student2 student3 student4 student5
id          10        2       23        4       15
name       tom     jack    alice      bob    allen
age         15       17       20       26       30
score     60.5     80.0     30.6     70.0     83.5


In [26]:
# get element of a row
print(df)
print('The 4th row:',df.loc['student4'])
print('The 4th row:',df.iloc[3])

          id   name  age  score
student1  10    tom   15   60.5
student2   2   jack   17   80.0
student3  23  alice   20   30.6
student4   4    bob   26   70.0
student5  15  allen   30   83.5
The 4th row: id          4
name      bob
age        26
score    70.0
Name: student4, dtype: object
The 4th row: id          4
name      bob
age        26
score    70.0
Name: student4, dtype: object


In [33]:
# get element of a column
print(df)
print("The column 'name':")
print(df.loc[:,'name'])
print("\nThe column 'name':")
print(df.iloc[:,0])

          id   name  age  score
student1  10    tom   15   60.5
student2   2   jack   17   80.0
student3  23  alice   20   30.6
student4   4    bob   26   70.0
student5  15  allen   30   83.5
The column 'name':
student1      tom
student2     jack
student3    alice
student4      bob
student5    allen
Name: name, dtype: object

The column 'name':
student1    10
student2     2
student3    23
student4     4
student5    15
Name: id, dtype: int64


In [39]:
# get one element
print('Score of alice:',df.at['student3','score'])
print('\nScore of alice:',df.iat[2,3])
print('\nScore of alice:',df.loc['student3','score'])
print('\nScore of alice:',df.iloc[2,3])

Score of alice: 30.6

Score of alice: 30.6

Score of alice: 30.6

Score of alice: 30.6


In [44]:
# get row or column
print(df['name'])
print(df.name)
print(type(df.name))
print(df[['name']])
print(type(df[['name']]))  # with 2 [], it creates a DataFrame instead of a Series
df[['name']]

student1      tom
student2     jack
student3    alice
student4      bob
student5    allen
Name: name, dtype: object
student1      tom
student2     jack
student3    alice
student4      bob
student5    allen
Name: name, dtype: object
<class 'pandas.core.series.Series'>
           name
student1    tom
student2   jack
student3  alice
student4    bob
student5  allen
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,name
student1,tom
student2,jack
student3,alice
student4,bob
student5,allen


In [45]:
# multiple column
print(df[['name','score']])

           name  score
student1    tom   60.5
student2   jack   80.0
student3  alice   30.6
student4    bob   70.0
student5  allen   83.5


In [50]:
# check(preview of dataset)
print(df.head(3))
print(df.tail(3))

          id   name  age  score
student1  10    tom   15   60.5
student2   2   jack   17   80.0
student3  23  alice   20   30.6
          id   name  age  score
student3  23  alice   20   30.6
student4   4    bob   26   70.0
student5  15  allen   30   83.5


In [60]:
# boolean index
print(df[df.score>70])
print(df[df.score>70].name)
print(df[(df.['score']>70)&(df.age<20)])

          id   name  age  score
student2   2   jack   17   80.0
student5  15  allen   30   83.5
student2     jack
student5    allen
Name: name, dtype: object
          id  name  age  score
student2   2  jack   17   80.0


In [62]:
# n samples from the DataFrame
df.sample(3)

Unnamed: 0,id,name,age,score
student5,15,allen,30,83.5
student1,10,tom,15,60.5
student2,2,jack,17,80.0


### (3) DataFrame basic operations

head()

tail()

isin()

isna()

sum()

mean()

min()

max()

var()

std()

median()

mode()

quantile(q)

describe()

value_counts()

count()

duplicated()

drop_duplicates()

sample()

replace()

sort_index()

sort_values()

replace()

nlargest()

nsmallest()

In [63]:
print(df)

          id   name  age  score
student1  10    tom   15   60.5
student2   2   jack   17   80.0
student3  23  alice   20   30.6
student4   4    bob   26   70.0
student5  15  allen   30   83.5


In [64]:
print(df.head(1))
print(df.tail(1))

          id name  age  score
student1  10  tom   15   60.5
          id   name  age  score
student5  15  allen   30   83.5


In [66]:
# check if element is in
print(df.isin(['jack',20]))

             id   name    age  score
student1  False  False  False  False
student2  False   True  False  False
student3  False  False   True  False
student4  False  False  False  False
student5  False  False  False  False


In [67]:
# check if data point is empty
print(df.isna())

             id   name    age  score
student1  False  False  False  False
student2  False  False  False  False
student3  False  False  False  False
student4  False  False  False  False
student5  False  False  False  False


In [73]:
print('Sum score:',df['score'].sum())
print('Max score:',df.score.max())
print('Min score:',df.score.min())
print('Mean score:',df.score.mean())
print('Median score:',df.score.median())
print('Mode score:',df.age.mode())

Sum score: 324.6
Max score: 83.5
Min score: 30.6
Mean score: 64.92
Median score: 70.0
Mode score: 0    15
1    17
2    20
3    26
4    30
Name: age, dtype: int64


In [75]:
print('Standard deviation:',df.score.std())
print('Squared deviation:',df.score.var())
print('Quantile:',df.score.quantile(0.25))

Standard deviation: 21.188605428390044
Squared deviation: 448.957
Quantile: 60.5


In [77]:
print(df.describe())

              id        age      score
count   5.000000   5.000000   5.000000
mean   10.800000  21.600000  64.920000
std     8.526429   6.268971  21.188605
min     2.000000  15.000000  30.600000
25%     4.000000  17.000000  60.500000
50%    10.000000  20.000000  70.000000
75%    15.000000  26.000000  80.000000
max    23.000000  30.000000  83.500000


In [82]:
print(df.count())

id       5
name     5
age      5
score    5
dtype: int64


In [4]:
df=pd.DataFrame({
    "id":[10,10,2,23,4,15],
    "name":['tom','tom','jack','alice','bob','allen'],
    "age":[15,15,17,20,26,30],
    "score":[60.5,60.5,80,30.6,70,83.5]
},index=['student' f'{i}' for i in range(1,7)],
    columns=['id','name','age','score']
)
df


Unnamed: 0,id,name,age,score
student1,10,tom,15,60.5
student2,10,tom,15,60.5
student3,2,jack,17,80.0
student4,23,alice,20,30.6
student5,4,bob,26,70.0
student6,15,allen,30,83.5


In [89]:
print(df.value_counts())   # take every record as count

id  name   age  score
10  tom    15   60.5     2
2   jack   17   80.0     1
4   bob    26   70.0     1
15  allen  30   83.5     1
23  alice  20   30.6     1
Name: count, dtype: int64


In [90]:
print(df.drop_duplicates())

          id   name  age  score
student1  10    tom   15   60.5
student3   2   jack   17   80.0
student4  23  alice   20   30.6
student5   4    bob   26   70.0
student6  15  allen   30   83.5


In [92]:
print(df.duplicated())
print(df.duplicated(subset=['age']))

student1    False
student2     True
student3    False
student4    False
student5    False
student6    False
dtype: bool
student1    False
student2     True
student3    False
student4    False
student5    False
student6    False
dtype: bool


In [5]:
df.sample(2)

Unnamed: 0,id,name,age,score
student1,10,tom,15,60.5
student4,23,alice,20,30.6


In [8]:
print(df.replace(15,30))
print(df)

          id   name  age  score
student1  10    tom   30   60.5
student2  10    tom   30   60.5
student3   2   jack   17   80.0
student4  23  alice   20   30.6
student5   4    bob   26   70.0
student6  30  allen   30   83.5
          id   name  age  score
student1  10    tom   15   60.5
student2  10    tom   15   60.5
student3   2   jack   17   80.0
student4  23  alice   20   30.6
student5   4    bob   26   70.0
student6  15  allen   30   83.5


In [17]:
df.cumsum()
df.cummax()
df.cummin()   # axis=1 or 0 to accumulate row or column

Unnamed: 0,id,name,age,score
student1,10,tom,15,60.5
student2,10,tom,15,60.5
student3,2,jack,15,60.5
student4,2,alice,15,30.6
student5,2,alice,15,30.6
student6,2,alice,15,30.6


In [18]:
print(df.sort_index(ascending=False))

          id   name  age  score
student6  15  allen   30   83.5
student5   4    bob   26   70.0
student4  23  alice   20   30.6
student3   2   jack   17   80.0
student2  10    tom   15   60.5
student1  10    tom   15   60.5


In [23]:
print(df.sort_values(by=['score','age'],ascending=[False,True]))

          id   name  age  score
student6  15  allen   30   83.5
student3   2   jack   17   80.0
student5   4    bob   26   70.0
student1  10    tom   15   60.5
student2  10    tom   15   60.5
student4  23  alice   20   30.6


In [28]:
print(df.nlargest(2,columns=['score','age']))
print(df.nsmallest(2,columns=['score']))

          id   name  age  score
student6  15  allen   30   83.5
student3   2   jack   17   80.0
          id   name  age  score
student4  23  alice   20   30.6
student1  10    tom   15   60.5


### 2. DataFrame questions

#### (1) Question 1 Score analysis

-Calculate the sum and average score of each student

-Find the students whose math score is higher than 90 or english score higher than 85

-Find the top three student based on the output of sum score descending

Given: data={'name':['Alan','Bill','Catlin','Dave','Evelyn'],
             'math':[85,92,78,88,95],
             'english':[90,88,85,92,80],
             'physics':[75,80,88,85,90]}

In [41]:
import pandas as pd
data=pd.DataFrame({'name':['Alan','Bill','Catlin','Dave','Evelyn'],
             'math':[85,92,78,88,95],
             'english':[90,88,85,92,80],
             'physics':[75,80,88,85,90]})
data

Unnamed: 0,name,math,english,physics
0,Alan,85,90,75
1,Bill,92,88,80
2,Catlin,78,85,88
3,Dave,88,92,85
4,Evelyn,95,80,90


In [48]:
data['sum_score']=data[['math','english','physics']].sum(axis=1)
data

Unnamed: 0,name,math,english,physics,sum_score
0,Alan,85,90,75,250
1,Bill,92,88,80,260
2,Catlin,78,85,88,251
3,Dave,88,92,85,265
4,Evelyn,95,80,90,265


In [49]:
data['avg_score']=data[['math','english','physics']].mean(axis=1)
# or data['avg_score']=data['sum_score']/3
data

Unnamed: 0,name,math,english,physics,sum_score,avg_score
0,Alan,85,90,75,250,83.333333
1,Bill,92,88,80,260,86.666667
2,Catlin,78,85,88,251,83.666667
3,Dave,88,92,85,265,88.333333
4,Evelyn,95,80,90,265,88.333333


In [50]:
data[(data['math']>90) | (data['english']>85)].name

0      Alan
1      Bill
3      Dave
4    Evelyn
Name: name, dtype: object

In [53]:
r1=data.sort_values('sum_score',ascending=False).head(3)
r2=data.nlargest(3,columns=['sum_score'])
print(r1)
print(r2)

     name  math  english  physics  sum_score  avg_score
4  Evelyn    95       80       90        265  88.333333
3    Dave    88       92       85        265  88.333333
1    Bill    92       88       80        260  86.666667
     name  math  english  physics  sum_score  avg_score
3    Dave    88       92       85        265  88.333333
4  Evelyn    95       80       90        265  88.333333
1    Bill    92       88       80        260  86.666667


#### (2) Question 2 Sales analysis

-Calculate the sum sales off each product(sales=price*amount)

-Find the product that has the highest sales

-List all information of the products in sales descending order

Given: data={
            'product name':['A','B','C','D'],
            'price':[100,150,200,120],
            'amount':[50,30,20,40]}

df=pd.DataFrame(data)



In [54]:
data={
            'product name':['A','B','C','D'],
            'price':[100,150,200,120],
            'amount':[50,30,20,40]}
df=pd.DataFrame(data)
df

Unnamed: 0,product name,price,amount
0,A,100,50
1,B,150,30
2,C,200,20
3,D,120,40


In [60]:
df['sum_sales']=df['price']*df['amount']
df

Unnamed: 0,product name,price,amount,sum_sales
0,A,100,50,5000
1,B,150,30,4500
2,C,200,20,4000
3,D,120,40,4800


In [66]:
df.nlargest(1,'sum_sales')

Unnamed: 0,product name,price,amount,sum_sales
0,A,100,50,5000


In [62]:
df.sort_values('sum_sales',ascending=False)

Unnamed: 0,product name,price,amount,sum_sales
0,A,100,50,5000
3,D,120,40,4800
1,B,150,30,4500
2,C,200,20,4000


In [4]:
import pandas as pd
import numpy as np