### Data extraction 

**df.loc：**
- df.loc 是基于标签进行索引和选择的方法。
- 它使用行和列的标签来提取数据，可以使用行和列的标签名称进行切片和选择。
- 语法：df.loc[row_label, column_label]。

**df.iloc：**
- df.iloc 是基于整数位置进行索引和选择的方法。
- 它使用行和列的整数位置（从 0 开始）来提取数据，可以使用整数位置进行切片和选择。
- 语法：df.iloc[row_index, column_index]。

#### Extract data by Row

In [1]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [2]:
# Select a single row; both calls print the same row, 'Amy'.
print(df.loc['Amy'])    # by index label

print(df.iloc[0])       # by integer position (0 is the first row)

Math        45
English     60
Science    100
Name: Amy, dtype: int64
Math        45
English     60
Science    100
Name: Amy, dtype: int64


In [3]:
# Select several rows at once by passing a list of labels or a list of positions.
print(df.loc[['Amy', 'Beck']])   # list of index labels
print('*'*40)
print(df.iloc[[0, 1]])           # list of integer positions


      Math  English  Science
Amy     45       60      100
Beck    56       45       50
****************************************
      Math  English  Science
Amy     45       60      100
Beck    56       45       50


In [4]:
# Extract a run of consecutive rows with a slice.
print(df.loc['Amy':'Emma'])   # label slice: the end label 'Emma' is included
print(df.iloc[0:4])              # position slice: the stop position 4 ('Emma') is excluded

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96
         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89


In [5]:
# Slice rows by position with an optional step: iloc[start:stop:step]
print(df.iloc[::])     # every part left at its default: all rows
print(df.iloc[::2])     # step of 2: every second row, starting with the first

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96
         Math  English  Science
Amy        45       60      100
Charlie    67       67       67
Emma       12       98       96


#### Extract data by column

In [6]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [7]:
print(df[['English', 'Math']])   # select columns directly by name; the result follows the order of the list

print('*'*40)

print(df.loc[:, ['Math','English']])  # df.loc[row_label, column_label]; ':' keeps every row

print('*'*40)

print(df.iloc[:, [0,1]])  # df.iloc[row_index, column_index]; positions 0 and 1 are Math and English


print('----------------------Extract consecutive columns-------------------------------')
print(df.loc[:, 'Math':'Science'])  # label slice: the end label 'Science' is included

print('*'*40)

print(df.iloc[:, 0:2])  # position slice: the stop position 2 ('Science') is excluded


         English  Math
Amy           60    45
Beck          45    56
Charlie       67    67
Daisy         72    68
Emma          98    12
****************************************
         Math  English
Amy        45       60
Beck       56       45
Charlie    67       67
Daisy      68       72
Emma       12       98
****************************************
         Math  English
Amy        45       60
Beck       56       45
Charlie    67       67
Daisy      68       72
Emma       12       98
----------------------Extract consecutive columns-------------------------------
         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96
****************************************
         Math  English
Amy        45       60
Beck       56       45
Charlie    67       67
Daisy      68       72
Emma       12       98


#### Extract regional data

In [8]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [9]:
# Single cell by labels: .loc[row_label, column_label] returns one scalar value,
# whose type is printed on the following line.
print('Charlie\'s math score is:', df.loc['Charlie', 'Math'], '\n', type(df.loc['Charlie', 'Math']))
print('-'*35)


# Rectangular region by labels: a list of rows and a list of columns.
# The selected columns are English and Science, so the message names those
# subjects (it previously said "Math and Science").
print('Below is Amy and Daisy\'s  English and Science score:\n', df.loc[['Amy', 'Daisy'], ['English', 'Science']])
print('-'*35)

# Single cell by positions: row 2 is Charlie, column 0 is Math.
print('Charlie\'s math score is:', df.iloc[2,0])
print('-'*35)

# Region by position slices: rows 0-1 and columns 0-1 (stop positions are excluded).
print('Below is Amy and Beck\'s  Math and English score:\n',df.iloc[0:2, 0:2])
print('-'*35)

# Region by position lists: rows 0 and 3 (Amy, Daisy), columns 1 and 2
# (English, Science) -- the same selection as the label-based call above.
print('Below is Amy and Daisy\'s  English and Science score:\n', df.iloc[[0,3], [1,2]])
print('-'*35)

# Whole column by position: ':' keeps every row, 0 is the Math column.
print('Below is the Math score for all student:\n', df.iloc[:,0])

Charlie's math score is: 67 
 <class 'numpy.int64'>
-----------------------------------
Below is Amy and Daisy's  English and Science score:
        English  Science
Amy         60      100
Daisy       72       89
-----------------------------------
Charlie's math score is: 67
-----------------------------------
Below is Amy and Beck's  Math and English score:
       Math  English
Amy     45       60
Beck    56       45
-----------------------------------
Below is Amy and Daisy's  English and Science score:
        English  Science
Amy         60      100
Daisy       72       89
-----------------------------------
Below is the Math score for all student:
 Amy        45
Beck       56
Charlie    67
Daisy      68
Emma       12
Name: Math, dtype: int64


#### Extract specified condition data

In [10]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [11]:
# Single condition: keep the students whose English score is at least 60.
english_ok = df['English'] >= 60
print(df.loc[english_ok])

print('-' * 35)

# Combined conditions: English >= 60 AND Math >= 60.
# The two boolean masks are combined element-wise with `&`.
math_ok = df['Math'] >= 60
print(df.loc[english_ok & math_ok])

         Math  English  Science
Amy        45       60      100
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96
-----------------------------------
         Math  English  Science
Charlie    67       67       67
Daisy      68       72       89


### Addition, modification and deletion of DataFrame

In [12]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


#### Add data - by column

In [13]:
# Method 1: assign a list to a new column name; the new column appears on the right.
df['History'] = [62, 74, 85, 96, 88]
print(df)
print('-'*50)

# Method 2: assign through .loc, using ':' for all rows and the new column label.
df.loc[:, 'Computer'] = [88, 39, 100, 35, 60]
print(df)

         Math  English  Science  History
Amy        45       60      100       62
Beck       56       45       50       74
Charlie    67       67       67       85
Daisy      68       72       89       96
Emma       12       98       96       88
--------------------------------------------------
         Math  English  Science  History  Computer
Amy        45       60      100       62        88
Beck       56       45       50       74        39
Charlie    67       67       67       85       100
Daisy      68       72       89       96        35
Emma       12       98       96       88        60


In [14]:
# Insert a column at a chosen position: insert(position, column_name, values).
# Position 1 places 'Psychology' immediately after 'Math'.
lst = [65, 68, 98, 100, 100]
df.insert(1, 'Psychology', lst)
print(df)

         Math  Psychology  English  Science  History  Computer
Amy        45          65       60      100       62        88
Beck       56          68       45       50       74        39
Charlie    67          98       67       67       85       100
Daisy      68         100       72       89       96        35
Emma       12         100       98       96       88        60


#### Add data - by Row


In [15]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [16]:
# Add one student: assigning to a label that is not yet in the index appends a new row.
# The values are given in column order (Math, English, Science).
df.loc['Faith'] = [57, 58, 59]
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96
Faith      57       58       59


In [17]:
# Add several students at once: collect them in their own DataFrame first.
new_df = pd.DataFrame(
    {
        'Math': [80, 90, 70, 85],
        'Science': [95, 85, 80, 90],
        'English': [75, 80, 90, 85],
    },
    index=['John', 'Jane', 'Bob', 'Alice'],
)
print(new_df)
print('-' * 45)

# Stack the new rows under the existing ones. Columns are matched by name, so the
# differing column order of `new_df` does not matter.
df = pd.concat([df, new_df], axis=0)
print(df)

       Math  Science  English
John     80       95       75
Jane     90       85       80
Bob      70       80       90
Alice    85       90       85
---------------------------------------------
         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96
Faith      57       58       59
John       80       75       95
Jane       90       80       85
Bob        70       90       80
Alice      85       85       90


#### Modification-Columns

In [18]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [19]:
# Method 1: assign a list of new names to the `columns` attribute.
# This replaces all column names at once, in order.
df.columns=['Math-1', 'English-1', 'Science-1']
print(df)

         Math-1  English-1  Science-1
Amy          45         60        100
Beck         56         45         50
Charlie      67         67         67
Daisy        68         72         89
Emma         12         98         96


In [20]:
# Method 2: rename() with an {old_name: new_name} mapping passed as `columns`.
# inplace=True modifies df directly instead of producing a renamed copy.
df.rename(columns={'Math-1': 'Math-2', 'English-1':'English-2', 'Science-1':'Science-2'}, inplace=True)
print(df)


         Math-2  English-2  Science-2
Amy          45         60        100
Beck         56         45         50
Charlie      67         67         67
Daisy        68         72         89
Emma         12         98         96


#### Modification-Row

In [21]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [22]:
# Method 1: assign a new list of labels to the `index` attribute.
# list('ABCDE') is ['A', 'B', 'C', 'D', 'E'], one label per row.
df.index=list('ABCDE')
print(df)

   Math  English  Science
A    45       60      100
B    56       45       50
C    67       67       67
D    68       72       89
E    12       98       96


In [23]:
# Method 2: rename() with an {old_label: new_label} mapping; axis=0 applies it to the row labels.
# Note that the new labels are the strings '1'..'5', not integers.
df.rename({'A': '1', 'B':'2', 'C':'3', 'D':'4', 'E':'5'}, inplace=True, axis=0)
print(df)

   Math  English  Science
1    45       60      100
2    56       45       50
3    67       67       67
4    68       72       89
5    12       98       96


#### Modification-Data

In [24]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)

         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [25]:
# Modify a whole row by label: one new value per column.
df.loc['Amy']=[100, 100, 120]
print(df)

# Modify a whole row by position: row 0, all columns (':').
# Row 0 is 'Amy' again, so this overwrites the values set just above.
df.iloc[0, :] = [90, 90, 90]
print(df)

         Math  English  Science
Amy       100      100      120
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96
         Math  English  Science
Amy        90       90       90
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [26]:
# Modify a whole column by label, supplying one value per row.
df.loc[:,'Math']=[100, 100, 100, 100, 100]
print(df)

# Modify a whole column by position; a single scalar is applied to every row.
df.iloc[:, 0] = 90
print(df)

         Math  English  Science
Amy       100       90       90
Beck      100       45       50
Charlie   100       67       67
Daisy     100       72       89
Emma      100       98       96
         Math  English  Science
Amy        90       90       90
Beck       90       45       50
Charlie    90       67       67
Daisy      90       72       89
Emma       90       98       96


In [27]:
# Modify a single value.
df.loc['Beck', 'English'] = 100   # addressed by row label and column label
print(df)

df.iloc[1,1] = 180   # addressed by row position and column position (the same cell as above)
print(df)

         Math  English  Science
Amy        90       90       90
Beck       90      100       50
Charlie    90       67       67
Daisy      90       72       89
Emma       90       98       96
         Math  English  Science
Amy        90       90       90
Beck       90      180       50
Charlie    90       67       67
Daisy      90       72       89
Emma       90       98       96


#### Deletion

In [51]:
import pandas as pd
# Row labels (students) and column labels (subjects) of the score table.
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']

# One inner list per student, with the scores ordered to match `subjects`.
data = [
    [45, 60, 100],
    [56, 45, 50],
    [67, 67, 67],
    [68, 72, 89],
    [12, 98, 96],
]

df = pd.DataFrame(data, index=students, columns=subjects)
print(df)



         Math  English  Science
Amy        45       60      100
Beck       56       45       50
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [29]:
# Delete a column with drop().
#
# The two commented-out calls below are alternative spellings of the active line;
# any one of the three lines gives the same result.
# df.drop(['Math'], axis=1, inplace=True)
# df.drop(columns='Math', inplace=True)
df.drop(labels='Math', axis=1, inplace=True)     # axis=1: `labels` refers to columns
print(df)

         English  Science
Amy           60      100
Beck          45       50
Charlie       67       67
Daisy         72       89
Emma          98       96


In [30]:
# Delete a row with drop().
# The two commented-out calls are alternative spellings of the active line.
# df.drop(['Amy'], axis = 0, inplace = True)
# df.drop(index='Amy', inplace=True)
df.drop(labels='Amy', axis=0, inplace=True)   # axis=0: `labels` refers to rows
print(df)

         English  Science
Beck          45       50
Charlie       67       67
Daisy         72       89
Emma          98       96


In [52]:
# Delete by condition.
#
# Rebuild the table first. The preceding deletion cells removed the 'Math' column
# and the 'Amy' row in place, so without this step the cell would not reproduce
# the recorded output (which still contains both) when the document is run
# from top to bottom.
data = [[45, 60, 100], [56, 45, 50], [67, 67, 67], [68, 72, 89], [12, 98, 96]]
students = ['Amy', 'Beck', 'Charlie', 'Daisy', 'Emma']
subjects = ['Math', 'English', 'Science']
df = pd.DataFrame(data, index=students, columns=subjects)

# Drop rows where the 'English' score is less than 60:
# the boolean filter selects those rows, `.index` gives their labels,
# and drop() removes the rows carrying those labels.
df.drop(df[df['English'] < 60].index, inplace=True)

print(df)

         Math  English  Science
Amy        45       60      100
Charlie    67       67       67
Daisy      68       72       89
Emma       12       98       96


In [58]:
# Index labels of the students whose Math score is below 60, as a plain list.
print(df.loc[df['Math'] < 60].index.tolist())

['Amy', 'Emma']
