In [62]:
import pandas as pd
import numpy as np

In [6]:
s1 = pd.Series([2,3,7,8,5,1])
print(s1)
print(s1.index)
print(s1.values)

0    2
1    3
2    7
3    8
4    5
5    1
dtype: int64
RangeIndex(start=0, stop=6, step=1)
[2 3 7 8 5 1]


In [7]:
print(s1[2])
print(s1[2:5])

7
2    7
3    8
4    5
dtype: int64


In [8]:
s2 = pd.Series([2,4,7,9,11],index=["a","b","d","c","e"])
print(s2)

a     2
b     4
d     7
c     9
e    11
dtype: int64


In [27]:
print(s2.iloc[2])
print(s2.loc["c"])
print(s2.iloc[2:5])
print(s2.loc["a":"c"])

7
9
d     7
c     9
e    11
dtype: int64
a    2
b    4
d    7
c    9
dtype: int64


In [13]:
s3 = pd.Series({"big":23.2,"Tip":19.6,"Mari":10.2})
print(s3)

big     23.2
Tip     19.6
Mari    10.2
dtype: float64


In [16]:
print("big" in s3)

True


In [20]:
s3["big"] = 29.9
print(s3["big"])
print(s3.loc["big"])
print(s3.iloc[0])

29.9
29.9
29.9


In [28]:
print(s3[s3>20])
print(s3[(s3>10) & s3<30])
print(s3[~(s3>20)])

big    29.9
dtype: float64
big     29.9
Tip     19.6
Mari    10.2
dtype: float64
Tip     19.6
Mari    10.2
dtype: float64


## series的更多操作

In [7]:
s4 = pd.Series([1,3,4])
s5 = pd.Series([2,6])
print(s4+s5)

0    3.0
1    9.0
2    NaN
dtype: float64


In [10]:
print(s4.add(s5,fill_value=0))
print(s4.sub(s5,fill_value=0))
print(s4.mul(s5,fill_value=0))
print(s4.div(s5,fill_value=1))

0    3.0
1    9.0
2    4.0
dtype: float64
0   -1.0
1   -3.0
2    4.0
dtype: float64
0     2.0
1    18.0
2     0.0
dtype: float64
0    0.5
1    0.5
2    4.0
dtype: float64


In [11]:
s4.describe()

count    3.000000
mean     2.666667
std      1.527525
min      1.000000
25%      2.000000
50%      3.000000
75%      3.500000
max      4.000000
dtype: float64

In [12]:
def get_grade(score):
    if score >= 90:
        return "A"
    elif score >= 80:
        return "B"
    elif score >= 70:
        return "C"
    else:
        return "D"

In [15]:
scores = pd.Series([54,66,88,90,23,55,48,77,80])
grades = scores.apply(get_grade)
print(grades)

0    D
1    D
2    B
3    A
4    D
5    D
6    D
7    C
8    B
dtype: object


## DataFrame表格

In [16]:
s_id = pd.Series(["01","02","03","04","05"])
s_class = pd.Series(["二班","一班","二班","三班","一班"])
s_grade = pd.Series([92,67,55,88,74])

df1 = pd.DataFrame({"学号":s_id,"班级":s_class,"成绩":s_grade})
print(df1)

   学号  班级  成绩
0  01  二班  92
1  02  一班  67
2  03  二班  55
3  04  三班  88
4  05  一班  74


In [18]:
print(df1.index)
print(df1.columns)
print(df1.values)

RangeIndex(start=0, stop=5, step=1)
Index(['学号', '班级', '成绩'], dtype='object')
[['01' '二班' 92]
 ['02' '一班' 67]
 ['03' '二班' 55]
 ['04' '三班' 88]
 ['05' '一班' 74]]


In [21]:
print(df1["成绩"]>=80)
print(df1[df1["成绩"]>=80])

0     True
1    False
2    False
3     True
4    False
Name: 成绩, dtype: bool
   学号  班级  成绩
0  01  二班  92
3  04  三班  88


In [22]:
print(df1.head())
print(df1.head(2))

   学号  班级  成绩
0  01  二班  92
1  02  一班  67
2  03  二班  55
3  04  三班  88
4  05  一班  74
   学号  班级  成绩
0  01  二班  92
1  02  一班  67


## 表格练习

In [2]:
name = pd.Series(["小陈","小李","小王","小张","小赵","小周"],index=["001","002","003","004","005","006"])
gender = pd.Series(["女","女","男","男","女","男"],index=["006","005","004","003","002","001"])
height = pd.Series([172.5,168.0,178.2,181.3,161.7,155.9],index=["001","002","003","004","005","006"])

students = pd.DataFrame({"name":name,"gender":gender,"height":height})
print(students)

print(students.index)
print(students.columns)
print(students.T)

# 第二种方法更通用
print(students.height)
print(students["height"])

print(students[["gender","height"]])

print(students.loc["001"])
print(students.loc["003":"005"])

print(students.loc["005","height"])
print(students.loc[["003","005"],["name","height"]])

# 选定列，并圈定输出数据
print(students.loc[["003","005"],"name":"height"])
print(students.loc[["003","005"],"name"])
print(students.loc[["003","005"],:"height"])
print(students.loc[["003","005"]])

print(students["height"]>165.0)
print(students["gender"]=="女")
print(students[(students["height"]>165.0) & (students["gender"]=="女")])

print(students.head(5))

    name gender  height
001   小陈      男   172.5
002   小李      女   168.0
003   小王      男   178.2
004   小张      男   181.3
005   小赵      女   161.7
006   小周      女   155.9
Index(['001', '002', '003', '004', '005', '006'], dtype='object')
Index(['name', 'gender', 'height'], dtype='object')
          001    002    003    004    005    006
name       小陈     小李     小王     小张     小赵     小周
gender      男      女      男      男      女      女
height  172.5  168.0  178.2  181.3  161.7  155.9
001    172.5
002    168.0
003    178.2
004    181.3
005    161.7
006    155.9
Name: height, dtype: float64
001    172.5
002    168.0
003    178.2
004    181.3
005    161.7
006    155.9
Name: height, dtype: float64
    gender  height
001      男   172.5
002      女   168.0
003      男   178.2
004      男   181.3
005      女   161.7
006      女   155.9
name         小陈
gender        男
height    172.5
Name: 001, dtype: object
    name gender  height
003   小王      男   178.2
004   小张      男   181.3
005   小赵      女   161.7
16

## DataFrame更多操作

In [5]:
students["height"] = pd.Series([166,155,144,133,176,171],index=["001","002","003","004","005","006"])
students

Unnamed: 0,name,gender,height
1,小陈,男,166
2,小李,女,155
3,小王,男,144
4,小张,男,133
5,小赵,女,176
6,小周,女,171


In [6]:
students["height"] = [166,143,133,178,174,170]
students

Unnamed: 0,name,gender,height
1,小陈,男,166
2,小李,女,143
3,小王,男,133
4,小张,男,178
5,小赵,女,174
6,小周,女,170


In [10]:
students.loc["005"] = pd.Series(["赵一","女",162],index=["name","gender","height"])
students

Unnamed: 0,name,gender,height
1,小陈,男,166.0
2,小李,女,143.0
3,小王,男,133.0
4,小张,男,178.0
5,赵一,女,162.0
6,小周,女,170.0


In [11]:
students.loc["005"] = ["赵依","女",161]
students

Unnamed: 0,name,gender,height
1,小陈,男,166.0
2,小李,女,143.0
3,小王,男,133.0
4,小张,男,178.0
5,赵依,女,161.0
6,小周,女,170.0


In [14]:
# 只能用loc
students.loc["007"] = ["王尔","男",183]
students

Unnamed: 0,name,gender,height
1,小陈,男,166.0
2,小李,女,143.0
3,小王,男,133.0
4,小张,男,178.0
5,赵依,女,161.0
6,小周,女,170.0
7,王尔,男,183.0


In [19]:
students.drop("007")

Unnamed: 0,name,gender,height
1,小陈,男,166.0
2,小李,女,143.0
3,小王,男,133.0
4,小张,男,178.0
5,赵依,女,161.0
6,小周,女,170.0


In [20]:
students.drop(["007","003"])

Unnamed: 0,name,gender,height
1,小陈,男,166.0
2,小李,女,143.0
4,小张,男,178.0
5,赵依,女,161.0
6,小周,女,170.0


In [27]:
students.drop("height",axis=1)

Unnamed: 0,name,gender
1,小陈,男
2,小李,女
3,小王,男
4,小张,男
5,赵依,女
6,小周,女
7,王尔,男


In [29]:
students.drop("002",axis=0)

Unnamed: 0,name,gender,height
1,小陈,男,166.0
3,小王,男,133.0
4,小张,男,178.0
5,赵依,女,161.0
6,小周,女,170.0
7,王尔,男,183.0


In [30]:
students.drop(["height","name"],axis=1)

Unnamed: 0,gender
1,男
2,女
3,男
4,男
5,女
6,女
7,男


## 练习2

In [32]:
import pandas as pd

students_data = {
    "001": {"姓名": "小陈", "考试1": 85, "考试2": 95, "考试3": 92},
    "002": {"姓名": "小李", "考试1": 91, "考试2": 92, "考试3": 94},
    "003": {"姓名": "小王", "考试1": 86, "考试2": 81, "考试3": 89},
    "004": {"姓名": "小张", "考试1": 79, "考试2": 89, "考试3": 95},
    "005": {"姓名": "小赵", "考试1": 96, "考试2": 91, "考试3": 91},
    "006": {"姓名": "小周", "考试1": 81, "考试2": 89, "考试3": 92}
}

students = pd.DataFrame(students_data)
print(students)
print(students.T)

students = students.T
# students["考试4"] = [72, 69, 79, 83, 82, 76]
students["考试4"] = pd.Series([72, 69, 79, 83, 82, 76], index=students.index)
print(students)

students.loc["007"] = ["小杨", 79, 82, 81, 69]
print(students)

print(students.drop(["006","007"],axis=0))
print(students.drop(["考试2","考试4"],axis=1))

bonus = pd.Series({"考试1": 2, "考试2": 3, "考试3": 2, "考试4": 5})
# print(bonus + students[["考试1","考试2","考试3","考试4"]])
print(bonus + students["考试1":"考试4"])

students["考试4"] = students["考试4"] + 10
print(students)

    001 002 003 004 005 006
姓名   小陈  小李  小王  小张  小赵  小周
考试1  85  91  86  79  96  81
考试2  95  92  81  89  91  89
考试3  92  94  89  95  91  92
     姓名 考试1 考试2 考试3
001  小陈  85  95  92
002  小李  91  92  94
003  小王  86  81  89
004  小张  79  89  95
005  小赵  96  91  91
006  小周  81  89  92
     姓名 考试1 考试2 考试3  考试4
001  小陈  85  95  92   72
002  小李  91  92  94   69
003  小王  86  81  89   79
004  小张  79  89  95   83
005  小赵  96  91  91   82
006  小周  81  89  92   76
     姓名 考试1 考试2 考试3  考试4
001  小陈  85  95  92   72
002  小李  91  92  94   69
003  小王  86  81  89   79
004  小张  79  89  95   83
005  小赵  96  91  91   82
006  小周  81  89  92   76
007  小杨  79  82  81   69
     姓名 考试1 考试2 考试3  考试4
001  小陈  85  95  92   72
002  小李  91  92  94   69
003  小王  86  81  89   79
004  小张  79  89  95   83
005  小赵  96  91  91   82
     姓名 考试1 考试3
001  小陈  85  92
002  小李  91  94
003  小王  86  89
004  小张  79  95
005  小赵  96  91
006  小周  81  92
007  小杨  79  81
Empty DataFrame
Columns: [姓名, 考试1, 考试2, 考试3, 考试4]
Index: []
     姓名

In [33]:
students

Unnamed: 0,姓名,考试1,考试2,考试3,考试4
1,小陈,85,95,92,82
2,小李,91,92,94,79
3,小王,86,81,89,89
4,小张,79,89,95,93
5,小赵,96,91,91,92
6,小周,81,89,92,86
7,小杨,79,82,81,79


## 丝滑操作3

In [45]:
students_data = {
    "001": {"姓名": "小陈", "考试1": 85, "考试2": 95, "考试3": 92},
    "002": {"姓名": "小李", "考试1": 91, "考试2": 92, "考试3": 94},
    "003": {"姓名": "小王", "考试1": 86, "考试2": 81, "考试3": 89},
    "004": {"姓名": "小张", "考试1": 79, "考试2": 89, "考试3": 95},
    "005": {"姓名": "小赵", "考试1": 96, "考试2": 91, "考试3": 91},
    "006": {"姓名": "小周", "考试1": 81, "考试2": 89, "考试3": 92}
}
students = pd.DataFrame(students_data)
students = students.T
students

Unnamed: 0,姓名,考试1,考试2,考试3
1,小陈,85,95,92
2,小李,91,92,94
3,小王,86,81,89
4,小张,79,89,95
5,小赵,96,91,91
6,小周,81,89,92


In [58]:
average = students.loc[:,"考试1":"考试3"].mean(axis=1)
name = students["姓名"]
students_average = pd.DataFrame({"姓名":name,"平均分":average})
students_average

Unnamed: 0,姓名,平均分
1,小陈,90.666667
2,小李,92.333333
3,小王,85.333333
4,小张,87.666667
5,小赵,92.666667
6,小周,87.333333


In [64]:
students.loc[:,"考试1":"考试3"].apply(lambda x:np.sort(x)[-2])

考试1    91
考试2    92
考试3    94
dtype: int64

In [65]:
students

Unnamed: 0,姓名,考试1,考试2,考试3
1,小陈,85,95,92
2,小李,91,92,94
3,小王,86,81,89
4,小张,79,89,95
5,小赵,96,91,91
6,小周,81,89,92


In [66]:
def get_grade(score):
    if score >= 95:
        return "A+"
    elif score >= 90:
        return "A"
    elif score >= 85:
        return "B+"
    elif score >= 80:
        return "B"
    elif score >= 75:
        return "C+"
    else:
        return "C"

In [69]:
students.loc[:,"考试1":"考试3"].map(get_grade)

Unnamed: 0,考试1,考试2,考试3
1,B+,A+,A
2,A,A,A
3,B+,B,B+
4,C+,B+,A+
5,A+,A,A
6,B,B+,A


In [73]:
students.describe()

Unnamed: 0,姓名,考试1,考试2,考试3
count,6,6,6,6
unique,6,6,5,5
top,小陈,85,89,92
freq,1,1,2,2


In [75]:
students["考试1"] = students["考试1"].astype("int")
students["考试2"] = students["考试2"].astype("int")
students["考试3"] = students["考试3"].astype("int")

In [76]:
students.describe()

Unnamed: 0,考试1,考试2,考试3
count,6.0,6.0,6.0
mean,86.333333,89.5,92.166667
std,6.314006,4.722288,2.136976
min,79.0,81.0,89.0
25%,82.0,89.0,91.25
50%,85.5,90.0,92.0
75%,89.75,91.75,93.5
max,96.0,95.0,95.0
