In [2]:
# 导包
import pandas as pd
import numpy as np

# Series对象的创建及常用属性

In [3]:
# Approach 1: Python list -> Series. No index is supplied, so pandas assigns
# the default integer labels 0, 1, 2.
s1 = pd.Series(data=['a', 'b', 'c'])
print(s1)

0    a
1    b
2    c
dtype: object


In [4]:
# Approach 2: Python list -> Series with explicit row labels passed via `index`.
s2 = pd.Series(
    data=['a', 'b', 'c'],
    index=['aa', 'bb', 'cc'],
)
s2

aa    a
bb    b
cc    c
dtype: object

In [5]:
# Approach 3: a tuple can be the data source as well. The values mix str and
# int, so the resulting Series has dtype object.
s3 = pd.Series(
    data=('aa', 'man', 23),
    index=['name', 'gender', 'age'],
)
s3

name       aa
gender    man
age        23
dtype: object

In [6]:
# Approach 4: dict -> Series. The dict keys become the index labels and the
# dict values become the data, in insertion order.
s4 = pd.Series(dict(name="aa", gender="man", age=23))
s4

name       aa
gender    man
age        23
dtype: object

In [7]:
# Approach 5: NumPy ndarray -> Series.
arr = np.arange(0, 5)
# Show the raw array and its type before wrapping it in a Series.
print(arr, type(arr))
s5 = pd.Series(data=arr)
s5

[0 1 2 3 4] <class 'numpy.ndarray'>


0    0
1    1
2    2
3    3
4    4
dtype: int64

## Series 对象的常用属性

In [8]:
# `data` supplies the values and `index` supplies the row labels:
# the integers 0..5 are labelled 'a'..'f'.
s6 = pd.Series(data=list(range(6)), index=list("abcdef"))
s6

a    0
b    1
c    2
d    3
e    4
f    5
dtype: int64

In [9]:
# Demonstrate the commonly used Series attributes and a label-based lookup.
print(s6.index)     # the index (row labels)
print(s6.values)    # the data values
print(s6.loc['d'])  # the value stored under label 'd'

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
[0 1 2 3 4 5]
3


# DataFrame对象创建及常用属性

In [10]:
# dict -> DataFrame
# Sample data set: every key/value pair of the dict supplies one column.
data_dict = {
    '日期': ['2024-12-19', '2024-12-20', '2024-12-21'],
    '温度': [8, 5, 7],
    '湿度': [65, 70, 60],
}

# The explicit `index` argument labels the rows 'a', 'b', 'c' instead of the
# default 0..n-1 integer index.
df1 = pd.DataFrame(data_dict, index=list('abc'))
df1


Unnamed: 0,日期,温度,湿度
a,2024-12-19,8,65
b,2024-12-20,5,70
c,2024-12-21,7,60


In [11]:
# Approach 2: list of tuples -> DataFrame, where every tuple becomes one row.
data_list = [
    ("2024-12-19", 8, 65),
    ("2024-12-20", 5, 70),
    ("2024-12-21", 7, 50)
]

# Column names and row labels are passed explicitly because the tuples carry neither.
# sort_index() returns a new, sorted DataFrame instead of sorting in place, so its
# result must be bound to a name; called as a bare statement the result is discarded.
df2 = pd.DataFrame(
    data=data_list,
    columns=["日期", "温度", "湿度"],
    index=['A', 'B', "C"],
).sort_index()
df2

Unnamed: 0,日期,温度,湿度
A,2024-12-19,8,65
B,2024-12-20,5,70
C,2024-12-21,7,50


In [12]:
# A nested list with two rows yields a two-dimensional ndarray;
# `ndim` reports the number of dimensions.
arr = np.array([
    [1, 2],
    [3, 4],
])
print(arr.ndim)

2


# DataFrame的属性

In [13]:
# Build a DataFrame holding the scores of 10 students in five subjects.
# A seeded generator keeps the "random" scores identical on every
# Restart & Run All; integers() excludes the upper bound, so scores lie in 40..99.
score_data = np.random.default_rng(seed=42).integers(40, 100, size=(10, 5))

# Wrap the raw array; at this point rows and columns carry default integer labels.
score_df = pd.DataFrame(score_data)
print(score_df)

# Replace the default labels with subject names (columns) and student names (rows).
col_name = ["语", "外", "物", "化", "生"]
idx_names = ['同学' + str(i) for i in range(score_df.shape[0])]
score_df.columns = col_name
score_df.index = idx_names

print(score_df)

    0   1   2   3   4
0  99  97  81  60  57
1  75  94  65  49  88
2  58  63  96  42  40
3  90  86  91  70  66
4  76  96  62  42  67
5  55  82  95  53  70
6  83  96  43  41  65
7  80  44  83  75  57
8  50  73  59  69  90
9  69  93  58  60  79
      语   外   物   化   生
同学0  99  97  81  60  57
同学1  75  94  65  49  88
同学2  58  63  96  42  40
同学3  90  86  91  70  66
同学4  76  96  62  42  67
同学5  55  82  95  53  70
同学6  83  96  43  41  65
同学7  80  44  83  75  57
同学8  50  73  59  69  90
同学9  69  93  58  60  79


# DataFrame常用方法

In [14]:
# Preview the first five rows (5 is head()'s default row count, spelled out here).
score_df.head(n=5)

Unnamed: 0,语,外,物,化,生
同学0,99,97,81,60,57
同学1,75,94,65,49,88
同学2,58,63,96,42,40
同学3,90,86,91,70,66
同学4,76,96,62,42,67


In [15]:
# Preview the last five rows (5 is tail()'s default row count, spelled out here).
score_df.tail(n=5)

Unnamed: 0,语,外,物,化,生
同学5,55,82,95,53,70
同学6,83,96,43,41,65
同学7,80,44,83,75,57
同学8,50,73,59,69,90
同学9,69,93,58,60,79


In [16]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
score_df.describe()

Unnamed: 0,语,外,物,化,生
count,10.0,10.0,10.0,10.0,10.0
mean,73.5,82.4,73.3,56.1,67.9
std,15.700318,17.58282,18.275971,12.617889,15.088259
min,50.0,44.0,43.0,41.0,40.0
25%,60.75,75.25,59.75,43.75,59.0
50%,75.5,89.5,73.0,56.5,66.5
75%,82.25,95.5,89.0,66.75,76.75
max,99.0,97.0,96.0,75.0,90.0


In [17]:
# Structural overview: index range, column names, non-null counts, dtypes and memory usage.
score_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 同学0 to 同学9
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   语       10 non-null     int64
 1   外       10 non-null     int64
 2   物       10 non-null     int64
 3   化       10 non-null     int64
 4   生       10 non-null     int64
dtypes: int64(5)
memory usage: 480.0+ bytes
