In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

# read files 读取文件

In [2]:
# Load the sample CSV with pandas' defaults; the first line supplies the column names.
df_csv = pd.read_csv(filepath_or_buffer="./data/my_csv.csv")

In [3]:
df_csv

Unnamed: 0,col1,col2,col3,col4,col5
0,2,a,1.4,apple,2020/1/1
1,3,b,3.4,banana,2020/1/2
2,6,c,2.5,orange,2020/1/5
3,5,d,3.2,lemon,2020/1/7


In [4]:
# Load the plain-text table with read_table's defaults.
df_txt = pd.read_table(filepath_or_buffer="./data/my_table.txt")

In [5]:
df_txt

Unnamed: 0,col1,col2,col3,col4
0,2,a,1.4,apple 2020/1/1
1,3,b,3.4,banana 2020/1/2
2,6,c,2.5,orange 2020/1/5
3,5,d,3.2,lemon 2020/1/7


In [6]:
# Load the workbook with read_excel's defaults.
df_excel = pd.read_excel(io="./data/my_excel.xlsx")

In [7]:
df_excel

Unnamed: 0,col1,col2,col3,col4,col5
0,2,a,1.4,apple,2020/1/1
1,3,b,3.4,banana,2020/1/2
2,6,c,2.5,orange,2020/1/5
3,5,d,3.2,lemon,2020/1/7


## no header

In [8]:
# header=None: the first line is treated as data, so columns are numbered 0..n-1
# and the original names (col1, col2, ...) show up as the first data row.
pd.read_table(
    "./data/my_table.txt",
    header=None,
)

Unnamed: 0,0,1,2,3
0,col1,col2,col3,col4
1,2,a,1.4,apple 2020/1/1
2,3,b,3.4,banana 2020/1/2
3,6,c,2.5,orange 2020/1/5
4,5,d,3.2,lemon 2020/1/7


## index_col=[  ]

In [10]:
# Promote col1 and col5 to a two-level row index while reading.
pd.read_csv(
    "./data/my_csv.csv",
    index_col=["col1", "col5"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,col2,col3,col4
col1,col5,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,2020/1/1,a,1.4,apple
3,2020/1/2,b,3.4,banana
6,2020/1/5,c,2.5,orange
5,2020/1/7,d,3.2,lemon


## usecols=[   ]

In [13]:
# Keep only the listed columns; everything else in the file is dropped on load.
pd.read_table(
    "./data/my_table.txt",
    usecols=["col1", "col2"],
)

Unnamed: 0,col1,col2
0,2,a
1,3,b
2,6,c
3,5,d


In [14]:
# usecols works the same way for Excel files: load col1 only.
pd.read_excel(
    "./data/my_excel.xlsx",
    usecols=["col1"],
)

Unnamed: 0,col1
0,2
1,3
2,6
3,5


## parse_dates=[   ]

In [15]:
# Convert col5 from text such as 2020/1/1 into real dates while reading.
pd.read_csv(
    "./data/my_csv.csv",
    parse_dates=["col5"],
)

Unnamed: 0,col1,col2,col3,col4,col5
0,2,a,1.4,apple,2020-01-01
1,3,b,3.4,banana,2020-01-02
2,6,c,2.5,orange,2020-01-05
3,5,d,3.2,lemon,2020-01-07


## nrows= a number

In [18]:
# nrows limits how many data rows are read (here: the first three).
# The former parse_dates=["col4"] argument was dropped: col4 holds text like
# "apple 2020/1/1", which cannot be parsed as a date, so it had no effect on
# the result and only distracted from the nrows demonstration.
pd.read_table("./data/my_table.txt", nrows=3)

Unnamed: 0,col1,col2,col3,col4
0,2,a,1.4,apple 2020/1/1
1,3,b,3.4,banana 2020/1/2
2,6,c,2.5,orange 2020/1/5


## sep= 与 engine= ：特殊分隔符的 txt 文件


In [19]:
# With the default separator the " |||| " delimiter is not recognised,
# so every line ends up in one single column.
pd.read_table(filepath_or_buffer="./data/my_table_special_sep.txt")

Unnamed: 0,col1 |||| col2
0,TS |||| This is an apple.
1,GQ |||| My name is Bob.
2,WT |||| Well done!
3,PT |||| May I help you?


In [20]:
# A multi-character separator is interpreted as a regular expression and needs
# the python engine. "|" is a regex metacharacter, so it must be escaped, and
# the pattern is written as a raw string so that "\|" is not treated as an
# (invalid) Python string escape. The surrounding \s* absorbs the spaces that
# sit on both sides of "||||" in the file, keeping them out of the column
# names and values.
pd.read_table(
    "./data/my_table_special_sep.txt",
    sep=r"\s*\|{4}\s*",
    engine="python",
)

Unnamed: 0,col1,col2
0,TS,This is an apple.
1,GQ,My name is Bob.
2,WT,Well done!
3,PT,May I help you?


# write files 保存数据

In [22]:
# Make sure the output folder exists first: the writers below do not create
# missing parent directories, so this cell would fail on a fresh checkout.
Path("./data/chao").mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the written files.
df_csv.to_csv("./data/chao/writeFile.csv", index=False)
df_excel.to_excel("./data/chao/writeFile.xlsx", index=False)

## 没有 to_table， 但是可以用 to_csv代替

In [23]:
# There is no to_table(); to_csv with a tab separator writes the text table instead.
df_txt.to_csv(
    "./data/chao/table.txt",
    sep="\t",
    index=False,
)

# 基本数据结构

## Series 一维数据

In [2]:
# A three-element Series: integer values stored with object dtype, a named
# index ("Finland") holding the labels, and a name for the Series itself.
ser1 = pd.Series(
    [26, 17, 29],
    index=pd.Index(["Chao", "Apurva", "Vova"], name="Finland"),
    name="My first Series",
    dtype="object",
)

In [3]:
ser1

Finland
Chao      26
Apurva    17
Vova      29
Name: My first Series, dtype: object

## access attributes

In [4]:
ser1.name

'My first Series'

In [5]:
ser1.index

Index(['Chao', 'Apurva', 'Vova'], dtype='object', name='Finland')

In [6]:
ser1.values

array([26, 17, 29], dtype=object)

In [7]:
ser1.dtype

dtype('O')

In [8]:
ser1.shape

(3,)

In [9]:
ser1.size

3

In [10]:
ser1.ndim

1

# DataFrame 二维数据

In [16]:
# Ingredients for the demo frame: one name per column, one label per row,
# and five rows of three integers each.
cols = ["one hundred", "two hundred", "three hundred"]
index = pd.Index(["a", "b", "c", "d", "e"])
data = [
    [100, 200, 300],
    [101, 201, 301],
    [102, 202, 303],
    [103, 203, 303],
    [104, 204, 304],
]

In [17]:
# Assemble the frame from the values, row labels and column names prepared above.
df1 = pd.DataFrame(
    data,
    index=index,
    columns=cols,
)

In [18]:
df1

Unnamed: 0,one hundred,two hundred,three hundred
a,100,200,300
b,101,201,301
c,102,202,303
d,103,203,303
e,104,204,304


## access to df attributes

In [19]:
df1.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [20]:
df1.values

array([[100, 200, 300],
       [101, 201, 301],
       [102, 202, 303],
       [103, 203, 303],
       [104, 204, 304]], dtype=int64)

In [21]:
df1.columns

Index(['one hundred', 'two hundred', 'three hundred'], dtype='object')

In [22]:
df1.shape

(5, 3)

In [23]:
df1.ndim

2

In [24]:
df1.size

15

# basic functions 基本函数

In [25]:
# Load the data set used throughout the rest of the notebook.
df = pd.read_csv(filepath_or_buffer="./data/learn_pandas.csv")

In [26]:
df

Unnamed: 0,School,Grade,Name,Gender,Height,Weight,Transfer,Test_Number,Test_Date,Time_Record
0,Shanghai Jiao Tong University,Freshman,Gaopeng Yang,Female,158.9,46.0,N,1,2019/10/5,0:04:34
1,Peking University,Freshman,Changqiang You,Male,166.5,70.0,N,1,2019/9/4,0:04:20
2,Shanghai Jiao Tong University,Senior,Mei Sun,Male,188.9,89.0,N,2,2019/9/12,0:05:22
3,Fudan University,Sophomore,Xiaojuan Sun,Female,,41.0,N,2,2020/1/3,0:04:08
4,Fudan University,Sophomore,Gaojuan You,Male,174.0,74.0,N,2,2019/11/6,0:05:22
...,...,...,...,...,...,...,...,...,...,...
195,Fudan University,Junior,Xiaojuan Sun,Female,153.9,46.0,N,2,2019/10/17,0:04:31
196,Tsinghua University,Senior,Li Zhao,Female,160.9,50.0,N,3,2019/9/22,0:04:03
197,Shanghai Jiao Tong University,Senior,Chengqiang Chu,Female,153.9,45.0,N,1,2020/1/5,0:04:48
198,Shanghai Jiao Tong University,Senior,Chengmei Shen,Male,175.3,71.0,N,2,2020/1/7,0:04:58


In [27]:
df.shape

(200, 10)

In [28]:
df.ndim

2

In [29]:
df.size

2000

In [30]:
df.columns

Index(['School', 'Grade', 'Name', 'Gender', 'Height', 'Weight', 'Transfer',
       'Test_Number', 'Test_Date', 'Time_Record'],
      dtype='object')

In [31]:
df.index

RangeIndex(start=0, stop=200, step=1)

# slicing a df 切割，只取前7列

In [34]:
# Keep only the first seven columns (School ... Transfer). The name df is
# rebound to the narrower frame, which every later cell relies on.
first_seven = df.columns[:7]
df = df.loc[:, first_seven]

In [35]:
df

Unnamed: 0,School,Grade,Name,Gender,Height,Weight,Transfer
0,Shanghai Jiao Tong University,Freshman,Gaopeng Yang,Female,158.9,46.0,N
1,Peking University,Freshman,Changqiang You,Male,166.5,70.0,N
2,Shanghai Jiao Tong University,Senior,Mei Sun,Male,188.9,89.0,N
3,Fudan University,Sophomore,Xiaojuan Sun,Female,,41.0,N
4,Fudan University,Sophomore,Gaojuan You,Male,174.0,74.0,N
...,...,...,...,...,...,...,...
195,Fudan University,Junior,Xiaojuan Sun,Female,153.9,46.0,N
196,Tsinghua University,Senior,Li Zhao,Female,160.9,50.0,N
197,Shanghai Jiao Tong University,Senior,Chengqiang Chu,Female,153.9,45.0,N
198,Shanghai Jiao Tong University,Senior,Chengmei Shen,Male,175.3,71.0,N


## tail head 查看前后几行，默认五行

In [36]:
# First six rows instead of the default five.
df.head(n=6)

Unnamed: 0,School,Grade,Name,Gender,Height,Weight,Transfer
0,Shanghai Jiao Tong University,Freshman,Gaopeng Yang,Female,158.9,46.0,N
1,Peking University,Freshman,Changqiang You,Male,166.5,70.0,N
2,Shanghai Jiao Tong University,Senior,Mei Sun,Male,188.9,89.0,N
3,Fudan University,Sophomore,Xiaojuan Sun,Female,,41.0,N
4,Fudan University,Sophomore,Gaojuan You,Male,174.0,74.0,N
5,Tsinghua University,Freshman,Xiaoli Qian,Female,158.0,51.0,N


In [37]:
# Last ten rows instead of the default five.
df.tail(n=10)

Unnamed: 0,School,Grade,Name,Gender,Height,Weight,Transfer
190,Shanghai Jiao Tong University,Junior,Changli Qin,Male,177.3,,N
191,Tsinghua University,Junior,Li Sun,Female,166.6,54.0,N
192,Shanghai Jiao Tong University,Senior,Gaojuan Wang,Male,166.8,70.0,N
193,Tsinghua University,Senior,Xiaoqiang Qin,Male,193.9,79.0,N
194,Peking University,Senior,Yanmei Qian,Female,160.3,49.0,
195,Fudan University,Junior,Xiaojuan Sun,Female,153.9,46.0,N
196,Tsinghua University,Senior,Li Zhao,Female,160.9,50.0,N
197,Shanghai Jiao Tong University,Senior,Chengqiang Chu,Female,153.9,45.0,N
198,Shanghai Jiao Tong University,Senior,Chengmei Shen,Male,175.3,71.0,N
199,Tsinghua University,Sophomore,Chunpeng Lv,Male,155.7,51.0,N


## 使用 .info()  .describe() 查看表信息，数据概述

In [40]:
# Structural summary: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   School    200 non-null    object 
 1   Grade     200 non-null    object 
 2   Name      200 non-null    object 
 3   Gender    200 non-null    object 
 4   Height    183 non-null    float64
 5   Weight    189 non-null    float64
 6   Transfer  188 non-null    object 
dtypes: float64(2), object(5)
memory usage: 11.1+ KB


In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Height,Weight
count,183.0,189.0
mean,163.218033,55.015873
std,8.608879,12.824294
min,145.4,34.0
25%,157.15,46.0
50%,161.9,51.0
75%,167.5,65.0
max,193.9,89.0


## max min mean median sum quantile

In [42]:
# Two numeric columns are enough to demonstrate the statistics below.
numeric_cols = ["Height", "Weight"]
df_demo = df[numeric_cols]

In [43]:
df_demo

Unnamed: 0,Height,Weight
0,158.9,46.0
1,166.5,70.0
2,188.9,89.0
3,,41.0
4,174.0,74.0
...,...,...
195,153.9,46.0
196,160.9,50.0
197,153.9,45.0
198,175.3,71.0


In [46]:
# Largest value in each column; the missing entries in the data are ignored.
df_demo.max()

Height    193.9
Weight     89.0
dtype: float64

In [47]:
# Smallest value in each column.
df_demo.min()

Height    145.4
Weight     34.0
dtype: float64

In [48]:
# Arithmetic mean of each column.
df_demo.mean()

Height    163.218033
Weight     55.015873
dtype: float64

In [49]:
# Median of each column.
df_demo.median()

Height    161.9
Weight     51.0
dtype: float64

In [50]:
# Row index label at which each column reaches its maximum.
df_demo.idxmax()

Height    193
Weight      2
dtype: int64

In [53]:
# 193 is the index *label* reported by idxmax() for Height, so the row is
# fetched by label with .loc. Positional .iloc only gives the same row while
# the frame keeps its default 0..n-1 index.
df_demo.loc[193, :]

Height    193.9
Weight     79.0
Name: 193, dtype: float64

In [54]:
# 2 is the index *label* reported by idxmax() for Weight, so the row is
# fetched by label with .loc. Positional .iloc only gives the same row while
# the frame keeps its default 0..n-1 index.
df_demo.loc[2, :]

Height    188.9
Weight     89.0
Name: 2, dtype: float64

In [52]:
# Row index label at which each column reaches its minimum.
df_demo.idxmin()

Height    143
Weight     49
dtype: int64

In [55]:
# 143 is the index *label* reported by idxmin() for Height, so the row is
# fetched by label with .loc. Positional .iloc only gives the same row while
# the frame keeps its default 0..n-1 index.
df_demo.loc[143, :]

Height    145.4
Weight     34.0
Name: 143, dtype: float64

In [56]:
# 49 is the index *label* reported by idxmin() for Weight, so the row is
# fetched by label with .loc. Positional .iloc only gives the same row while
# the frame keeps its default 0..n-1 index.
df_demo.loc[49, :]

Height    147.3
Weight     34.0
Name: 49, dtype: float64

# slicing with [], or [[]]

In [72]:
# A scalar position inside iloc[...] returns that row as a Series.
row_as_series = df.iloc[1]
print("use single []", "\n")
print(row_as_series, "\n")
print(type(row_as_series))

use single [] 

School      Peking University
Grade                Freshman
Name           Changqiang You
Gender                   Male
Height                  166.5
Weight                   70.0
Transfer                    N
Name: 1, dtype: object 

<class 'pandas.core.series.Series'>


In [73]:
# A one-element list inside iloc[...] keeps the result a DataFrame.
row_as_frame = df.iloc[[1]]
print("use double [[]]", "\n")
print(row_as_frame, "\n")
print(type(row_as_frame))

use double [[]] 

              School     Grade            Name Gender  Height  Weight Transfer
1  Peking University  Freshman  Changqiang You   Male   166.5    70.0        N 

<class 'pandas.core.frame.DataFrame'>


In [74]:
# Row positions 1 through 100; the stop position 101 is excluded.
row_block = df.iloc[1:101]
print("use single []", "\n")
print(row_block, "\n")
print(type(row_block))

use single [] 

                            School      Grade            Name  Gender  Height  \
1                Peking University   Freshman  Changqiang You    Male   166.5   
2    Shanghai Jiao Tong University     Senior         Mei Sun    Male   188.9   
3                 Fudan University  Sophomore    Xiaojuan Sun  Female     NaN   
4                 Fudan University  Sophomore     Gaojuan You    Male   174.0   
5              Tsinghua University   Freshman     Xiaoli Qian  Female   158.0   
..                             ...        ...             ...     ...     ...   
96               Peking University   Freshman   Changmei Feng  Female   163.8   
97             Tsinghua University  Sophomore   Xiaoqiang Qin  Female   160.8   
98                Fudan University  Sophomore    Xiaojuan Chu    Male     NaN   
99               Peking University   Freshman  Changpeng Zhao    Male   181.3   
100            Tsinghua University     Senior    Xiaofeng Shi  Female   164.4   

     Weight

In [79]:
# All rows, column positions 1 through 6 (Grade ... Transfer); still a DataFrame.
column_block = df.iloc[:, 1:7]
print("use single []", "\n")
print(column_block, "\n")
print(type(column_block))

use single [] 

         Grade            Name  Gender  Height  Weight Transfer
0     Freshman    Gaopeng Yang  Female   158.9    46.0        N
1     Freshman  Changqiang You    Male   166.5    70.0        N
2       Senior         Mei Sun    Male   188.9    89.0        N
3    Sophomore    Xiaojuan Sun  Female     NaN    41.0        N
4    Sophomore     Gaojuan You    Male   174.0    74.0        N
..         ...             ...     ...     ...     ...      ...
195     Junior    Xiaojuan Sun  Female   153.9    46.0        N
196     Senior         Li Zhao  Female   160.9    50.0        N
197     Senior  Chengqiang Chu  Female   153.9    45.0        N
198     Senior   Chengmei Shen    Male   175.3    71.0        N
199  Sophomore     Chunpeng Lv    Male   155.7    51.0        N

[200 rows x 6 columns] 

<class 'pandas.core.frame.DataFrame'>


In [80]:
# A one-element list in the column slot returns a single-column DataFrame.
column_as_frame = df.iloc[:, [1]]
print("use double [[]]", "\n")
print(column_as_frame, "\n")
print(type(column_as_frame))

use double [[]] 

         Grade
0     Freshman
1     Freshman
2       Senior
3    Sophomore
4    Sophomore
..         ...
195     Junior
196     Senior
197     Senior
198     Senior
199  Sophomore

[200 rows x 1 columns] 

<class 'pandas.core.frame.DataFrame'>


In [81]:
# A scalar in the column slot returns that column as a Series.
column_as_series = df.iloc[:, 1]
print("use single []", "\n")
print(column_as_series, "\n")
print(type(column_as_series))

use single [] 

0       Freshman
1       Freshman
2         Senior
3      Sophomore
4      Sophomore
         ...    
195       Junior
196       Senior
197       Senior
198       Senior
199    Sophomore
Name: Grade, Length: 200, dtype: object 

<class 'pandas.core.series.Series'>


# 单[] ，双[[]] 总结。 <br><br>当选取单独一行 (row)，或者单独一列 (column) 时， <br><br>单[] 返回 Series， 双[[]] 返回 DataFrame

In [85]:
# Pair each selection with its caption, then print the resulting type followed
# by the same blank-line spacer for every case.
cases = [
    (df.iloc[1, :], "取一行 单[]"),
    (df.iloc[[1], :], "取一行 双[[]]"),
    (df.iloc[:, 1], "取一列 单[]"),
    (df.iloc[:, [1]], "取一列 双[[]]"),
]
for selected, caption in cases:
    print(type(selected), caption)
    print("\n\n\n")

<class 'pandas.core.series.Series'> 取一行 单[]




<class 'pandas.core.frame.DataFrame'> 取一行 双[[]]




<class 'pandas.core.series.Series'> 取一列 单[]




<class 'pandas.core.frame.DataFrame'> 取一列 双[[]]






In [61]:
# Distinct values found in the School column.
df["School"].unique()

array(['Shanghai Jiao Tong University', 'Peking University',
       'Fudan University', 'Tsinghua University'], dtype=object)

In [62]:
# Number of distinct values in the School column.
df["School"].nunique()

4