In [1]:
import numpy as np
import pandas as pd

### 文本格式文件

#### 读操作

In [3]:
# Load a CSV file into a DataFrame
pd.read_csv(filepath_or_buffer="./data/ex1.csv")

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [13]:
# read_table parses delimited text files.
# No separator is passed here, so each whole line of the comma-separated
# file lands in a single column (compare the output below).
pd.read_table(filepath_or_buffer="./data/ex1.csv")

Unnamed: 0,"a,b,c,d,message"
0,"1,2,3,4,hello"
1,"5,6,7,8,world"
2,"9,10,11,12,python"


In [14]:
# The sep argument names the delimiter explicitly
pd.read_table(
    "./data/ex1.csv",
    sep=",",
)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [15]:
# Read a data file that has no header row.
# WARNING: with default arguments the first data row is consumed as the
# column labels, which is clearly not what we want.
pd.read_csv(filepath_or_buffer="./data/ex2.csv")

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,python


In [16]:
# Read a data file that has no header row.
# header=None makes pandas assign default column labels instead.
pd.read_csv(
    "./data/ex2.csv",
    header=None,
)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [32]:
# Read a data file that has no header row.
# names supplies custom column labels.
pd.read_csv(
    "./data/ex2.csv",
    names=["a", "b", "c", "d", "message"],
)

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,python


In [34]:
# Read a data file that has no header row.
# index_col promotes the values of one column to the row index.
pd.read_csv(
    "./data/ex2.csv",
    names=["a", "b", "c", "d", "message"],
    index_col="message",
)

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
python,9,10,11,12


In [35]:
# Build a multi-level row index from two columns of the file
pd.read_csv(
    "./data/mindex.csv",
    index_col=["k1", "k2"],
)

Unnamed: 0_level_0,Unnamed: 1_level_0,v1,v2
k1,k2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [38]:
# The header row has fewer fields than the data rows, so the first column
# is inferred as the row index.
# The separator is a regular expression; the raw string keeps the backslash
# literal instead of relying on an invalid string escape sequence.
pd.read_table("./data/ex3.txt", sep=r"\s+")

Unnamed: 0,A,B,C
aaa,1,2,3
bbb,4,5,6
ccc,7,8,9


#### 写操作

In [42]:
# Small three-column frame used as the input for the write example
df = pd.DataFrame(
    dict(
        A=[0, 1, 2, 3],
        B=[4, 5, 6, 7],
        C=[8, 9, 1, 0],
    )
)
df

Unnamed: 0,A,B,C
0,0,4,8
1,1,5,9
2,2,6,1
3,3,7,0


In [43]:
# Write the DataFrame out as a CSV file
df.to_csv(path_or_buf="./data/out_ex1.csv")

#### 分块读取

In [45]:
# Load the whole customer churn file in one go and preview the first 5 rows
customer_churn_predict = pd.read_csv(
    filepath_or_buffer="./data/customer-churn-predict.csv",
)
customer_churn_predict.head(n=5)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,2405-LBMUW,Female,0,Yes,Yes,61,No,No phone service,DSL,Yes,...,No,Yes,No,Yes,One year,Yes,Bank transfer (automatic),$50.70,"$3,088.75",No
1,3454-JFUBC,Male,1,No,No,68,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Credit card (automatic),$20.00,"$1,396.00",No
2,0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Bank transfer (automatic),$30.50,$30.50,Yes
3,9039-ZVJDC,Male,0,No,No,3,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Mailed check,$19.10,$53.05,No
4,6797-LNAQX,Male,0,Yes,Yes,70,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Two year,No,Bank transfer (automatic),$98.30,"$6,859.50",Yes


In [62]:
# Approach 1: pass chunksize so read_csv returns a chunked reader rather
# than a DataFrame.
# Only `reader` is bound here, so the `customer_churn_predict` DataFrame is
# not overwritten by the reader object.
reader = pd.read_csv("./data/customer-churn-predict.csv", chunksize=5)
type(reader)

pandas.io.parsers.readers.TextFileReader

In [63]:
# Fetch the DataFrame that corresponds to one chunk of the reader
chunk = reader.get_chunk()
print("<chunk type > ", type(chunk))
# Report the shape of the chunk that was just read
print("<chunk shape > ", chunk.shape)
# Note: the row index in this output runs from 0 to 4
chunk

<chunk type >  <class 'pandas.core.frame.DataFrame'>
<chunk shape >  (5, 21)


Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,2405-LBMUW,Female,0,Yes,Yes,61,No,No phone service,DSL,Yes,...,No,Yes,No,Yes,One year,Yes,Bank transfer (automatic),$50.70,"$3,088.75",No
1,3454-JFUBC,Male,1,No,No,68,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Credit card (automatic),$20.00,"$1,396.00",No
2,0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Bank transfer (automatic),$30.50,$30.50,Yes
3,9039-ZVJDC,Male,0,No,No,3,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Mailed check,$19.10,$53.05,No
4,6797-LNAQX,Male,0,Yes,Yes,70,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Two year,No,Bank transfer (automatic),$98.30,"$6,859.50",Yes


In [64]:
# Call get_chunk() again to fetch the next batch from the same reader
chunk = reader.get_chunk()
# Note: the row index in this output runs from 5 to 9
chunk

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
5,9013-AQORL,Female,0,No,Yes,48,No,No phone service,DSL,No,...,Yes,No,No,Yes,Month-to-month,No,Credit card (automatic),$45.55,"$2,108.35",No
6,2898-MRKPI,Male,0,Yes,Yes,68,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,One year,Yes,Credit card (automatic),$101.05,"$6,770.50",No
7,2750-BJLSB,Female,0,No,No,47,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Electronic check,$103.70,"$4,730.60",No
8,3648-GZPHF,Male,0,Yes,Yes,32,No,No phone service,DSL,No,...,Yes,Yes,No,No,One year,Yes,Mailed check,$36.25,"$1,151.05",No
9,2075-RMJIK,Female,0,Yes,Yes,5,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,$49.40,$232.55,No


In [65]:
# Approach 2: request iterator mode; the size of each chunk is then given
# per get_chunk() call.
# Only `reader` is bound here, so the `customer_churn_predict` name is not
# rebound to the reader object.
reader = pd.read_csv("./data/customer-churn-predict.csv", iterator=True)
type(reader)

pandas.io.parsers.readers.TextFileReader

In [66]:
# First get_chunk call: returns rows 0-4
part1 = reader.get_chunk(size=5)
part1

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,2405-LBMUW,Female,0,Yes,Yes,61,No,No phone service,DSL,Yes,...,No,Yes,No,Yes,One year,Yes,Bank transfer (automatic),$50.70,"$3,088.75",No
1,3454-JFUBC,Male,1,No,No,68,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,Yes,Credit card (automatic),$20.00,"$1,396.00",No
2,0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Bank transfer (automatic),$30.50,$30.50,Yes
3,9039-ZVJDC,Male,0,No,No,3,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Mailed check,$19.10,$53.05,No
4,6797-LNAQX,Male,0,Yes,Yes,70,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,Two year,No,Bank transfer (automatic),$98.30,"$6,859.50",Yes


In [67]:
# Second get_chunk call: returns rows 5-9
# NOTE(review): this rebinds `part1`, dropping the reference to the first
# chunk; a separate name such as `part2` may have been intended - confirm.
part1 = reader.get_chunk(size=5)
part1

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
5,9013-AQORL,Female,0,No,Yes,48,No,No phone service,DSL,No,...,Yes,No,No,Yes,Month-to-month,No,Credit card (automatic),$45.55,"$2,108.35",No
6,2898-MRKPI,Male,0,Yes,Yes,68,Yes,Yes,Fiber optic,No,...,No,No,Yes,Yes,One year,Yes,Credit card (automatic),$101.05,"$6,770.50",No
7,2750-BJLSB,Female,0,No,No,47,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Electronic check,$103.70,"$4,730.60",No
8,3648-GZPHF,Male,0,Yes,Yes,32,No,No phone service,DSL,No,...,Yes,Yes,No,No,One year,Yes,Mailed check,$36.25,"$1,151.05",No
9,2075-RMJIK,Female,0,Yes,Yes,5,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,$49.40,$232.55,No
