<div style="text-align: right">Excel to Python: Chapters 7, 8 &amp; 9 — Data Aggregation, Statistics, and Output</div>
<div style="text-align: right">Zixiao, with material from 从Excel到Python</div>
<div style="text-align: right">December 21, 2019</div>

In [1]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is rendered (None disables the column limit)
pd.set_option("display.max_columns", None)

In [2]:
# Load the PRSA Aotizhongxin CSV file.
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapper is needed.
# NOTE: `df` is reassigned to a hand-built toy frame in the next cell.
df = pd.read_csv('./data/PRSA_Data_20130301-20170228/PRSA_Data_Aotizhongxin_20130301-20170228.csv')
# Excel equivalent (the function name is read_excel, all lower case):
# df = pd.read_excel('name.xlsx')

In [3]:
# Small hand-made data set used by the rest of the notebook.
# The city strings carry stray spaces and mixed case, and two prices are NaN.
df = pd.DataFrame(
    dict(
        id=[1001, 1002, 1003, 1004, 1005, 1006],
        date=pd.date_range(start='20130102', periods=6),
        city=['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        category=['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        age=[23, 44, 54, 32, 34, 32],
        price=[1200, np.nan, 2133, 5433, np.nan, 4432],
    ),
    columns=['id', 'date', 'city', 'category', 'age', 'price'],
)

In [4]:
# Show the toy frame as built, before any cleaning is applied
df

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,Beijing,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


# Chapter 7 Data Aggregation

## Grouped Aggregation & Data Summary

In [5]:
# Normalise city names: trim surrounding whitespace, then lower-case.
# Both steps use the vectorised .str accessor so that a missing or non-string
# city propagates as NaN instead of raising inside str.strip.
df['city'] = df['city'].str.strip().str.lower()

In [6]:
# Per city, count the non-null values in every other column
df.groupby("city").count()

Unnamed: 0_level_0,id,date,category,age,price
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
beijing,2,2,2,2,2
guangzhou,1,1,1,1,1
sh,1,1,1,1,0
shanghai,1,1,1,1,0
shenzhen,1,1,1,1,1


In [7]:
# Group by city and count the non-null id values in each group
df.groupby("city")["id"].count()

city
beijing      2
guangzhou    1
sh           1
shanghai     1
shenzhen     1
Name: id, dtype: int64

In [8]:
# Group by two keys: count non-null values for each (city, date) pair
df.groupby(by=['city', 'date']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,category,age,price
city,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
beijing,2013-01-02,1,1,1,1
beijing,2013-01-07,1,1,1,1
guangzhou,2013-01-04,1,1,1,1
sh,2013-01-03,1,1,1,0
shanghai,2013-01-06,1,1,1,0
shenzhen,2013-01-05,1,1,1,1


In [9]:
# Per city: number of rows (len, which also counts rows whose price is missing),
# plus the sum and mean of price.
# 'sum' and 'mean' are passed by name: handing np.sum / np.mean to agg() is
# deprecated in recent pandas, and the string forms yield the same column labels.
df.groupby(['city'])['price'].agg([len, 'sum', 'mean'])

Unnamed: 0_level_0,len,sum,mean
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
beijing,2.0,5632.0,2816.0
guangzhou,1.0,2133.0,2133.0
sh,1.0,0.0,
shanghai,1.0,0.0,
shenzhen,1.0,5433.0,5433.0


## Pivot Table

In [10]:
# Split each "<code>-<letter>" category into two string columns: 'numbers' and 'size'.
# The vectorised str.split keeps df's index and passes missing categories through
# as NaN instead of failing on them.
df_temp = df['category'].str.split('-', expand=True)
df_temp.columns = ['numbers', 'size']
df_temp['id'] = df['id']
# Merge on the id key explicitly. Any 'numbers'/'size' columns left from a previous
# run are dropped first so re-running the cell does not create *_x / *_y duplicates.
df = pd.merge(df.drop(columns=['numbers', 'size'], errors='ignore'), df_temp, on='id')
# Replace missing prices with the mean of the observed prices
df['price'] = df['price'].fillna(value=df['price'].mean())

In [11]:
# Pivot table: city as the row field, size as the column field; for price, show the
# number of rows (len) and the total ('sum') in each cell.
# fill_value=0 fills empty cells and margins=True adds the "All" totals row and column.
# 'sum' is passed by name because np.sum as an aggfunc is deprecated in recent pandas.
pd.pivot_table(df, index=['city'], columns=['size'], values=['price'],
               aggfunc=[len, 'sum'], fill_value=0, margins=True)

Unnamed: 0_level_0,len,len,len,len,len,sum,sum,sum,sum,sum
Unnamed: 0_level_1,price,price,price,price,price,price,price,price,price,price
size,A,B,C,F,All,A,B,C,F,All
city,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
beijing,1,0,0,1,2.0,1200.0,0.0,0,4432,5632.0
guangzhou,1,0,0,0,1.0,2133.0,0.0,0,0,2133.0
sh,0,1,0,0,1.0,0.0,3299.5,0,0,3299.5
shanghai,1,0,0,0,1.0,3299.5,0.0,0,0,3299.5
shenzhen,0,0,1,0,1.0,0.0,0.0,5433,0,5433.0
All,3,1,1,1,6.0,6632.5,3299.5,5433,4432,19797.0


# Chapter 8 Data Statistics

## Data Sampling

In [14]:
# Preview the frame after the category split and price imputation
# (head(10) returns up to 10 rows; this frame only has 6)
df.head(10)

Unnamed: 0,id,date,city,category,age,price,numbers,size
0,1001,2013-01-02,beijing,100-A,23,1200.0,100,A
1,1002,2013-01-03,sh,100-B,44,3299.5,100,B
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,110,A
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,110,C
4,1005,2013-01-06,shanghai,210-A,34,3299.5,210,A
5,1006,2013-01-07,beijing,130-F,32,4432.0,130,F


In [12]:
# Draw n=3 rows at random (the rows returned differ from run to run)
df.sample(n=3)

Unnamed: 0,id,date,city,category,age,price,numbers,size
0,1001,2013-01-02,beijing,100-A,23,1200.0,100,A
4,1005,2013-01-06,shanghai,210-A,34,3299.5,210,A
5,1006,2013-01-07,beijing,130-F,32,4432.0,130,F


In [20]:
# Weighted sampling: one weight per row, in row order.
# Rows with a higher weight are more likely to be selected; here only rows 2 and 4
# have a non-zero weight.
# NOTE(review): `weigths` is a misspelling of "weights"; the name is kept unchanged
# in case other cells refer to it.
weigths = [0, 0, 0.5, 0, 0.5, 0]
df.sample(n=2, weights=weigths)

Unnamed: 0,id,date,city,category,age,price,numbers,size
4,1005,2013-01-06,shanghai,210-A,34,3299.5,210,A
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,110,A


In [23]:
# Sampling without replacement: each row can be drawn at most once,
# so n=6 returns all six rows in shuffled order
df.sample(n=6, replace=False)

Unnamed: 0,id,date,city,category,age,price,numbers,size
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,110,C
4,1005,2013-01-06,shanghai,210-A,34,3299.5,210,A
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,110,A
5,1006,2013-01-07,beijing,130-F,32,4432.0,130,F
0,1001,2013-01-02,beijing,100-A,23,1200.0,100,A
1,1002,2013-01-03,sh,100-B,44,3299.5,100,B


In [24]:
# Bootstrap sampling (with replacement): the same row may be drawn more than once
df.sample(n=6, replace=True)

Unnamed: 0,id,date,city,category,age,price,numbers,size
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,110,A
0,1001,2013-01-02,beijing,100-A,23,1200.0,100,A
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,110,C
1,1002,2013-01-03,sh,100-B,44,3299.5,100,B
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,110,A
1,1002,2013-01-03,sh,100-B,44,3299.5,100,B


## Data Description

In [28]:
# Summary statistics from describe(), rounded to two decimal places and then
# transposed so that each described column becomes one row
(
    df.describe()
    .round(2)
    .T
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,6.0,1003.5,1.87,1001.0,1002.25,1003.5,1004.75,1006.0
age,6.0,36.5,10.88,23.0,32.0,33.0,41.5,54.0
price,6.0,3299.5,1523.35,1200.0,2424.62,3299.5,4148.88,5433.0


In [29]:
# Sample standard deviation of price (n - 1 denominator).
# Missing prices were replaced by the mean in an earlier cell, so those rows
# contribute zero deviation to this figure.
df['price'].std()

1523.3516337339847

In [30]:
# Sample covariance between the price and age columns
df['price'].cov(df['age'])

-1353.5

In [31]:
# Pairwise covariance matrix of the numeric columns.
# Non-numeric columns (date, city, category, ...) are filtered out explicitly:
# recent pandas no longer drops them silently and raises on the string columns.
df.select_dtypes(include='number').cov()

Unnamed: 0,id,age,price
id,3.5,-0.7,1946.0
age,-0.7,118.3,-1353.5
price,1946.0,-1353.5,2320600.2


In [32]:
# Pairwise correlation matrix (corr() default method) of the numeric columns.
# Non-numeric columns (date, city, category, ...) are filtered out explicitly:
# recent pandas no longer drops them silently and raises on the string columns.
df.select_dtypes(include='number').corr()

Unnamed: 0,id,age,price
id,1.0,-0.034401,0.682824
age,-0.034401,1.0,-0.081689
price,0.682824,-0.081689,1.0


# Chapter 9 Data Output

In [34]:
# Output as Excel: write the frame to sheet 'test1' of data/Excel_test.xlsx
df.to_excel('data/Excel_test.xlsx',sheet_name='test1')

In [35]:
# Output as CSV: write the frame to data/Csv_test.csv
df.to_csv('data/Csv_test.csv')