In [1]:
import pandas as pd
import sys

In [2]:
def load_df(path='data/1.csv'):
    """Read the source CSV into a new DataFrame.

    Parameters
    ----------
    path : str or path-like, default 'data/1.csv'
        CSV file to read. The default keeps zero-argument calls such as
        ``load_df().pipe(...)`` working exactly as before.

    Returns
    -------
    pandas.DataFrame
        A fresh frame on every call, decoded as UTF-8, with the
        ``birthdate`` column parsed as datetimes.
    """
    return pd.read_csv(path, encoding='utf8', parse_dates=['birthdate'])

In [3]:
# Keep one loaded copy in `df` for the exploratory cells below
# (the processing pipeline further down calls load_df() again for a fresh frame).
df = load_df()

In [4]:
# Preview the schema: dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   name         100 non-null    object        
 1   sex          100 non-null    object        
 2   birthdate    100 non-null    datetime64[ns]
 3   mail         100 non-null    object        
 4   address      100 non-null    object        
 5   blood_group  100 non-null    object        
 6   income       100 non-null    int64         
 7   consume_per  86 non-null     float64       
 8   consume      83 non-null     float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 7.2+ KB


In [7]:
# Check missing data: flag rows where consume_per or consume is NaN
condition = df[['consume_per', 'consume']].isna().any(axis='columns')

In [8]:
# Display the rows flagged by `condition`
df.loc[condition]

Unnamed: 0,name,sex,birthdate,mail,address,blood_group,income,consume_per,consume
0,臧佳,M,1991-11-01,dongmin@gmail.com,青海省杭州县永川金街j座 967342,A+,14361,0.29858,
1,张佳,M,1930-01-13,sgao@gmail.com,湖南省金凤市闵行王路Y座 742725,AB+,5533,0.461721,
2,赵玉兰,F,2013-10-04,junyin@gmail.com,贵州省南昌市新城邯郸路D座 163997,AB-,12026,0.490774,
12,芦丽娟,F,1914-03-02,xuping@yahoo.com,河北省张家港县清浦穆街K座 967410,AB+,15706,,
14,殷云,F,1912-07-14,xiuyingkong@hotmail.com,河南省旭市金平海口街B座 296646,AB-,7625,,
16,孙建平,M,1907-04-27,chaoxia@hotmail.com,浙江省浩县黄浦潜江街b座 139576,A+,13530,,
35,索桂芳,F,1918-04-25,ulu@gmail.com,西藏自治区潜江市沙湾宁德街L座 896541,AB+,12484,,
36,陈春梅,M,1919-09-12,ping43@yahoo.com,黑龍江省小红县梁平台北街q座 812231,AB-,7861,,
50,陈强,M,1917-07-06,minglu@gmail.com,河南省璐县新城欧路k座 988099,AB-,14550,,
55,徐敏,F,1919-12-09,shenxiuying@yahoo.com,浙江省淑兰市友好吴街p座 239027,AB+,19902,,


In [9]:
# Split email

def cal_split_mail(x_df):
    """Split the ``mail`` column on '@' into ``mail_prefix`` and ``mail_post``.

    The frame is modified in place (two columns are added or overwritten)
    and returned so the function can be used with ``DataFrame.pipe``.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Must contain a string column named ``mail``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``x_df``, with ``mail_prefix`` holding the text
        before the first '@' and ``mail_post`` holding the piece that
        follows it (up to the next '@', if any).
    """
    # Vectorised split instead of a row-wise apply that builds one Series per row.
    parts = x_df['mail'].str.split('@')
    # .str[i] yields NaN when the piece does not exist, so a value without '@'
    # (or a missing value) gives NaN instead of raising, and an empty frame
    # still gets both columns.
    x_df['mail_prefix'] = parts.str[0]
    x_df['mail_post'] = parts.str[1]
    return x_df

In [10]:
# Convert sex
def cal_convert_sex(x_df):
    """Translate the ``sex`` codes 'M' / 'F' to '男' / '女'.

    The frame is modified in place and returned for use with
    ``DataFrame.pipe``.

    Values that are not 'M' or 'F' are left unchanged rather than being
    replaced by NaN, so calling this twice on the same frame is harmless
    and unexpected codes stay visible instead of silently disappearing.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Must contain a ``sex`` column.

    Returns
    -------
    pandas.DataFrame
        The same object as ``x_df`` with ``sex`` translated.
    """
    mapping = {'M': '男', 'F': '女'}
    original = x_df['sex']
    # map() gives NaN for anything outside the mapping; fall back to the
    # original value there so already-converted or unknown codes survive.
    x_df['sex'] = original.map(mapping).fillna(original)
    return x_df

In [16]:
# Fill missing consume_per values from a per-mail-domain lookup
def cal_fill_cunsume_per(x_df, fill_map):
    """Fill missing ``consume_per`` values by looking up ``mail_post`` in ``fill_map``.

    Only rows whose ``consume_per`` is NaN are touched. A row whose
    ``mail_post`` is not a key of ``fill_map`` stays NaN. The frame is
    modified in place and returned for use with ``DataFrame.pipe``.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Must contain ``consume_per`` and ``mail_post`` columns.
    fill_map : mapping
        Replacement value keyed by ``mail_post``.

    Returns
    -------
    pandas.DataFrame
        The same object as ``x_df``.
    """
    missing = x_df['consume_per'].isna()
    x_df.loc[missing, 'consume_per'] = x_df.loc[missing, 'mail_post'].map(fill_map)
    return x_df

In [11]:
# Calculate consume 

def cal_cunsume_ifna(x_df):
    """Fill missing ``consume`` values with ``income * consume_per``.

    Rows that already have a ``consume`` value are left untouched. The
    frame is modified in place and returned for use with
    ``DataFrame.pipe``.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Must contain ``consume``, ``income`` and ``consume_per`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object as ``x_df``.
    """
    needs_value = x_df['consume'].isna()
    incomplete_rows = x_df.loc[needs_value]
    estimate = incomplete_rows['income'].mul(incomplete_rows['consume_per'])
    x_df.loc[needs_value, 'consume'] = estimate
    return x_df

In [19]:
# Calculate grading
def cal_grading(x_df):
    """Bucket ``consume`` into grades 1-4 and store them in a ``grade`` column.

    Grade boundaries (upper edges inclusive):

    * 1: 0 to 4000 (0 itself included)
    * 2: above 4000, up to 6000
    * 3: above 6000, up to 9000
    * 4: above 9000

    Negative or missing ``consume`` values get a missing grade. The frame
    is modified in place and returned for use with ``DataFrame.pipe``.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Must contain a numeric ``consume`` column.

    Returns
    -------
    pandas.DataFrame
        The same object as ``x_df`` with a categorical ``grade`` column.
    """
    # An infinite upper edge expresses "no upper limit" directly.
    bins = [0, 4000, 6000, 9000, float('inf')]
    # include_lowest=True closes the first bin on the left; without it a
    # consume of exactly 0 falls outside (0, 4000] and is left ungraded.
    x_df['grade'] = pd.cut(
        x_df['consume'],
        bins,
        labels=[1, 2, 3, 4],
        include_lowest=True,
    )
    return x_df

In [13]:
def load_cunsume_per_na_map():
    """Return a new dict of fallback ``consume_per`` values keyed by mail domain.

    The values are hard-coded here.
    NOTE(review): presumably per-domain averages computed elsewhere; confirm provenance.
    """
    fallback_by_domain = dict([
        ('gmail.com', 0.583810),
        ('hotmail.com', 0.557706),
        ('yahoo.com', 0.583721),
    ])
    return fallback_by_domain

In [14]:
# Lookup of fallback consume_per values; passed to cal_fill_cunsume_per as fill_map below
cp_na_map=load_cunsume_per_na_map()

In [20]:
# Build the enriched frame from a fresh load. The step order matters:
# each step reads columns produced or completed by the steps before it.
res=(
    load_df()
    # adds mail_prefix / mail_post; mail_post is the lookup key for the fill step
    .pipe(cal_split_mail)
    # rewrites the sex codes; independent of the other steps
    .pipe(cal_convert_sex)
    # fills NaN consume_per from cp_na_map, keyed by mail_post
    .pipe(cal_fill_cunsume_per,fill_map=cp_na_map)
    # fills NaN consume with income * consume_per, so it must follow the fill above
    .pipe(cal_cunsume_ifna)
    # buckets the completed consume column into the grade column
    .pipe(cal_grading)
)

res.head()

Unnamed: 0,name,sex,birthdate,mail,address,blood_group,income,consume_per,consume,mail_prefix,mail_post,grade
0,臧佳,男,1991-11-01,dongmin@gmail.com,青海省杭州县永川金街j座 967342,A+,14361,0.29858,4287.910847,dongmin,gmail.com,2
1,张佳,男,1930-01-13,sgao@gmail.com,湖南省金凤市闵行王路Y座 742725,AB+,5533,0.461721,2554.700068,sgao,gmail.com,1
2,赵玉兰,女,2013-10-04,junyin@gmail.com,贵州省南昌市新城邯郸路D座 163997,AB-,12026,0.490774,5902.04711,junyin,gmail.com,2
3,张晶,男,1990-06-25,xiulanhan@yahoo.com,广西壮族自治区南宁市滨城马路j座 296286,B-,12906,0.781134,10081.31337,xiulanhan,yahoo.com,4
4,傅俊,女,1981-11-27,juan95@hotmail.com,广东省银川市金平田街e座 241842,A+,14471,0.713496,10325.001343,juan95,hotmail.com,4


In [21]:
# Row count and mean consume for each grade
res.pivot_table(
    index='grade',
    values='consume',
    aggfunc=['count', 'mean'],
)

Unnamed: 0_level_0,count,mean
Unnamed: 0_level_1,consume,consume
grade,Unnamed: 1_level_2,Unnamed: 2_level_2
1,25,2384.738871
2,21,5085.039229
3,25,7602.430597
4,29,11785.295333


In [22]:
# Row count and mean consume for each grade, broken down by sex
res.pivot_table(
    index='grade',
    columns='sex',
    values='consume',
    aggfunc=['count', 'mean'],
)

Unnamed: 0_level_0,count,count,mean,mean
sex,女,男,女,男
grade,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,13,12,2461.127983,2301.984
2,13,8,5088.625983,5079.210753
3,10,15,7221.283826,7856.528445
4,19,10,11190.449085,12915.503206
