# 数据清理

In [2]:
import pandas as pd
import numpy as np

## 用stack清理变量值作为列名

In [6]:
state_fruit = pd.read_csv('state_fruit.csv', index_col=0)
state_fruit

Unnamed: 0,Apple,Orange,Banana
Texas,12,10,40
Arizona,9,7,12
Florida,0,14,190


In [7]:
# stack 方法可以将所有列名，转变为垂直的一级行索引
state_fruit.stack()

Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

In [8]:
# 使用reset_index()， 将结果变为dataframe
state_fruit_tidy = state_fruit.stack().reset_index()
state_fruit_tidy

Unnamed: 0,level_0,level_1,0
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [9]:
# 重命名列名
state_fruit_tidy.columns = ['state', 'fruit', 'weight']
state_fruit_tidy

Unnamed: 0,state,fruit,weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [10]:
# 也可以使用rename_axis给不同的行索引层级命名
state_fruit.stack() .rename_axis(['state', 'fruit'])

state    fruit 
Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

In [12]:
state_fruit.stack() .rename_axis(['state', 'fruit']).index

MultiIndex([(  'Texas',  'Apple'),
            (  'Texas', 'Orange'),
            (  'Texas', 'Banana'),
            ('Arizona',  'Apple'),
            ('Arizona', 'Orange'),
            ('Arizona', 'Banana'),
            ('Florida',  'Apple'),
            ('Florida', 'Orange'),
            ('Florida', 'Banana')],
           names=['state', 'fruit'])

In [11]:
state_fruit.stack().rename_axis(['state', 'fruit']).reset_index(name='weight')  
# reset_index()：这会将当前的多层索引（state 和 fruit）转化为普通的列，并且重新生成一个默认的整数索引。
# 并会把原来的数据值（即 stack() 操作后的数据）放到新的列中，并命名为 'weight'

Unnamed: 0,state,fruit,weight
0,Texas,Apple,12
1,Texas,Orange,10
2,Texas,Banana,40
3,Arizona,Apple,9
4,Arizona,Orange,7
5,Arizona,Banana,12
6,Florida,Apple,0
7,Florida,Orange,14
8,Florida,Banana,190


In [13]:
state_fruit2 = pd.read_csv('state_fruit2.csv')
state_fruit2

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [14]:
# state 不在行索引的位置上，使用stack将所有列名变为一个长series
state_fruit2.stack()

0  State       Texas
   Apple          12
   Orange         10
   Banana         40
1  State     Arizona
   Apple           9
   Orange          7
   Banana         12
2  State     Florida
   Apple           0
   Orange         14
   Banana        190
dtype: object

In [15]:
# 先设置state作为行索引名，再stack，可以得到和前面相似的结果
state_fruit2.set_index('State').stack()

State          
Texas    Apple      12
         Orange     10
         Banana     40
Arizona  Apple       9
         Orange      7
         Banana     12
Florida  Apple       0
         Orange     14
         Banana    190
dtype: int64

## 用melt清理变量值作为列名

In [16]:
# Reload the raw table under the name that the melt cells below operate on.
state_fruit2 = pd.read_csv('state_fruit2.csv')
state_fruit2

Unnamed: 0,State,Apple,Orange,Banana
0,Texas,12,10,40
1,Arizona,9,7,12
2,Florida,0,14,190


In [17]:
# 使用melt方法，将列传给id_vars和value_vars。melt可以将原先的列名作为变量，原先的值作为值
state_fruit2.melt(id_vars=['State'],  value_vars=['Apple', 'Orange', 'Banana'])

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [18]:
# 随意设定一个行索引
state_fruit2.index=list('abc')
state_fruit2.index.name = 'letter'
state_fruit2

Unnamed: 0_level_0,State,Apple,Orange,Banana
letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,Texas,12,10,40
b,Arizona,9,7,12
c,Florida,0,14,190


In [19]:
# id_vars 参数指定了 State列 在转换后将作为标识列保留
# value_vars 参数指定了 'Apple', 'Orange', 'Banana' 是需要被转化为长格式的数据列
# var_name 参数指定了转换后新生成的列的名称 Fruit
# value_name 参数指定了新生成的列的名称 Weight
state_fruit2.melt(id_vars=['State'],  value_vars=['Apple', 'Orange', 'Banana'],  var_name='Fruit',  value_name='Weight')

Unnamed: 0,State,Fruit,Weight
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


In [20]:
# 直接使用melt，将值放一列，标签放一列
state_fruit2.melt()

Unnamed: 0,variable,value
0,State,Texas
1,State,Arizona
2,State,Florida
3,Apple,12
4,Apple,9
5,Apple,0
6,Orange,10
7,Orange,7
8,Orange,14
9,Banana,40


In [21]:
# 要指明id变量，只需使用id_vars参数
state_fruit2.melt(id_vars='State')

Unnamed: 0,State,variable,value
0,Texas,Apple,12
1,Arizona,Apple,9
2,Florida,Apple,0
3,Texas,Orange,10
4,Arizona,Orange,7
5,Florida,Orange,14
6,Texas,Banana,40
7,Arizona,Banana,12
8,Florida,Banana,190


## 同时stack多组变量

In [4]:
movie = pd.read_csv('movie (1).csv')
actor = movie[['movie_title',
                'actor_1_name', 'actor_2_name', 'actor_3_name',
                'actor_1_facebook_likes',
                'actor_2_facebook_likes',
                'actor_3_facebook_likes']]
actor.head()

Unnamed: 0,movie_title,actor_1_name,actor_2_name,actor_3_name,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [5]:
# 创建一个自定义函数，用来改变列名，wide_to_long要求分组的变量要有相同的数字结尾
def change_col_name(col_name):
    """Rename an actor column so that its number becomes the trailing suffix.

    The '_name' fragment is removed first ('actor_1_name' -> 'actor_1').
    For columns containing 'facebook', the text between the first five
    characters and the '_facebook...' tail is moved to the end
    ('actor_1_facebook_likes' -> 'actor_facebook_likes_1').
    Any other column name is returned with only the '_name' removal applied.
    """
    renamed = col_name.replace('_name', '')
    fb_start = renamed.find('facebook')
    if fb_start == -1:
        # No 'facebook' fragment: nothing to reorder.
        return renamed

    # Position of the underscore that precedes 'facebook'.
    tail_start = fb_start - 1
    stub = renamed[:5]                # 'actor' for the columns used here
    number = renamed[5:tail_start]    # e.g. '_1'
    tail = renamed[tail_start:]       # e.g. '_facebook_likes'
    return stub + tail + number

actor2 = actor.rename(columns=change_col_name)
actor2.head()

Unnamed: 0,movie_title,actor_1,actor_2,actor_3,actor_facebook_likes_1,actor_facebook_likes_2,actor_facebook_likes_3
0,Avatar,CCH Pounder,Joel David Moore,Wes Studi,1000.0,936.0,855.0
1,Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport,40000.0,5000.0,1000.0
2,Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman,11000.0,393.0,161.0
3,The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,27000.0,23000.0,23000.0
4,Star Wars: Episode VII - The Force Awakens,Doug Walker,Rob Walker,,131.0,12.0,


In [6]:
# Use wide_to_long() (wide format -> long format) to stack the two column
# groups, actor and actor_facebook_likes, at the same time.
stubs = ['actor', 'actor_facebook_likes']
# stubnames=stubs: the list of column-name prefixes that identify each group
# i=['movie_title']: the identifier column(s) to keep for every row
# j='actor_num': name of the new column holding the number taken from the wide column names
# sep='_': the separator between the prefix and the number
actor2_tidy = pd.wide_to_long(actor2, stubnames = stubs, i = ['movie_title'], j='actor_num', sep='_').reset_index()

actor2_tidy

Unnamed: 0,movie_title,actor_num,actor,actor_facebook_likes
0,Avatar,1,CCH Pounder,1000.0
1,Pirates of the Caribbean: At World's End,1,Johnny Depp,40000.0
2,Spectre,1,Christoph Waltz,11000.0
3,The Dark Knight Rises,1,Tom Hardy,27000.0
4,Star Wars: Episode VII - The Force Awakens,1,Doug Walker,131.0
...,...,...,...,...
14743,Signed Sealed Delivered,3,Crystal Lowe,318.0
14744,The Following,3,Sam Underwood,319.0
14745,A Plague So Pleasant,3,David Chandler,0.0
14746,Shanghai Calling,3,Eliza Coupe,489.0


In [7]:
df = pd.read_csv('stackme.csv')
df

Unnamed: 0,State,Country,a1,b2,Test,d,e
0,TX,US,0.45,0.3,Test1,2,6
1,MA,US,0.03,1.2,Test2,9,7
2,ON,CAN,0.7,4.2,Test3,4,2


In [8]:
df2 = df.rename(columns = {'a1':'group1_a1', 'b2':'group1_b2', 'd':'group2_a1', 'e':'group2_b2'})
df2.columns

Index(['State', 'Country', 'group1_a1', 'group1_b2', 'Test', 'group2_a1',
       'group2_b2'],
      dtype='object')

In [9]:
# `suffix` is a regular expression matched against the part of each column name
# that follows the stub and the separator. '.+' accepts any non-empty text, so
# the non-numeric labels 'a1' and 'b2' end up in the new 'Label' level.
pd.wide_to_long(df2, stubnames=['group1', 'group2'], i=['State', 'Country', 'Test'], j='Label', suffix='.+', sep='_')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,group1,group2
State,Country,Test,Label,Unnamed: 4_level_1,Unnamed: 5_level_1
TX,US,Test1,a1,0.45,2
TX,US,Test1,b2,0.3,6
MA,US,Test2,a1,0.03,9
MA,US,Test2,b2,1.2,7
ON,CAN,Test3,a1,0.7,4
ON,CAN,Test3,b2,4.2,2


## 反转stacked数据

In [4]:
usecol_func = lambda x: 'ugds_' in x or x == 'instnm'
college = pd.read_csv('college.csv', index_col='instnm', usecols=usecol_func)  # usecols=usecol_func参数用于选择要加载的列
college.head()

Unnamed: 0_level_0,ugds_white,ugds_black,ugds_hisp,ugds_asian,ugds_aian,ugds_nhpi,ugds_2mor,ugds_nra,ugds_unkn
instnm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [5]:
college_stacked = college.stack()
college_stacked.head(18)

instnm                                         
Alabama A & M University             ugds_white    0.0333
                                     ugds_black    0.9353
                                     ugds_hisp     0.0055
                                     ugds_asian    0.0019
                                     ugds_aian     0.0024
                                     ugds_nhpi     0.0019
                                     ugds_2mor     0.0000
                                     ugds_nra      0.0059
                                     ugds_unkn     0.0138
University of Alabama at Birmingham  ugds_white    0.5922
                                     ugds_black    0.2600
                                     ugds_hisp     0.0283
                                     ugds_asian    0.0518
                                     ugds_aian     0.0022
                                     ugds_nhpi     0.0007
                                     ugds_2mor     0.0368
                        

In [6]:
college_stacked.unstack().head()

Unnamed: 0_level_0,ugds_white,ugds_black,ugds_hisp,ugds_asian,ugds_aian,ugds_nhpi,ugds_2mor,ugds_nra,ugds_unkn
instnm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [7]:
college2 = pd.read_csv('college.csv', usecols=usecol_func)
college2.head()

Unnamed: 0,instnm,ugds_white,ugds_black,ugds_hisp,ugds_asian,ugds_aian,ugds_nhpi,ugds_2mor,ugds_nra,ugds_unkn
0,Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
1,University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
2,Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
3,University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
4,Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [8]:
college_melted = college2.melt(id_vars='instnm', var_name='Race', value_name='Percentage')
college_melted.head()

Unnamed: 0,instnm,Race,Percentage
0,Alabama A & M University,ugds_white,0.0333
1,University of Alabama at Birmingham,ugds_white,0.5922
2,Amridge University,ugds_white,0.299
3,University of Alabama in Huntsville,ugds_white,0.6988
4,Alabama State University,ugds_white,0.0158


In [9]:
# 用 pivot 还原
melted_inv = college_melted.pivot(index='instnm', columns='Race', values='Percentage')
melted_inv.head()

Race,ugds_2mor,ugds_aian,ugds_asian,ugds_black,ugds_hisp,ugds_nhpi,ugds_nra,ugds_unkn,ugds_white
instnm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A & W Healthcare Educators,0.0,0.0,0.0,0.975,0.025,0.0,0.0,0.0,0.0
A T Still University of Health Sciences,,,,,,,,,
ABC Beauty Academy,0.0,0.0,0.9333,0.0333,0.0333,0.0,0.0,0.0,0.0
ABC Beauty College Inc,0.0,0.0,0.0,0.6579,0.0526,0.0,0.0,0.0,0.2895
AI Miami International University of Art and Design,0.0018,0.0,0.0018,0.0198,0.4773,0.0,0.0025,0.4644,0.0324


In [None]:
# pivot() sorted both the rows and the columns, so select them back in the
# original order of college2 before turning the index into a column again.
# The institution column is lowercase ('instnm') in this file.
college2_replication = melted_inv.loc[college2['instnm'], college2.columns[1:]].reset_index()
college2.equals(college2_replication)

In [None]:
college.stack().unstack(0)

In [None]:
college.T

## 分组聚合后unstacking

In [None]:
employee = pd.read_csv('employee.csv')
employee.groupby('RACE')['BASE_SALARY'].mean().astype(int)

In [None]:
agg = employee.groupby(['RACE', 'GENDER'])['BASE_SALARY'].mean().astype(int)
agg

In [None]:
agg.unstack('GENDER')

In [None]:
agg.unstack('RACE')

In [None]:
agg2 = employee.groupby(['RACE', 'GENDER'])['BASE_SALARY'].agg(['mean', 'max', 'min']).astype(int)
agg2.reset_index()

In [None]:
agg2.unstack('GENDER')

## 用分组聚合实现透视表

In [None]:
flights = pd.read_csv('flights.csv')
flights.head()

In [None]:
fp = flights.pivot_table(index='AIRLINE', columns='ORG_AIR', values='CANCELLED', aggfunc='sum', fill_value=0).round(2)
fp.head()

In [None]:
fg = flights.groupby(['AIRLINE', 'ORG_AIR'])['CANCELLED'].sum()
fg.head()


In [None]:
fg_unstack = fg.unstack('ORG_AIR', fill_value=0)
fg_unstack.head()

In [None]:
fg_unstack = fg.unstack('ORG_AIR', fill_value=0)
fp.equals(fg_unstack)

In [None]:
# Pass the aggregations by name, the same way the groupby version of this
# pivot table does with .agg(['mean', 'sum']).
fp2 = flights.pivot_table(index=['AIRLINE', 'MONTH'], columns=['ORG_AIR', 'CANCELLED'],
                          values=['DEP_DELAY', 'DIST'], aggfunc=['mean', 'sum'], fill_value=0)
fp2.head()

In [None]:
# Reproduce the pivot table with groupby: aggregate both value columns, move
# ORG_AIR and CANCELLED into the columns, then swap the two outermost column
# levels. Several columns must be selected with a list ([[...]]); selecting
# them with a bare tuple is rejected by current pandas.
flights.groupby(['AIRLINE', 'MONTH', 'ORG_AIR', 'CANCELLED'])[['DEP_DELAY', 'DIST']] \
        .agg(['mean', 'sum']).unstack(['ORG_AIR', 'CANCELLED'], fill_value=0) \
        .swaplevel(0, 1, axis='columns') \
        .head()

## 为了更容易reshaping，重新命名索引层

In [None]:
# NOTE(review): the earlier cells read 'college.csv' from the working directory
# and show lowercase column names; confirm that this file exists under data/
# and really has the upper-case columns used below.
college = pd.read_csv('data/college.csv')

# Select the two value columns with a list; tuple-style selection on a
# groupby object is rejected by current pandas.
cg = college.groupby(['STABBR', 'RELAFFIL'])[['UGDS', 'SATMTMID']] \
            .agg(['count', 'min', 'max']).head(6)
cg

In [None]:
cg = cg.rename_axis(['AGG_COLS', 'AGG_FUNCS'], axis='columns')
cg

In [None]:
cg.stack('AGG_FUNCS').head()

In [None]:
cg.stack('AGG_FUNCS').swaplevel('AGG_FUNCS', 'STABBR', axis='index').head()

In [None]:
cg.stack('AGG_FUNCS')\
  .swaplevel('AGG_FUNCS', 'STABBR',axis='index') \
  .sort_index(level='RELAFFIL', axis='index') \
  .sort_index(level='AGG_COLS', axis='columns').head(6)

In [None]:
cg.stack('AGG_FUNCS').unstack(['RELAFFIL', 'STABBR'])

In [None]:
cg.stack(['AGG_FUNCS', 'AGG_COLS']).head(12)

In [None]:
cg.rename_axis([None, None], axis='index').rename_axis([None, None], axis='columns')

## 当多个变量被存储为列名时进行清理

In [None]:
weightlifting = pd.read_csv('data/weightlifting_men.csv')
weightlifting

In [None]:
wl_melt = weightlifting.melt(id_vars='Weight Category', var_name='sex_age', value_name='Qual Total')
wl_melt.head()

In [None]:
sex_age = wl_melt['sex_age'].str.split(expand=True)
sex_age.head()

In [None]:
sex_age.columns = ['Sex', 'Age Group']
sex_age.head()

In [None]:
sex_age['Sex'] = sex_age['Sex'].str[0]
sex_age.head()

In [None]:
# Column labels must match the melted frame exactly: 'Weight Category' was the
# id column passed to melt and 'Qual Total' was its value_name.
wl_cat_total = wl_melt[['Weight Category', 'Qual Total']]
wl_tidy = pd.concat([sex_age, wl_cat_total], axis='columns')
wl_tidy.head()

In [None]:
# Alternative to concat: copy the two columns straight onto sex_age.
# The labels must be spelled exactly as they appear in wl_melt.
cols = ['Weight Category', 'Qual Total']
sex_age[cols] = wl_melt[cols]

In [None]:
# Pull the age group (two digits, '-' or '+', optionally two more digits) out
# of the combined label. A raw string keeps '\d' from being treated as an
# invalid string escape.
age_group = wl_melt.sex_age.str.extract(r'(\d{2}[-+](?:\d{2})?)', expand=False)
# The first character of the label is kept as the sex code.
sex = wl_melt.sex_age.str[0]
new_cols = {'Sex':sex, 'Age Group': age_group}

In [None]:
wl_tidy2 = wl_melt.assign(**new_cols).drop('sex_age', axis='columns')
wl_tidy2.head()

In [None]:
wl_tidy2.sort_index(axis=1).equals(wl_tidy.sort_index(axis=1))

## 当多个变量被存储为列的值时进行清理

In [None]:
inspections = pd.read_csv('data/restaurant_inspections.csv', parse_dates=['Date'])
inspections.head(10)

In [None]:
inspections.pivot(index=['Name', 'Date'], columns='Info', values='Value')

In [None]:
inspections.set_index(['Name','Date', 'Info']).head(10)

In [None]:
inspections.set_index(['Name','Date', 'Info']).unstack('Info').head()

In [None]:
insp_tidy = inspections.set_index(['Name','Date', 'Info'])\
                        .unstack('Info') \
                        .rename_axis(columns=None) \
                        .reset_index(col_level=-1)
insp_tidy.head()

In [None]:
insp_tidy.columns = insp_tidy.columns.droplevel(0).rename(None)
insp_tidy.head()

In [None]:
inspections.set_index(['Name','Date','Info']) \
            .squeeze() \
            .unstack('Info') \
            .reset_index() \
            .rename_axis(None, axis='columns')

In [None]:
inspections.pivot_table(index=['Name','Date'], columns='Info', values='Value', aggfunc='first') \
            .reset_index()\
            .rename_axis(None, axis='columns')

## 当两个或多个值存储于一个单元格时进行清理

In [None]:
cities = pd.read_csv('data/texas_cities.csv')
cities

In [None]:
# Split each Geolocation string on "any character followed by a space" into
# four pieces, then label them. The two direction columns are named alike.
geolocations = cities.Geolocation.str.split(pat='. ', expand=True)
geolocations.columns = ['latitude', 'latitude direction', 'longitude', 'longitude direction']
geolocations

In [None]:
geolocations = geolocations.astype({'latitude':'float', 'longitude':'float'})
geolocations.dtypes

In [None]:
cities_tidy = pd.concat([cities['City'], geolocations], axis='columns')
cities_tidy

In [None]:
# Try to convert every column to a numeric dtype; with errors='ignore' a column
# that cannot be parsed is returned unchanged.
# NOTE(review): errors='ignore' is deprecated in recent pandas releases -
# confirm against the installed version before relying on it.
temp = geolocations.apply(pd.to_numeric, errors='ignore')
temp

In [None]:
temp.dtypes

In [None]:
cities.Geolocation.str.split(pat='° |,', expand=True)

In [None]:
cities.Geolocation.str.extract('([0-9.]+). (N|S), ([0-9.]+). (E|W)', expand=True)

## 当多个变量被存储为列名和列值时进行清理

In [None]:
sensors = pd.read_csv('data/sensors.csv')
sensors

In [None]:
sensors.melt(id_vars=['Group', 'Property'], var_name='Year').head(6)

In [None]:
sensors.melt(id_vars=['Group', 'Property'], var_name='Year') \
        .pivot_table(index=['Group', 'Year'], columns='Property', values='value') \
        .reset_index() \
        .rename_axis(None, axis='columns')

In [None]:
sensors.set_index(['Group', 'Property']) \
        .stack() \
        .unstack('Property') \
        .rename_axis(['Group', 'Year'], axis='index') \
        .rename_axis(None, axis='columns') \
        .reset_index()

## 当多个观察单位被存储于同一张表时进行清理

In [None]:
movie = pd.read_csv('movie_altered.csv')
movie.head()

In [None]:
movie.insert(0, 'id', np.arange(len(movie)))
movie.head()

In [None]:
stubnames = ['director', 'director_fb_likes', 'actor', 'actor_fb_likes']
movie_long = pd.wide_to_long(movie, stubnames=stubnames, i='id', j='num', sep='_').reset_index()
movie_long['num'] = movie_long['num'].astype(int)
movie_long.head(9)

In [None]:
movie_table = movie_long[['id','title', 'year', 'duration', 'rating']]
director_table = movie_long[['id', 'director', 'num', 'director_fb_likes']]
actor_table = movie_long[['id', 'actor', 'num', 'actor_fb_likes']]
movie_table.head(9)

In [None]:
director_table.head(9)

In [None]:
actor_table.head(9)

In [None]:
movie_table = movie_table.drop_duplicates().reset_index(drop=True)
director_table = director_table.dropna().reset_index(drop=True)
actor_table = actor_table.dropna().reset_index(drop=True)
movie_table.head()

In [None]:
director_table.head()

In [None]:
movie.memory_usage(deep=True).sum()

In [None]:
movie_table.memory_usage(deep=True).sum() + \
director_table.memory_usage(deep=True).sum() +\
actor_table.memory_usage(deep=True).sum()

In [None]:
director_cat = pd.Categorical(director_table['director'])
director_table.insert(1,'director_id', director_cat.codes)
actor_cat = pd.Categorical(actor_table['actor'])
actor_table.insert(1, 'actor_id', actor_cat.codes)
director_table.head()

In [None]:
actor_table.head()

In [None]:
director_associative = director_table[['id', 'director_id', 'num']]
dcols = ['director_id', 'director', 'director_fb_likes']
director_unique = director_table[dcols].drop_duplicates().reset_index(drop=True)
director_associative.head()

In [None]:
director_unique.head()

In [None]:
actor_associative = actor_table[['id', 'actor_id', 'num']]
acols = ['actor_id', 'actor', 'actor_fb_likes']
actor_unique = actor_table[acols].drop_duplicates().reset_index(drop=True)
actor_associative.head()

In [None]:
actor_unique.head()

In [None]:
movie_table.memory_usage(deep=True).sum() + \
director_associative.memory_usage(deep=True).sum() + \
director_unique.memory_usage(deep=True).sum() + \
actor_associative.memory_usage(deep=True).sum()+ \
actor_unique.memory_usage(deep=True).sum()

In [None]:
movie_table.head()

In [None]:
# Rebuild the wide actor columns: join the per-actor attributes back onto each
# (movie id, slot) pair, drop the surrogate key, and spread the slot number
# ('num') across the columns. The key is dropped with columns=...; a positional
# axis argument to drop() is rejected by current pandas.
actors = actor_associative.merge(actor_unique, on='actor_id') \
                            .drop(columns='actor_id') \
                            .pivot_table(index='id', columns='num', aggfunc='first')

# Flatten the two column levels into single names of the form '<field>_<num>'.
actors.columns = actors.columns.get_level_values(0) + '_' + \
                 actors.columns.get_level_values(1).astype(str)

# Same reconstruction for the directors.
directors = director_associative.merge(director_unique, on='director_id') \
                                .drop(columns='director_id') \
                                .pivot_table(index='id', columns='num', aggfunc='first')

directors.columns = directors.columns.get_level_values(0) + '_' + \
                    directors.columns.get_level_values(1).astype(str)

In [None]:
actors.head()

In [None]:
directors.head()

In [None]:
# Left-join the wide director and actor columns back onto the movie table,
# each exactly once. Merging `actors` a second time would create '_x'/'_y'
# suffixed duplicates that no longer match the original column names.
movie2 = movie_table.merge(directors.reset_index(), on='id', how='left') \
                    .merge(actors.reset_index(), on='id', how='left')
movie2.head()

In [None]:
movie.equals(movie2[movie.columns])

In [5]:
532+18

550