## 1.1 選取DataFrame的欄位

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 4, 'display.max_rows', 10, 'display.max_colwidth', 20)

In [108]:
movies = pd.read_csv('data/movie.csv')
movies['director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [3]:
movies.director_name

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [4]:
movies.loc[:, 'director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [5]:
movies.iloc[:, 1]

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [6]:
movies.iloc[[0,2],1]

0    James Cameron
2       Sam Mendes
Name: director_name, dtype: object

In [7]:
movies.iloc[0:2,1]

0     James Cameron
1    Gore Verbinski
Name: director_name, dtype: object

In [8]:
movies.iloc[[0,2],1:3]

Unnamed: 0,director_name,num_critic_for_reviews
0,James Cameron,723.0
2,Sam Mendes,602.0


In [9]:
movies.iloc[[0,2],[1,3]]

Unnamed: 0,director_name,duration
0,James Cameron,178.0
2,Sam Mendes,148.0


In [10]:
movies['director_name'].index

RangeIndex(start=0, stop=4916, step=1)

In [11]:
movies['director_name'].dtype

dtype('O')

In [12]:
movies['director_name'].size

4916

In [13]:
movies['director_name'].name

'director_name'

In [14]:
movies['director_name'].apply(type)

0         <class 'str'>
1         <class 'str'>
2         <class 'str'>
3         <class 'str'>
4         <class 'str'>
             ...       
4911      <class 'str'>
4912    <class 'float'>
4913      <class 'str'>
4914      <class 'str'>
4915      <class 'str'>
Name: director_name, Length: 4916, dtype: object

In [112]:
# unique only for series and return an one dim array
movies['director_name'].apply(type).unique()

array([<class 'str'>, <class 'float'>], dtype=object)

In [16]:
movies.director_name

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

## 1.2 呼叫Series的方法（method）

In [115]:
dir(pd.Series)

['T',
 '_AXIS_LEN',
 '_AXIS_NAMES',
 '_AXIS_NUMBERS',
 '_AXIS_ORDERS',
 '_AXIS_TO_AXIS_NUMBER',
 '_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__annotations__',
 '__array__',
 '__array_priority__',
 '__array_ufunc__',
 '__array_wrap__',
 '__bool__',
 '__class__',
 '__contains__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__finalize__',
 '__float__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__iand__',
 '__ifloordiv__',
 '__imod__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__int__',
 '__invert__',
 '__ior__',
 '__ipow__',
 '__isub__',
 '__iter__',
 '__itruediv__',
 '__ixor__',
 '__le__',
 '__len__',
 '__long__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__nonzero__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '_

In [18]:
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

422

In [19]:
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

437

In [20]:
len(s_attr_methods & df_attr_methods)

367

DataFrame 與 Series有376個共同的屬性、方法

In [21]:
movies = pd.read_csv('data/movie.csv')
director = movies['director_name']
fb_likes = movies['actor_1_facebook_likes']
director.dtype

dtype('O')

In [22]:
fb_likes.dtype

dtype('float64')

In [23]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [24]:
director.sample(n=5, random_state=42)

2347      Brian Percival
4687         Lucio Fulci
691        Phillip Noyce
3911       Sam Peckinpah
2488    Rowdy Herrington
Name: director_name, dtype: object

In [25]:
fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [26]:
director.value_counts()

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Ridley Scott        16
                    ..
John Putch           1
Luca Guadagnino      1
Sam Fell             1
Dan Fogelman         1
Daniel Hsia          1
Name: director_name, Length: 2397, dtype: int64

In [27]:
fb_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
          ... 
703.0        1
208.0        1
79.0         1
269.0        1
291.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

In [28]:
director.size

4916

In [29]:
director.shape

(4916,)

In [30]:
len(director)

4916

In [31]:
director.unique()

array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
       'Scott Smith', 'Benjamin Roberds', 'Daniel Hsia'], dtype=object)

In [32]:
director.count()

4814

In [33]:
fb_likes.count()

4909

In [34]:
fb_likes.min()

0.0

In [35]:
fb_likes.max()

640000.0

In [36]:
fb_likes.mean()

6494.488490527602

In [37]:
fb_likes.median()

982.0

In [38]:
fb_likes.std()

15106.986883848185

In [39]:
fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [40]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [41]:
fb_likes.quantile(.2)

510.0

In [42]:
fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
0.5      982.0
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, dtype: float64

In [43]:
director.isna()

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [44]:
fb_likes.size

4916

In [45]:
fb_likes.count()

4909

fb_likes有 7 個缺失值

In [46]:
fb_likes_filled = fb_likes.fillna(0)
fb_likes_filled.count()

4916

In [47]:
fb_likes_dropped = fb_likes.dropna()
fb_likes_dropped.size

4909

In [48]:
director.value_counts(normalize=True)

Steven Spielberg    0.005401
Woody Allen         0.004570
Martin Scorsese     0.004155
Clint Eastwood      0.004155
Ridley Scott        0.003324
                      ...   
John Putch          0.000208
Luca Guadagnino     0.000208
Sam Fell            0.000208
Dan Fogelman        0.000208
Daniel Hsia         0.000208
Name: director_name, Length: 2397, dtype: float64

In [49]:
director.value_counts(normalize=True).sum()

1.0

In [50]:
director.hasnans

True

In [51]:
director.notna()

0        True
1        True
2        True
3        True
4        True
        ...  
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

## 1.3 Series的相關操作

In [52]:
5 + 9    

14

In [53]:
movies = pd.read_csv('data/movie.csv')
imdb_score = movies['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [54]:
imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [55]:
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [56]:
imdb_score // 7

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [57]:
imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [58]:
director = movies['director_name']
director == 'James Cameron'

0        True
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [59]:
imdb_score.add(1)   

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [60]:
imdb_score.gt(7)  

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [61]:
money = pd.Series([100, 20, None])
money - 15

0    85.0
1     5.0
2     NaN
dtype: float64

In [62]:
money.sub(15, fill_value=0)

0    85.0
1     5.0
2   -15.0
dtype: float64

## 1.4 串連Series的方法

In [63]:
movies = pd.read_csv('data/movie.csv')
fb_likes = movies['actor_1_facebook_likes']
director = movies['director_name']

In [64]:
director.value_counts().head(3)

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Name: director_name, dtype: int64

In [65]:
print(fb_likes.isna().sum())
fb_likes.fillna(0)
fb_likes.isna().sum()

7


7

In [66]:
fb_likes.dtype

dtype('float64')

In [67]:
(fb_likes.fillna(0)
         .astype(int)
         .head()
)

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

In [68]:
(fb_likes.fillna(0)
         #.astype(int)
         #.head()
)

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64

In [69]:
(fb_likes.fillna(0)
         .astype(int)
         #.head()
)

0        1000
1       40000
2       11000
3       27000
4         131
        ...  
4911      637
4912      841
4913        0
4914      946
4915       86
Name: actor_1_facebook_likes, Length: 4916, dtype: int64

In [70]:
def debug_df(df):
    print("BEFORE")
    print(df)
    print("AFTER")
    return df

In [71]:
(fb_likes.fillna(0)
         .pipe(debug_df)
         .astype(int) 
         .head()
)

BEFORE
0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64
AFTER


0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

In [72]:
intermediate = None
def get_intermediate(df):
    global intermediate
    intermediate = df
    return df

In [73]:
res = (fb_likes.fillna(0)
         .pipe(get_intermediate)
         .astype(int) 
         .head()
)

intermediate

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64

In [74]:
fb_likes.fillna(0)\
        .astype(int)\
        .head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

## 1.5 更改欄位名稱

In [75]:
movies = pd.read_csv('data/movie.csv')

In [76]:
col_map = {'director_name':'Director Name'} 

In [77]:
movies.rename(columns=col_map).head()

Unnamed: 0,color,Director Name,...,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000
3,Color,Christopher Nolan,...,2.35,164000
4,,Doug Walker,...,,0


In [78]:
idx_map = {'Avatar':'Ratava', 
           'Spectre': 'Ertceps',
           "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
           'movie_facebook_likes': 'fblikes'}
(movies
   .set_index('movie_title')
   .rename(index=idx_map, columns=col_map)
   .head(3)
)


Unnamed: 0_level_0,color,director_name,...,aspect,fblikes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [79]:
movies = pd.read_csv('data/movie.csv', index_col='movie_title')
print(movies.index)
ids = movies.index.tolist()
# print(ids)
columns = movies.columns.tolist()
ids[0] = 'Ratava'
ids[1] = 'POC'
ids[2] = 'Ertceps'
columns[1] = 'director'
columns[-2] = 'aspect'
columns[-1] = 'fblikes'
movies.index = ids
movies.columns = columns

movies.head()

Index(['Avatar', 'Pirates of the Caribbean: At World's End', 'Spectre',
       'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens',
       'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron',
       'Harry Potter and the Half-Blood Prince',
       ...
       'Primer', 'Cavite', 'El Mariachi', 'The Mongol King', 'Newlyweds',
       'Signed Sealed Delivered', 'The Following', 'A Plague So Pleasant',
       'Shanghai Calling', 'My Date with Drew'],
      dtype='object', name='movie_title', length=4916)


Unnamed: 0,color,director,...,aspect,fblikes
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,...,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,...,,0


In [80]:
movies = pd.read_csv('data/movie.csv', index_col='movie_title')
movies.head(3)

Unnamed: 0_level_0,color,director_name,...,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,Color,James Cameron,...,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,...,2.35,0
Spectre,Color,Sam Mendes,...,2.35,85000


In [81]:
def to_clean(val):
    return val.strip().replace('_', '.')
movies.rename(columns=to_clean).head(3)

Unnamed: 0_level_0,color,director.name,...,aspect.ratio,movie.facebook.likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,Color,James Cameron,...,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,...,2.35,0
Spectre,Color,Sam Mendes,...,2.35,85000


In [82]:
cols = [col.strip().replace('_', '.')
        for col in movies.columns]
movies.columns = cols
movies.head(3)

Unnamed: 0_level_0,color,director.name,...,aspect.ratio,movie.facebook.likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,Color,James Cameron,...,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,...,2.35,0
Spectre,Color,Sam Mendes,...,2.35,85000


## 1.6 新增及刪除欄位

In [83]:
movies = pd.read_csv('data/movie.csv')
movies['has_seen'] = 0
movies.head(3)

Unnamed: 0,color,director_name,...,movie_facebook_likes,has_seen
0,Color,James Cameron,...,33000,0
1,Color,Gore Verbinski,...,0,0
2,Color,Sam Mendes,...,85000,0


In [84]:
col_map = {'aspect_ratio': 'aspect',
           'movie_facebook_likes': 'fblikes'}
(movies
   .rename(columns=col_map)
   .assign(has_seen=0)
)
movies.head(3)

Unnamed: 0,color,director_name,...,movie_facebook_likes,has_seen
0,Color,James Cameron,...,33000,0
1,Color,Gore Verbinski,...,0,0
2,Color,Sam Mendes,...,85000,0


In [85]:
total = (movies['actor_1_facebook_likes'] +
         movies['actor_2_facebook_likes'] + 
         movies['actor_3_facebook_likes'] + 
         movies['director_facebook_likes'])
total.head(5)

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4        NaN
dtype: float64

In [86]:
cols = ['actor_1_facebook_likes','actor_2_facebook_likes',
        'actor_3_facebook_likes','director_facebook_likes']
print(type(movies[cols]))
sum_col = movies[cols].sum(axis='columns')
sum_col.head(5)

<class 'pandas.core.frame.DataFrame'>


0     2791.0
1    46563.0
2    11554.0
3    95000.0
4      274.0
dtype: float64

In [87]:
movies.assign(total_likes=sum_col).head(5)

Unnamed: 0,color,director_name,...,has_seen,total_likes
0,Color,James Cameron,...,0,2791.0
1,Color,Gore Verbinski,...,0,46563.0
2,Color,Sam Mendes,...,0,11554.0
3,Color,Christopher Nolan,...,0,95000.0
4,,Doug Walker,...,0,274.0


In [88]:
def sum_likes(df):
    return df[[c for c in df.columns
               if 'like' in c]].sum(axis=1)
movies.assign(total_likes=sum_likes).head(5)

Unnamed: 0,color,director_name,...,has_seen,total_likes
0,Color,James Cameron,...,0,40625.0
1,Color,Gore Verbinski,...,0,94913.0
2,Color,Sam Mendes,...,0,108254.0
3,Color,Christopher Nolan,...,0,365759.0
4,,Doug Walker,...,0,417.0


In [89]:
(movies
   .assign(total_likes=sum_col)
   ['total_likes']
   .isna()
   .sum()
)

0

In [90]:
(movies
   .assign(total_likes=total)
   ['total_likes']
   .isna()
   .sum()
)

122

In [91]:
def cast_like_gt_actor_director(df):
    return df['cast_total_facebook_likes'] >= df['total_likes']

df2 = (movies.assign(total_likes=total,
                     is_cast_likes_more = cast_like_gt_actor_director)
      )
df2

Unnamed: 0,color,director_name,...,total_likes,is_cast_likes_more
0,Color,James Cameron,...,2791.0,True
1,Color,Gore Verbinski,...,46563.0,True
2,Color,Sam Mendes,...,11554.0,True
3,Color,Christopher Nolan,...,95000.0,True
4,,Doug Walker,...,,False
...,...,...,...,...,...
4911,Color,Scott Smith,...,1427.0,True
4912,Color,,...,,False
4913,Color,Benjamin Roberds,...,0.0,True
4914,Color,Daniel Hsia,...,2154.0,True


In [92]:
df2['is_cast_likes_more'].all()

False

In [93]:
df2 = df2.drop(columns='total_likes')

In [94]:
actor_sum = (movies[[c for c in movies.columns if 'actor_' in c and '_likes' in c]]
             .sum(axis='columns'))

actor_sum.head(5)

0     2791.0
1    46000.0
2    11554.0
3    73000.0
4      143.0
dtype: float64

In [95]:
movies['cast_total_facebook_likes'] >= actor_sum

0       True
1       True
2       True
3       True
4       True
        ... 
4911    True
4912    True
4913    True
4914    True
4915    True
Length: 4916, dtype: bool

In [96]:
movies['cast_total_facebook_likes'].ge(actor_sum)

0       True
1       True
2       True
3       True
4       True
        ... 
4911    True
4912    True
4913    True
4914    True
4915    True
Length: 4916, dtype: bool

In [97]:
movies['cast_total_facebook_likes'].ge(actor_sum).all()

True

In [98]:
pct_like = (actor_sum
            .div(movies['cast_total_facebook_likes'])
)

In [99]:
pct_like.describe()

count    4883.000000
mean        0.833279
std         0.140566
min         0.300767
25%         0.735284
50%         0.869289
75%         0.954774
max         1.000000
dtype: float64

In [100]:
pd.Series(pct_like.values,
          index=movies['movie_title'].values).head()

Avatar                                        0.577369
Pirates of the Caribbean: At World's End      0.951396
Spectre                                       0.987521
The Dark Knight Rises                         0.683783
Star Wars: Episode VII - The Force Awakens    1.000000
dtype: float64

In [101]:
profit_index = movies.columns.get_loc('gross') + 1
profit_index

9

In [102]:
movies.insert(loc=profit_index,
              column='profit',
              value=movies['gross'] - movies['budget'])

In [103]:
del movies['director_name']