# Chapter 1: Pandas 기초

In [2]:
import pandas as pd
import numpy as np

## 소개

## DataFrame 구조 분석

In [3]:
movies = pd.read_csv('data/movie.csv')         # csv파일을 읽어온다
pd.set_option('max_columns', 5, 'max_rows', 10)   # columns과 row의 최대크기를 제한한다
type(movies)

pandas.core.frame.DataFrame

In [283]:
movies.head()   # 기본설정값은 5이다

Unnamed: 0,color,director_name,...,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000
3,Color,Christopher Nolan,...,2.35,164000
4,,Doug Walker,...,,0


### DataFrame 속성

![image.png](attachment:image.png)
여러개의 Series로 구성된 2차원배열

In [4]:
movies = pd.read_csv('data/movie.csv')
columns = movies.columns
index = movies.index        
data = movies.values

In [5]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [6]:
index     

RangeIndex(start=0, stop=4916, step=1)

In [7]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [8]:
type(index)     # python의 range와 타입이 같다

pandas.core.indexes.range.RangeIndex

In [9]:
type(columns)

pandas.core.indexes.base.Index

In [10]:
type(data)

numpy.ndarray

In [11]:
issubclass(pd.RangeIndex, pd.Index)

True

In [12]:
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915], dtype=int64)

In [13]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

## 데이터 형식 이해

In [17]:
movies = pd.read_csv('data/movie.csv')

In [18]:
movies.dtypes     # p33 int와 bool은 결측치를 지원하지않는다.

color                       object
director_name               object
num_critic_for_reviews     float64
duration                   float64
director_facebook_likes    float64
                            ...   
title_year                 float64
actor_2_facebook_likes     float64
imdb_score                 float64
aspect_ratio               float64
movie_facebook_likes         int64
Length: 28, dtype: object

In [19]:
movies.dtypes.value_counts()

float64    13
object     12
int64       3
dtype: int64

In [20]:
movies.info()    # info - 데이터 타입과 널값이 아닌 값의 갯수를 확인할수있다

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4916 entries, 0 to 4915
Data columns (total 28 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   color                      4897 non-null   object 
 1   director_name              4814 non-null   object 
 2   num_critic_for_reviews     4867 non-null   float64
 3   duration                   4901 non-null   float64
 4   director_facebook_likes    4814 non-null   float64
 5   actor_3_facebook_likes     4893 non-null   float64
 6   actor_2_name               4903 non-null   object 
 7   actor_1_facebook_likes     4909 non-null   float64
 8   gross                      4054 non-null   float64
 9   genres                     4916 non-null   object 
 10  actor_1_name               4909 non-null   object 
 11  movie_title                4916 non-null   object 
 12  num_voted_users            4916 non-null   int64  
 13  cast_total_facebook_likes  4916 non-null   int64

In [21]:
pd.Series(['Paul', np.nan, 'George']).dtype   # Series의 타입을 확인할수 있다 

dtype('O')

## 열선택

In [22]:
movies = pd.read_csv('data/movie.csv')
movies['director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [23]:
movies.director_name    # 자동완성 가능
                        # 속성이름과 중복되면안된다, 공백이들어가면 안되며 첫글자로 숫자로를 사용하면안된다

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [24]:
movies.loc[:, 'director_name']         # 열이름 으로 검색가능

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [25]:
movies.iloc[:, 1]    # 위치로 검색가능

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [26]:
movies['director_name'].index

RangeIndex(start=0, stop=4916, step=1)

In [27]:
movies['director_name'].dtype

dtype('O')

In [28]:
movies['director_name'].size

4916

In [29]:
movies.iloc[:,3].name

'duration'

In [30]:
movies['director_name'].name

'director_name'

In [31]:
type(movies['director_name'])

pandas.core.series.Series

In [32]:
movies['director_name'].apply(type)    #apply메소드 안에 type함수를 사용하여 column의 각각의 타입을 확인할수있다

0         <class 'str'>
1         <class 'str'>
2         <class 'str'>
3         <class 'str'>
4         <class 'str'>
             ...       
4911      <class 'str'>
4912    <class 'float'>
4913      <class 'str'>
4914      <class 'str'>
4915      <class 'str'>
Name: director_name, Length: 4916, dtype: object

## Series 메소드 호출

In [33]:
s_attr_methods = set(dir(pd.Series))
len(s_attr_methods)

421

In [34]:
df_attr_methods = set(dir(pd.DataFrame))
len(df_attr_methods)

432

In [35]:
len(s_attr_methods & df_attr_methods)    #DateFrame과 Series형식은 공유하는 메소드가 많다

365

In [37]:
movies = pd.read_csv('data/movie.csv')
director = movies['director_name']
fb_likes = movies['actor_1_facebook_likes']

In [38]:
director.dtype

dtype('O')

In [39]:
fb_likes.dtype

dtype('float64')

In [40]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [41]:
director.sample(n=5, random_state=42)      # 매게변수 n을 통해 검색하고자 하는 갯수 설정, random_state - 시드

2347      Brian Percival
4687         Lucio Fulci
691        Phillip Noyce
3911       Sam Peckinpah
2488    Rowdy Herrington
Name: director_name, dtype: object

In [42]:
fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [43]:
director.value_counts()     # 중복되는 값들의 개수를 파악하는 메소드

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Clint Eastwood      20
Ridley Scott        16
                    ..
John Putch           1
Luca Guadagnino      1
Sam Fell             1
Dan Fogelman         1
Daniel Hsia          1
Name: director_name, Length: 2397, dtype: int64

In [44]:
fb_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
12000.0    131
          ... 
703.0        1
208.0        1
79.0         1
269.0        1
291.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

In [45]:
director.size

4916

In [46]:
director.shape    

(4916,)

In [47]:
movies.shape

(4916, 28)

In [48]:
len(director)

4916

In [49]:
director.unique()

array(['James Cameron', 'Gore Verbinski', 'Sam Mendes', ...,
       'Scott Smith', 'Benjamin Roberds', 'Daniel Hsia'], dtype=object)

In [50]:
director.count()

4814

In [51]:
fb_likes.count()

4909

In [52]:
fb_likes.quantile() # 매개변수를 통해 분위 파악 (가본값으로는 0.5)

982.0

In [53]:
fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
0.5      982.0
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, dtype: float64

In [54]:
fb_likes.min() # 최소값

0.0

In [55]:
fb_likes.max() # 최대값

640000.0

In [56]:
fb_likes.mean() # 평균

6494.488490527602

In [57]:
fb_likes.median()  # 중앙값   

982.0

In [58]:
fb_likes.std() # 표준 편차

15106.986883848309

In [59]:
fb_likes.describe() # 다양한 계산된 값들을 보여준다

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [60]:
director.describe()   

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

In [61]:
director.isna() # isnull과 같다

0       False
1       False
2       False
3       False
4       False
        ...  
4911    False
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [62]:
fb_likes.size

4916

In [63]:
fb_likes.count()    # count는 Nan값을측정하지 않는다

4909

In [64]:
fb_likes_filled = fb_likes.fillna(0)  

In [65]:
fb_likes_dropped = fb_likes.dropna()   # dropna - Nan값 제거
fb_likes_filled.count()
fb_likes_dropped.size

4909

In [66]:
director.value_counts(normalize=True) # normalize 매개변수를 True로 사용시 빈도값 반환

Steven Spielberg    0.005401
Woody Allen         0.004570
Martin Scorsese     0.004155
Clint Eastwood      0.004155
Ridley Scott        0.003324
                      ...   
John Putch          0.000208
Luca Guadagnino     0.000208
Sam Fell            0.000208
Dan Fogelman        0.000208
Daniel Hsia         0.000208
Name: director_name, Length: 2397, dtype: float64

In [67]:
director.hasnans    # Series 안에 Nan값이 있는지를 확인

True

In [68]:
director.notna()   # isna반대

0        True
1        True
2        True
3        True
4        True
        ...  
4911     True
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

## Series 연산

In [69]:
5 + 9    # plus operator example. Adds 5 and 9

14

In [71]:
movies = pd.read_csv('data/movie.csv')
imdb_score = movies['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
4       7.1
       ... 
4911    7.7
4912    7.5
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [72]:
imdb_score + 1      # for문을 돌리지않고 직곽적으로 연산이 가능하다

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [73]:
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
3       21.25
4       17.75
        ...  
4911    19.25
4912    18.75
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [74]:
imdb_score // 7

0       1.0
1       1.0
2       0.0
3       1.0
4       1.0
       ... 
4911    1.0
4912    1.0
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [75]:
imdb_score > 7 

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [76]:
director = movies['director_name']
director == 'James Cameron'

0        True
1       False
2       False
3       False
4       False
        ...  
4911    False
4912    False
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [77]:
imdb_score.add(1)   # imdb_score + 1

0       8.9
1       8.1
2       7.8
3       9.5
4       8.1
       ... 
4911    8.7
4912    8.5
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [78]:
imdb_score.gt(7)   # imdb_score > 7

0        True
1        True
2       False
3        True
4        True
        ...  
4911     True
4912     True
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

## Series 메서드 체인

In [79]:
movies = pd.read_csv('data/movie.csv')
fb_likes = movies['actor_1_facebook_likes']
director = movies['director_name']

In [80]:
director.value_counts().head(3)

Steven Spielberg    26
Woody Allen         22
Martin Scorsese     20
Name: director_name, dtype: int64

In [81]:
fb_likes.isna().sum()   # Nan값의 갯수

7

In [82]:
fb_likes.dtype         # 결측치가 있는 수치 열의 데이터형식은 float형식 이어야한다

dtype('float64')

In [83]:
(fb_likes.fillna(0)      
         .astype(int)
         .head()
)

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

In [84]:
(fb_likes.fillna(0)              # 체인형식을 많이 사용할수록 디버깅이 어렵다
         #.astype(int)
         #.head()
)

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64

In [146]:
(fb_likes.fillna(0)
         .astype(int)
         #.head()
)

0        1000
1       40000
2       11000
3       27000
4         131
        ...  
4911      637
4912      841
4913        0
4914      946
4915       86
Name: actor_1_facebook_likes, Length: 4916, dtype: int32

In [150]:
fb_likes.isna().mean()    # 전체값중 Nan값의 비율

0.0014239218877135883

In [156]:
fb_likes.fillna(0) \
        .astype(int) \
        .head()
# \를 사용해 줄바꿈

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

In [157]:
def debug_df(df):
    print("BEFORE")
    print(df)
    print("AFTER")
    return df

In [325]:
(fb_likes.fillna(0)         # pipe 메소드를 통해 체인 중 함수를 실행할수있다
         .pipe(debug_df)
         .astype(int) 
         .head()
)

BEFORE
0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64
AFTER


0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

In [159]:
intermediate = None
def get_intermediate(df):
    global intermediate      # global 사용
    intermediate = df
    return df

In [330]:
res = (fb_likes.fillna(0)
         .pipe(get_intermediate)
         .astype(int) 
         .head()
)

In [331]:
intermediate

0        1000.0
1       40000.0
2       11000.0
3       27000.0
4         131.0
         ...   
4911      637.0
4912      841.0
4913        0.0
4914      946.0
4915       86.0
Name: actor_1_facebook_likes, Length: 4916, dtype: float64

In [162]:
res

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int32

## 열 이름 변경

In [163]:
movies = pd.read_csv('data/movie.csv')

In [164]:
col_map = {'director_name':'Director Name', 
             'num_critic_for_reviews': 'Critical Reviews'} 

In [165]:
movies.rename(columns=col_map).head()

Unnamed: 0,color,Director Name,...,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,...,1.78,33000
1,Color,Gore Verbinski,...,2.35,0
2,Color,Sam Mendes,...,2.35,85000
3,Color,Christopher Nolan,...,2.35,164000
4,,Doug Walker,...,,0


In [166]:
idx_map = {'Avatar':'Ratava', 'Spectre': 'Ertceps',
  "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
  "movie_facebook_likes": 'fblikes'}
(movies
   .set_index('movie_title')      # movie_title열을 인덱스로 사용
   .rename(index=idx_map, columns=col_map)
   .head(3)
)

Unnamed: 0_level_0,color,director_name,...,aspect,fblikes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [285]:
movies = pd.read_csv('data/movie.csv', index_col='movie_title')   # 처음부터 index를 특정열로 정할수 있다
ids = movies.index.tolist()          # tolist() - list 형태로 변환
columns = movies.columns.tolist()
movies

Unnamed: 0_level_0,color,director_name,...,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Avatar,Color,James Cameron,...,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,...,2.35,0
Spectre,Color,Sam Mendes,...,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,...,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,...,,0
...,...,...,...,...,...
Signed Sealed Delivered,Color,Scott Smith,...,,84
The Following,Color,,...,16.00,32000
A Plague So Pleasant,Color,Benjamin Roberds,...,,16
Shanghai Calling,Color,Daniel Hsia,...,2.35,660


In [None]:
# 행과 열 레이블을 list 대입을 사용해 이름을 변경한다.

In [174]:
ids[0] = 'Ratava'
ids[1] = 'POC'
ids[2] = 'Ertceps'
columns[1] = 'director'
columns[-2] = 'aspect'
columns[-1] = 'fblikes'
movies.index = ids
movies.columns = columns

In [175]:
movies.head()

Unnamed: 0,color,director,...,aspect,fblikes
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,...,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,...,,0


In [176]:
def to_clean(val): # 공백제거, 소문자변환, 공백변환
    return val.strip().lower().replace(' ', '_')   

In [177]:
movies.rename(columns=to_clean).head(3)   # 매개변수 값으로 함수를 사용하여 열변경가능

Unnamed: 0,color,director,...,aspect,fblikes
Ratava,Color,James Cameron,...,1.78,33000
POC,Color,Gore Verbinski,...,2.35,0
Ertceps,Color,Sam Mendes,...,2.35,85000


In [59]:
cols = [col.strip().lower().replace(' ', '_')   # 리스트 컴프레헨션을 사용하여 한줄로 만들수도있다
        for col in movies.columns]
movies.columns = cols
movies.head(3)

Unnamed: 0,color,director,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect,fblikes
Ratava,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
POC,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000


## 열 생성과 삭제

In [296]:
movies = pd.read_csv('data/movie.csv')  # 열생성방법은 다양하다
movies['has_seen'] = 0 # 열 생성1

In [297]:
movies = pd.read_csv('data/movie.csv')
idx_map = {'Avatar':'Ratava', 'Spectre': 'Ertceps',
  "Pirates of the Caribbean: At World's End": 'POC'}
col_map = {'aspect_ratio': 'aspect',
  "movie_facebook_likes": 'fblikes'}
(movies
   .rename(index=idx_map, columns=col_map)
   .assign(has_seen=0)    # 열 생성2 - 매개변수명이 열의 이름이 되기때문에 유효한 매개변수 값을 사용해야한다
)

Unnamed: 0,color,director_name,...,fblikes,has_seen
0,Color,James Cameron,...,33000,0
1,Color,Gore Verbinski,...,0,0
2,Color,Sam Mendes,...,85000,0
3,Color,Christopher Nolan,...,164000,0
4,,Doug Walker,...,0,0
...,...,...,...,...,...
4911,Color,Scott Smith,...,84,0
4912,Color,,...,32000,0
4913,Color,Benjamin Roberds,...,16,0
4914,Color,Daniel Hsia,...,660,0


In [298]:
total = (movies['actor_1_facebook_likes'] +
         movies['actor_2_facebook_likes'] + 
         movies['actor_3_facebook_likes'] + 
         movies['director_facebook_likes'])   # Nan값이 하나라도 있을시 무조건 Nan값이 나온다

In [299]:
total.head(5)

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4        NaN
dtype: float64

In [300]:
(movies
   .assign(total_likes=total)
   ['total_likes']
   .isna()
   .sum()
)

122

In [306]:
(movies
   .assign(total_likes=total.fillna(0))
   ['total_likes']
   .isna()
   .sum()
)


0

In [301]:
cols = ['actor_1_facebook_likes','actor_2_facebook_likes',
    'actor_3_facebook_likes','director_facebook_likes']
sum_col = movies[cols].sum(axis='columns')      # sum을 사용시 Nan값을 무시하고 연산을 한다  (skipnan 의 기본값은 True 이기때문..)
sum_col.head(5)

0     2791.0
1    46563.0
2    11554.0
3    95000.0
4      274.0
dtype: float64

In [302]:
(movies
   .assign(total_likes=sum_col)
   ['total_likes']
   .isna()
   .sum()
)

0

In [303]:
movies.assign(total_likes=sum_col).head(5) 

Unnamed: 0,color,director_name,...,movie_facebook_likes,total_likes
0,Color,James Cameron,...,33000,2791.0
1,Color,Gore Verbinski,...,0,46563.0
2,Color,Sam Mendes,...,85000,11554.0
3,Color,Christopher Nolan,...,164000,95000.0
4,,Doug Walker,...,0,274.0


In [304]:
def sum_likes(df):
   return df[[c for c in df.columns
              if 'like' in c]].sum(axis=1)

In [305]:
movies.assign(total_likes=sum_likes).head(5)  # 열생성시 함수를 사용하여 특정 조건의 열생성 가능

Unnamed: 0,color,director_name,...,movie_facebook_likes,total_likes
0,Color,James Cameron,...,33000,40625.0
1,Color,Gore Verbinski,...,0,94913.0
2,Color,Sam Mendes,...,85000,108254.0
3,Color,Christopher Nolan,...,164000,365759.0
4,,Doug Walker,...,0,417.0


In [240]:
def cast_like_gt_actor_director(df):
    return df['cast_total_facebook_likes'] >= \
           df['total_likes']

In [241]:
df2 = (movies
   .assign(total_likes=total,
           is_cast_likes_more = cast_like_gt_actor_director)
)

In [242]:
df2['is_cast_likes_more'].all()  # all - False가 하나라도 있으면 False반환

False

In [227]:
df2 = df2.drop(columns='total_likes') # 열삭제

In [228]:
actor_sum = (movies
   [[c for c in movies.columns if 'actor_' in c and '_likes' in c]]       # 배우들이 받은 좋아요수 검색
   .sum(axis='columns')
)

In [229]:
actor_sum.head(5)

0     2791.0
1    46000.0
2    11554.0
3    73000.0
4      143.0
dtype: float64

In [255]:
movies['cast_total_facebook_likes'] >= actor_sum   # 연산자로 비교

0       True
1       True
2       True
3       True
4       True
        ... 
4911    True
4912    True
4913    True
4914    True
4915    True
Length: 4916, dtype: bool

In [308]:
movies['cast_total_facebook_likes'].ge(actor_sum)  # 메소드로 비교

0       True
1       True
2       True
3       True
4       True
        ... 
4911    True
4912    True
4913    True
4914    True
4915    True
Length: 4916, dtype: bool

In [311]:
movies['cast_total_facebook_likes'].ge(actor_sum).all()    # 모든값이 True인지 확인

True

In [252]:
pct_like = (actor_sum
    .div(movies['cast_total_facebook_likes']).mul(100)
)
pct_like

0        57.736864
1        95.139607
2        98.752137
3        68.378310
4       100.000000
           ...    
4911     62.417871
4912    100.000000
4913           NaN
4914     90.276614
4915     76.687117
Length: 4916, dtype: float64

In [234]:
pct_like.describe()

count    4883.000000
mean       83.327889
std        14.056578
min        30.076696
25%        73.528368
50%        86.928884
75%        95.477440
max       100.000000
dtype: float64

In [235]:
pd.Series(pct_like.values,
    index=movies['movie_title'].values).head()

Avatar                                         57.736864
Pirates of the Caribbean: At World's End       95.139607
Spectre                                        98.752137
The Dark Knight Rises                          68.378310
Star Wars: Episode VII - The Force Awakens    100.000000
dtype: float64

In [319]:
profit_index = movies.columns.get_loc('gross') + 1     # get_loc - 열의 이름을 통해 열의 위치값을 찾을수있다
profit_index

9

In [320]:
movies.insert(loc=profit_index,
              column='profit',
              value=movies['gross'] - movies['budget'])   # 열생성4 insert는 DataFrame자체를 수정하므로 대입 명령문은 없다

In [278]:
movies.iloc[:, :10]   # 10번째 열값에 들어가 있는모습

Unnamed: 0,color,director_name,...,gross,profit
0,Color,James Cameron,...,760505847.0,523505847.0
1,Color,Gore Verbinski,...,309404152.0,9404152.0
2,Color,Sam Mendes,...,200074175.0,-44925825.0
3,Color,Christopher Nolan,...,448130642.0,198130642.0
4,,Doug Walker,...,,
...,...,...,...,...,...
4911,Color,Scott Smith,...,,
4912,Color,,...,,
4913,Color,Benjamin Roberds,...,,
4914,Color,Daniel Hsia,...,10443.0,


In [279]:
del movies['director_name']   # drop 대신 사용가능 but 새로운 DataFrame을 반환하지 않기에 .drop사용 선호

In [280]:
movies

Unnamed: 0,color,num_critic_for_reviews,...,aspect_ratio,movie_facebook_likes
0,Color,723.0,...,1.78,33000
1,Color,302.0,...,2.35,0
2,Color,602.0,...,2.35,85000
3,Color,813.0,...,2.35,164000
4,,,...,,0
...,...,...,...,...,...
4911,Color,1.0,...,,84
4912,Color,43.0,...,16.00,32000
4913,Color,13.0,...,,16
4914,Color,14.0,...,2.35,660
