<a href="https://colab.research.google.com/github/Blackman9t/PandasCookbook/blob/master/Chapter_1_pandas_Foundations_part_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recipes
* [Dissecting the anatomy of a DataFrame](#Dissecting-the-anatomy-of-a-DataFrame)
* [Accessing the main DataFrame components](#Accessing-the-main-DataFrame-components)
* [Understanding data types](#Understanding-data-types)
* [Selecting a single Series](#Selecting-a-single-Series)
* [Calling Series methods](#Calling-Series-methods)
* [Working with operators on a Series](#Working-with-operators-on-a-Series)
* [Chaining Series methods together](#Chaining-Series-methods-together)
* [Making a meaningful index](#Making-a-meaningful-index)
* [Renaming row and column labels](#Renaming-row-and-column-labels)
* [Creating and deleting columns](#Creating-and-deleting-columns)

In [0]:
import pandas as pd
import numpy as np
from IPython.display import display

## change options for each recipe

In [0]:
# Use the fully qualified option name. The short form 'max_rows' is treated as
# a pattern, and on newer pandas it matches more than one option
# (e.g. display.max_rows and styler.render.max_rows), raising OptionError.
pd.set_option('display.max_rows', 10)

### mounting gdrive so we can import data files to colab

In [135]:
# Mount Google Drive at /content/gdrive so the data files stored there
# can be opened with ordinary file paths from this Colab session.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# Read the raw text of the movie CSV from the mounted Drive folder.
# The text is kept in `file` because later cells re-parse it with StringIO(file).
# There is deliberately no placeholder assignment beforehand: if the open fails,
# `file` is left undefined rather than silently holding an empty string.
with open('/content/gdrive/My Drive/Colab_Notebooks/data_tools/pandas/PandasCookBook_Code/data/movie.csv') as f:
    file = f.read()

In [0]:
# Parse the CSV text held in `file` into a DataFrame via an in-memory text buffer.

import sys

# Pick the StringIO implementation that matches the running interpreter.
if sys.version_info[0] >= 3:
    from io import StringIO
else:
    from StringIO import StringIO

# Extra tokens that should be read as missing values, on top of pandas' defaults.
missing_data = ['na', '--', '?', 'None']

TESTDATA = StringIO(file)

movie = pd.read_csv(TESTDATA, sep=",", na_values=missing_data)


# Dissecting the anatomy of a DataFrame

In [138]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


![dataframe anatomy](./images/ch01_dataframe_anatomy.png)

# Accessing the main DataFrame components

In [139]:
movie.shape

(4916, 28)

In [0]:
columns = movie.columns
index = movie.index
data = movie.values

In [141]:
columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [142]:
index

RangeIndex(start=0, stop=4916, step=1)

In [143]:
data

array([['Color', 'James Cameron', 723.0, ..., 7.9, 1.78, 33000],
       ['Color', 'Gore Verbinski', 302.0, ..., 7.1, 2.35, 0],
       ['Color', 'Sam Mendes', 602.0, ..., 6.8, 2.35, 85000],
       ...,
       ['Color', 'Benjamin Roberds', 13.0, ..., 6.3, nan, 16],
       ['Color', 'Daniel Hsia', 14.0, ..., 6.3, 2.35, 660],
       ['Color', 'Jon Gunn', 43.0, ..., 6.6, 1.85, 456]], dtype=object)

In [144]:
len(data)

4916

Confirming that `RangeIndex` is a subclass of `Index`:

In [145]:
issubclass(pd.RangeIndex, pd.Index)  # This checks if first arg is a subclass of second arg

True

## There's more

In [146]:
index.values

array([   0,    1,    2, ..., 4913, 4914, 4915])

In [147]:
columns.values

array(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres',
       'actor_1_name', 'movie_title', 'num_voted_users',
       'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes'], dtype=object)

# Understanding data types

In [148]:
movie.dtypes

color                       object
director_name               object
num_critic_for_reviews     float64
duration                   float64
director_facebook_likes    float64
                            ...   
title_year                 float64
actor_2_facebook_likes     float64
imdb_score                 float64
aspect_ratio               float64
movie_facebook_likes         int64
Length: 28, dtype: object

In [149]:
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0.
# Counting the dtype names directly gives the same table; sort_index() keeps
# the alphabetical ordering (float64, int64, object) that the old method produced.
movie.dtypes.astype(str).value_counts().sort_index()

float64    13
int64       3
object     12
dtype: int64

# Selecting a single Series

In [150]:
movie['director_name']

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [151]:
type(movie['director_name'])

pandas.core.series.Series

# There's more

In [152]:
director = movie['director_name'] # save Series to variable
director.name

'director_name'

In [153]:
director.to_frame().head()  # converting the Series to a Data Frame and display first 5 rows

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


# Calling Series methods

## Getting ready...

In [154]:
# Collect every attribute and method name exposed by the Series class.
s_attr_methods = {*dir(pd.Series)}
len(s_attr_methods)

471

In [155]:
# Collect every attribute and method name exposed by the DataFrame class.
df_attr_methods = {*dir(pd.DataFrame)}
len(df_attr_methods)

458

In [156]:
# Names that Series and DataFrame have in common.
Meth = s_attr_methods.intersection(df_attr_methods)
len(Meth)

400

## How to do it...

In [0]:
director = movie['director_name']
actor_1_fb_likes = movie['actor_1_facebook_likes']

In [158]:
director.head()

0        James Cameron
1       Gore Verbinski
2           Sam Mendes
3    Christopher Nolan
4          Doug Walker
Name: director_name, dtype: object

In [159]:
actor_1_fb_likes.head()

0     1000.0
1    40000.0
2    11000.0
3    27000.0
4      131.0
Name: actor_1_facebook_likes, dtype: float64

In [160]:
# Temporarily limit the display to 8 rows for this output only.
# The fully qualified 'display.max_rows' is used because the short alias
# 'max_rows' matches more than one option on newer pandas and raises OptionError.
with pd.option_context('display.max_rows', 8):
    display(director.value_counts())

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Martin Scorsese     20
                    ..
Zach Cregger         1
Nathan Greno         1
Giovanni Zelko       1
Rob Pritts           1
Name: director_name, Length: 2397, dtype: int64

In [161]:
# Change the display limit globally to 8 rows (affects every later cell).
# The fully qualified 'display.max_rows' is used because the short alias
# 'max_rows' matches more than one option on newer pandas and raises OptionError.
pd.set_option('display.max_rows', 8)
actor_1_fb_likes.value_counts()

1000.0     436
11000.0    206
2000.0     189
3000.0     150
          ... 
216.0        1
859.0        1
225.0        1
334.0        1
Name: actor_1_facebook_likes, Length: 877, dtype: int64

In [162]:
director.size  # Size normally outputs the product of nrows by ncols. In series it will be == len

4916

In [163]:
director.shape

(4916,)

In [164]:
len(director)

4916

In [165]:
director.count()  # Count returns the total number of Non-NaN values

4814

In [166]:
actor_1_fb_likes.count()

4909

In [167]:
actor_1_fb_likes.quantile(.25)  # method exists to calculate an exact quantile of numeric data

607.0

In [168]:
# Basic summary statistics of the likes column, returned together as one tuple.
(
    actor_1_fb_likes.min(),
    actor_1_fb_likes.max(),
    actor_1_fb_likes.mean(),
    actor_1_fb_likes.median(),
    actor_1_fb_likes.std(),
    actor_1_fb_likes.sum(),
)

(0.0, 640000.0, 6494.488490527602, 982.0, 15106.986883848309, 31881444.0)

In [169]:
actor_1_fb_likes.describe()

count      4909.000000
mean       6494.488491
std       15106.986884
min           0.000000
25%         607.000000
50%         982.000000
75%       11000.000000
max      640000.000000
Name: actor_1_facebook_likes, dtype: float64

In [170]:
actor_1_fb_likes.quantile(.2)

510.0

In [171]:
actor_1_fb_likes.quantile([.1, .2, .3, .4, .5, .6, .7, .8, .9])

0.1      240.0
0.2      510.0
0.3      694.0
0.4      854.0
        ...   
0.6     1000.0
0.7     8000.0
0.8    13000.0
0.9    18000.0
Name: actor_1_facebook_likes, Length: 9, dtype: float64

In [172]:
director.isnull()

0       False
1       False
2       False
3       False
        ...  
4912     True
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [173]:
actor_1_fb_likes_filled = actor_1_fb_likes.fillna(0)  # Filling missing values with 0
actor_1_fb_likes_filled.count()

4916

In [174]:
actor_1_fb_likes_dropped = actor_1_fb_likes.dropna()  # dropping missing values
actor_1_fb_likes_dropped.size

4909

## There's more...

In [175]:
director.value_counts(normalize=True)  # Setting normalize=True returns the relative frequency of each item in the distribution instead of the count of occurrence of each.

Steven Spielberg    0.005401
Woody Allen         0.004570
Clint Eastwood      0.004155
Martin Scorsese     0.004155
                      ...   
Zach Cregger        0.000208
Nathan Greno        0.000208
Giovanni Zelko      0.000208
Rob Pritts          0.000208
Name: director_name, Length: 2397, dtype: float64

In [176]:
director.hasnans  # Asking if the director series has any NaN values, returns a bool

True

In [177]:
director.notnull()  # returns a bool for each observation depending on whether it is null or not

0        True
1        True
2        True
3        True
        ...  
4912    False
4913     True
4914     True
4915     True
Name: director_name, Length: 4916, dtype: bool

In [178]:
director.describe()

count                 4814
unique                2397
top       Steven Spielberg
freq                    26
Name: director_name, dtype: object

# Working with operators on a Series

In [0]:
pd.options.display.max_rows = 6

In [180]:
5 + 9    # plus operator example. Adds 5 and 9

14

In [181]:
4 ** 2   # exponentiation operator. Raises 4 to the second power

16

In [0]:
a = 10   # assignment operator.

In [183]:
5 <= 9   # less than or equal to operator

True

In [184]:
'abcde' + 'fg'    # plus operator for strings: concatenates the two strings

'abcdefg'

In [185]:
not (5 <= 9)      # not is a reserved-keyword operator that negates a boolean

False

In [186]:
7 in [1, 2, 6]    # in operator checks for membership of a list

False

In [187]:
set([1,2,3]) & set([2,3,4])

{2, 3}

In [0]:
# [1, 2, 3] - 3  # This code line produces a TypeError

In [0]:
# a = set([1,2,3])     
# a[0]                 # the indexing operator does not work with sets

## Getting ready...

In [190]:
imdb_score = movie['imdb_score']
imdb_score

0       7.9
1       7.1
2       6.8
       ... 
4913    6.3
4914    6.3
4915    6.6
Name: imdb_score, Length: 4916, dtype: float64

In [191]:
imdb_score + 1

0       8.9
1       8.1
2       7.8
       ... 
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [192]:
imdb_score * 2.5

0       19.75
1       17.75
2       17.00
        ...  
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [193]:
imdb_score // 7

0       1.0
1       1.0
2       0.0
       ... 
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [194]:
imdb_score > 7

0        True
1        True
2       False
        ...  
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

boolean equality determination

In [0]:
director = movie['director_name']

In [196]:
director == 'James Cameron'  # checks if each element in the series is equal to 'James Cameron'

0        True
1       False
2       False
        ...  
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

## There's more...

In [197]:
imdb_score.add(1)              # imdb_score + 1

0       8.9
1       8.1
2       7.8
       ... 
4913    7.3
4914    7.3
4915    7.6
Name: imdb_score, Length: 4916, dtype: float64

In [198]:
imdb_score.mul(2.5)            # imdb_score * 2.5

0       19.75
1       17.75
2       17.00
        ...  
4913    15.75
4914    15.75
4915    16.50
Name: imdb_score, Length: 4916, dtype: float64

In [199]:
imdb_score.floordiv(7)         # imdb_score // 7

0       1.0
1       1.0
2       0.0
       ... 
4913    0.0
4914    0.0
4915    0.0
Name: imdb_score, Length: 4916, dtype: float64

In [200]:
imdb_score.gt(7)               # imdb_score > 7

0        True
1        True
2       False
        ...  
4913    False
4914    False
4915    False
Name: imdb_score, Length: 4916, dtype: bool

In [201]:
director.eq('James Cameron')   # director == 'James Cameron'

0        True
1       False
2       False
        ...  
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: bool

In [202]:
imdb_score.astype(int).mod(5)  # Cast imdb_score to int, then take each entry modulo 5 (remainder after dividing by 5).

0       2
1       2
2       1
       ..
4913    1
4914    1
4915    1
Name: imdb_score, Length: 4916, dtype: int64

In [0]:
a = type(1)

In [204]:
type(a)

type

In [0]:
a = type(imdb_score)

In [206]:
a([1,2,3])

0    1
1    2
2    3
dtype: int64

# Chaining Series methods together

In [0]:
actor_1_fb_likes = movie['actor_1_facebook_likes']
director = movie['director_name']

In [208]:
director.value_counts().head(3)  # Select the count of unique elements in director and display the first 3

Steven Spielberg    26
Woody Allen         22
Clint Eastwood      20
Name: director_name, dtype: int64

In [209]:
actor_1_fb_likes.isnull().sum()  # what is the number of NaN values in the actor_1_fb_likes column

7

In [210]:
actor_1_fb_likes.dtype

dtype('float64')

In [211]:
# Chaining expressions, first fillna with zero
# Then convert the values to int
# Then display first 5 rows

actor_1_fb_likes.fillna(0)\
                .astype(int)\
                .head()

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

## There's more...

In [212]:
actor_1_fb_likes.isnull()\
                .mean()

0.0014239218877135883

In [213]:
# Same chain as before, wrapped in parentheses instead of using backslash line continuations

(actor_1_fb_likes.fillna(0)
                 .astype(int)
                 .head())

0     1000
1    40000
2    11000
3    27000
4      131
Name: actor_1_facebook_likes, dtype: int64

# Making a meaningful index

In [214]:
movie.shape

(4916, 28)

In [215]:
movie.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
4,,Doug Walker,,,...,12.0,7.1,,0


Define a new DataFrame that is the same as `movie`, except that one of its columns (`movie_title`) is used as the index:

In [216]:
movie2 = movie.set_index('movie_title')
movie2

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
...,...,...,...,...,...,...,...,...,...
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,...,0.0,6.3,,16
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,...,719.0,6.3,2.35,660
My Date with Drew,Color,Jon Gunn,43.0,90.0,...,23.0,6.6,1.85,456


it is possible to choose a column as the index upon initial read with the index_col parameter of the read_csv function:

In [223]:
data = StringIO(file)
pd.read_csv(data, index_col='movie_title')

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
...,...,...,...,...,...,...,...,...,...
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,...,0.0,6.3,,16
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,...,719.0,6.3,2.35,660
My Date with Drew,Color,Jon Gunn,43.0,90.0,...,23.0,6.6,1.85,456


# There's more...

In [224]:
movie2.reset_index()  # resetting the index back to default

Unnamed: 0,movie_title,color,director_name,num_critic_for_reviews,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Avatar,Color,James Cameron,723.0,...,936.0,7.9,1.78,33000
1,Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,...,5000.0,7.1,2.35,0
2,Spectre,Color,Sam Mendes,602.0,...,393.0,6.8,2.35,85000
...,...,...,...,...,...,...,...,...,...
4913,A Plague So Pleasant,Color,Benjamin Roberds,13.0,...,0.0,6.3,,16
4914,Shanghai Calling,Color,Daniel Hsia,14.0,...,719.0,6.3,2.35,660
4915,My Date with Drew,Color,Jon Gunn,43.0,...,23.0,6.6,1.85,456


# Renaming row and column labels

In [229]:
# If we know ahead of time which column will make the most meaningful index,
# we can specify it up front in read_csv through the index_col parameter

data = StringIO(file)

movie3 = pd.read_csv(data, index_col='movie_title')

movie3

Unnamed: 0_level_0,color,director_name,num_critic_for_reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Avatar,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Spectre,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
...,...,...,...,...,...,...,...,...,...
A Plague So Pleasant,Color,Benjamin Roberds,13.0,76.0,...,0.0,6.3,,16
Shanghai Calling,Color,Daniel Hsia,14.0,100.0,...,719.0,6.3,2.35,660
My Date with Drew,Color,Jon Gunn,43.0,90.0,...,23.0,6.6,1.85,456


<h3>By default, both set_index and read_csv drop the column used as the index from the DataFrame. With set_index, it is possible to keep the column in the DataFrame by setting the drop parameter to False.

There's more...
Conversely, it is possible to turn the index into a column with the reset_index method. This will make movie_title a column again and revert the index back to a RangeIndex. reset_index always puts the column as the very first one in the DataFrame, so the columns may not be in their original order:</h3>

In [233]:
'movie_title' in movie3.columns  # the movie_title col has become the index so it no longer exists as a column

False

Renaming some index and columns

In [0]:
# Old-label -> new-label mappings passed to DataFrame.rename.
indexes_renamed = dict(Avatar='Ratava', Spectre='Ertceps')
columns_renamed = dict(
    director_name='Director Name',
    num_critic_for_reviews='Critical Reviews',
)

In [235]:
movie3.rename(index=indexes_renamed, columns=columns_renamed).head()

Unnamed: 0_level_0,color,Director Name,Critical Reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Ratava,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


# There's more

How it works...
The rename DataFrame method allows for both row and column labels to be renamed at the same time with the index and columns parameters. Each of these parameters may be set to a dictionary that maps old labels to their new values.

There's more...
There are multiple ways to rename row and column labels. It is possible to reassign the index and column attributes directly to a Python list. This assignment works when the list has the same number of elements as the row and column labels. The following code uses the tolist method on each Index object to create a Python list of labels. It then modifies a couple values in the list and reassigns the list to the attributes index and columns:



In [0]:
# Re-parse the CSV text with movie_title as the index.
# Note: this rebinds `movie`, replacing the frame loaded at the top of the notebook.
data = StringIO(file)

movie = pd.read_csv(data, index_col='movie_title')
index = movie.index
columns = movie.columns

# Convert both Index objects to plain Python lists so single labels can be edited.
index_list = index.tolist()
column_list = columns.tolist()

# Change two row labels and two column labels by position.
index_list[0] = 'Ratava'
index_list[2] = 'Ertceps'
column_list[1] = 'Director Name'
column_list[2] = 'Critical Reviews'

In [237]:
print(index_list[:5])

['Ratava', "Pirates of the Caribbean: At World's End", 'Ertceps', 'The Dark Knight Rises', 'Star Wars: Episode VII - The Force Awakens']


In [238]:
print(column_list)

['color', 'Director Name', 'Critical Reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes']


In [0]:
movie.index = index_list
movie.columns = column_list

In [242]:
movie.head()

Unnamed: 0,color,Director Name,Critical Reviews,duration,...,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
Ratava,Color,James Cameron,723.0,178.0,...,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,5000.0,7.1,2.35,0
Ertceps,Color,Sam Mendes,602.0,148.0,...,393.0,6.8,2.35,85000
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,12.0,7.1,,0


# Creating and deleting columns

In [0]:
movie['has_seen'] = 0

In [244]:
movie.columns

Index(['color', 'Director Name', 'Critical Reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes', 'has_seen'],
      dtype='object')

In [0]:
# Element-wise total of the three actor like counts plus the director's.
# A missing value in any of the four inputs makes the total missing as well.
movie['actor_director_facebook_likes'] = (
    movie['actor_1_facebook_likes']
    .add(movie['actor_2_facebook_likes'])
    .add(movie['actor_3_facebook_likes'])
    .add(movie['director_facebook_likes'])
)

In [246]:
movie['actor_director_facebook_likes'].isnull().sum()

122

In [0]:
movie['actor_director_facebook_likes'] = movie['actor_director_facebook_likes'].fillna(0)

In [0]:
# Boolean column: True where the cast total is at least the actor+director total.
movie['is_cast_likes_more'] = movie['cast_total_facebook_likes'].ge(
    movie['actor_director_facebook_likes']
)

is_cast_likes_more is now a column of boolean values. We can check whether all the values of this column are True with the all Series method:

In [251]:
movie['is_cast_likes_more'].all()  # Checks if all values are True in the column is_cast_likes_more

False

It turns out that there is at least one movie with more actor_director_facebook_likes than cast_total_facebook_likes. It could be that director Facebook likes are not part of the cast total likes. Let's backtrack and delete column actor_director_facebook_likes:

In [0]:
# Remove the combined actor+director column again; drop returns a new frame.
movie = movie.drop(columns='actor_director_facebook_likes')

Let's recreate a column of just the total actor likes:

In [0]:
# Total likes of the three listed actors; rows where any input is missing
# come out as NaN from the addition and are then replaced with 0.
movie['actor_total_facebook_likes'] = (
    movie['actor_1_facebook_likes']
    .add(movie['actor_2_facebook_likes'])
    .add(movie['actor_3_facebook_likes'])
    .fillna(0)
)

Check again whether every value in cast_total_facebook_likes is greater than or equal to the corresponding value in actor_total_facebook_likes:

In [254]:
# Recompute the flag against the actor-only total, then test whether it holds for every row.
movie['is_cast_likes_more'] = movie['cast_total_facebook_likes'].ge(
    movie['actor_total_facebook_likes']
)

movie['is_cast_likes_more'].all()

True

Finally, let's calculate the percentage of the cast_total_facebook_likes that come from actor_total_facebook_likes:

In [0]:
# Share of the cast's total likes that comes from the three listed actors.
movie['pct_actor_cast_like'] = movie['actor_total_facebook_likes'].div(
    movie['cast_total_facebook_likes']
)

Let's validate that the min and max of this column fall between 0 and 1:

In [256]:
movie['pct_actor_cast_like'].min(), movie['pct_actor_cast_like'].max() 

(0.0, 1.0)

We can then inspect the result. The index already holds the movie titles (the file was re-read with `index_col='movie_title'`), so each value can be identified by its title.

In [261]:
# No index change is needed here: the index already holds the movie titles
# (the frame was re-read with index_col='movie_title'). A bare
# `movie.reset_index` without parentheses only references the method and
# does nothing, so it is not included.
movie.head()

Unnamed: 0,color,Director Name,Critical Reviews,duration,...,has_seen,is_cast_likes_more,actor_total_facebook_likes,pct_actor_cast_like
Ratava,Color,James Cameron,723.0,178.0,...,0,True,2791.0,0.577369
Pirates of the Caribbean: At World's End,Color,Gore Verbinski,302.0,169.0,...,0,True,46000.0,0.951396
Ertceps,Color,Sam Mendes,602.0,148.0,...,0,True,11554.0,0.987521
The Dark Knight Rises,Color,Christopher Nolan,813.0,164.0,...,0,True,73000.0,0.683783
Star Wars: Episode VII - The Force Awakens,,Doug Walker,,,...,0,True,0.0,0.0


<h3>How it works...
Many pandas operations are flexible, and column creation is one of them. This recipe assigns both a scalar value, as seen in Step 1, and a Series, as seen in step 2, to create a new column.

Step 2 adds four different Series together with the plus operator. Step 3 uses method chaining to find and fill missing values. Step 4 uses the greater than or equal comparison operator to return a boolean Series, which is then evaluated with the all method in step 5 to check whether every single value is True or not.

The drop method accepts the name of the row or column to delete. It defaults to dropping rows by the index names. To drop columns you must set the axis parameter to either 1 or columns. The default value for axis is 0 or the string index.

Steps 7 and 8 redo the work of step 3 to step 5 without the director_facebook_likes column. Step 9 finally calculates the desired column we wanted since step 4. Step 10 validates that the percentages are between 0 and 1.</h3>

## There's more...

It is possible to insert a new column into a specific place in a DataFrame besides the end with the insert method. The insert method takes the integer position of the new column as its first argument, the name of the new column as its second, and the values as its third. You will need to use the get_loc Index method to find the integer location of the column name.

The insert method modifies the calling DataFrame in-place, so there won't be an assignment statement. The profit of each movie may be calculated by subtracting budget from gross and inserting it directly after gross with the following:

In [0]:
profit_index = movie.columns.get_loc('gross') + 1  # This gets the column location of gross and 1 added to make it the index of profit so they be side by side

In [0]:
# Insert profit (gross minus budget) at the position computed for it.
# insert modifies `movie` in place, so there is no assignment.
movie.insert(
    loc=profit_index,
    column='profit',
    value=movie['gross'].sub(movie['budget']),
)

In [268]:
movie.columns  # we can see that profit is immediately after gross and is the value of gross minus budget

Index(['color', 'Director Name', 'Critical Reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'profit', 'genres', 'actor_1_name',
       'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes', 'has_seen',
       'is_cast_likes_more', 'actor_total_facebook_likes',
       'pct_actor_cast_like'],
      dtype='object')

### An alternative to deleting columns with the drop method is to use the del statement:

In [0]:
# del is a statement, so no call-style parentheses are needed.
del movie['actor_total_facebook_likes']

In [275]:
movie.columns  # We can see that the actor_total_facebook_likes column has been aptly deleted.

Index(['color', 'Director Name', 'Critical Reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'profit', 'genres', 'actor_1_name',
       'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name',
       'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link',
       'num_user_for_reviews', 'language', 'country', 'content_rating',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'aspect_ratio', 'movie_facebook_likes', 'has_seen',
       'is_cast_likes_more', 'pct_actor_cast_like'],
      dtype='object')