# Pandas Intro

**pandas** is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language. (https://pandas.pydata.org/)

In [2]:
import pandas as pd

## Series
![series](series.png)

The two primary building blocks of **pandas** are the `Series` and the `DataFrame`.

A `Series` is essentially a column of data with
* a name
* a row index
* a datatype

In [6]:
# Build a Series from a plain list. The list mixes ints with one float (75.5),
# so the whole column is stored as float64.
data = [42, 53, 64, 75.5]
ser = pd.Series(
    data=data,
    index=["john", "paul", "george", "ringo"],
    name="sales",
)
ser

john      42.0
paul      53.0
george    64.0
ringo     75.5
Name: sales, dtype: float64

In [7]:
# A dict maps straight onto a Series: the keys become the index labels and the
# values become the data. No name is given, so the Series is unnamed.
data_dict = dict(XBX=1989, EP=1912, OX=2022)
club_series = pd.Series(data=data_dict)
club_series

XBX    1989
EP     1912
OX     2022
dtype: int64

## DataFrame

A `DataFrame` is a collection of 1 or more Series, hence, a 2-dimensional table of data with
* a Series per column
* a shared index for all the columns
* a name for each column (the name of the underlying Series)

![](series-and-dataframe.png)

In [11]:
# Each key becomes a column name and each list becomes that column's values.
# No index is supplied, so the rows are labelled 0..3 by default.
data = dict(
    apples=[3, 2, 0, 1],
    oranges=[0, 3, 7, 2],
)
fruit_df = pd.DataFrame(data=data)
fruit_df

Unnamed: 0,apples,oranges
0,3,0
1,2,3
2,0,7
3,1,2


### Components of a `DataFrame`

In [12]:
# index: the row labels of the DataFrame. fruit_df was built without an explicit
# index, so pandas generated a default RangeIndex (0, 1, 2, 3).
fruit_df.index

RangeIndex(start=0, stop=4, step=1)

In [40]:
# custom index: pass explicit row labels instead of relying on the default
# 0..n-1 range; one label per row, in row order
row_labels = ["john", "paul", "george", "ringo"]
fruit_df = pd.DataFrame(data=data, index=row_labels)
fruit_df


Unnamed: 0,apples,oranges
john,3,0
paul,2,3
george,0,7
ringo,1,2


In [15]:
# data: the underlying values as a 2-D NumPy array (one array row per DataFrame
# row, one array column per DataFrame column); index and column labels are dropped
fruit_df.to_numpy()

array([[3, 0],
       [2, 3],
       [0, 7],
       [1, 2]])

In [16]:
# datatypes: one dtype per column, reported against the column name
fruit_df.dtypes

apples     int64
oranges    int64
dtype: object

## Reading in a csv

In [18]:
# Load the CSV into a DataFrame; 'movie.csv' is a relative path, so the file is
# looked up in the notebook's current working directory.
movies_df = pd.read_csv('movie.csv')
# Displaying a large frame renders only its first/last rows and columns;
# the "..." entries mark what was left out.
movies_df

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [20]:
# head: the first n rows of the frame (here n=10), in their original order
movies_df.head(10)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
5,Color,Andrew Stanton,462.0,132.0,475.0,530.0,Samantha Morton,640.0,73058679.0,Action|Adventure|Sci-Fi,...,738.0,English,USA,PG-13,263700000.0,2012.0,632.0,6.6,2.35,24000
6,Color,Sam Raimi,392.0,156.0,0.0,4000.0,James Franco,24000.0,336530303.0,Action|Adventure|Romance,...,1902.0,English,USA,PG-13,258000000.0,2007.0,11000.0,6.2,2.35,0
7,Color,Nathan Greno,324.0,100.0,15.0,284.0,Donna Murphy,799.0,200807262.0,Adventure|Animation|Comedy|Family|Fantasy|Musi...,...,387.0,English,USA,PG,260000000.0,2010.0,553.0,7.8,1.85,29000
8,Color,Joss Whedon,635.0,141.0,0.0,19000.0,Robert Downey Jr.,26000.0,458991599.0,Action|Adventure|Sci-Fi,...,1117.0,English,USA,PG-13,250000000.0,2015.0,21000.0,7.5,2.35,118000
9,Color,David Yates,375.0,153.0,282.0,10000.0,Daniel Radcliffe,25000.0,301956980.0,Adventure|Family|Fantasy|Mystery,...,973.0,English,UK,PG,250000000.0,2009.0,11000.0,7.5,2.35,10000


In [21]:
# sample: n rows drawn at random (here n=8).
# A fixed random_state makes the draw repeatable, so "Restart & Run All" shows
# the same rows every time; remove it to get a fresh sample on each run.
movies_df.sample(8, random_state=42)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
2614,Color,Sara Sugarman,74.0,89.0,10.0,231.0,Carol Kane,811.0,29302097.0,Comedy|Family|Music|Romance,...,105.0,English,Germany,PG,15000000.0,2004.0,636.0,4.6,1.85,810
1088,Color,Luis Llosa,38.0,110.0,49.0,20.0,Rod Steiger,13000.0,57362581.0,Action|Crime|Drama|Romance|Thriller,...,116.0,English,Peru,R,45000000.0,1994.0,279.0,5.4,1.85,0
4334,Color,Adam Carolla,14.0,98.0,102.0,360.0,Jim O'Heir,563.0,105943.0,Comedy,...,11.0,English,USA,,1500000.0,2015.0,485.0,6.1,,212
3030,Color,Léa Pool,17.0,97.0,4.0,194.0,Marc Donato,882.0,,Adventure|Drama,...,28.0,English,Canada,PG,12500000.0,2004.0,450.0,6.3,1.85,392
175,Color,Louis Leterrier,354.0,135.0,255.0,882.0,Peter Mensah,3000.0,134518390.0,Action|Adventure|Sci-Fi,...,643.0,English,USA,PG-13,150000000.0,2008.0,1000.0,6.8,2.35,0
20,Color,Peter Jackson,422.0,164.0,0.0,773.0,Adam Brown,5000.0,255108370.0,Adventure|Fantasy,...,802.0,English,New Zealand,PG-13,250000000.0,2014.0,972.0,7.5,2.35,65000
3444,Color,David Mamet,158.0,99.0,342.0,501.0,Randy Couture,1000.0,2344847.0,Drama|Sport,...,119.0,English,USA,R,7000000.0,2008.0,518.0,6.8,2.35,0
4623,Color,Siddiq Barmak,105.0,83.0,6.0,0.0,Zubaida Sahar,30.0,1127331.0,Drama,...,77.0,Dari,Afghanistan,PG-13,46000.0,2003.0,0.0,7.4,1.85,0


In [22]:
# shape: a (number_of_rows, number_of_columns) tuple.
# It is an attribute, not a method, so it takes no parentheses.

movies_df.shape

(4916, 28)

In [23]:
# columns: the column labels of the frame, returned as an Index object
movies_df.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

## Selecting a column

In [34]:
# index operator ([] notation): works for every column name
directors = movies_df["director_name"]

# attribute access (dot notation): shorthand for the same column. It only works
# when the column name is a valid Python identifier that does not clash with an
# existing DataFrame attribute or method, so prefer [] when in doubt.
directors = movies_df.director_name

# Only the last expression of a cell is displayed, so show the result once here.
directors

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

## Selecting rows and columns

In [39]:
# select row with index 0
# .loc looks rows up by index LABEL; a single label returns that row as a Series
# whose index is made of the DataFrame's column names
row0 = movies_df.loc[ 0 ]
row0


color                                                                    Color
director_name                                                    James Cameron
num_critic_for_reviews                                                     723
duration                                                                   178
director_facebook_likes                                                      0
actor_3_facebook_likes                                                     855
actor_2_name                                                  Joel David Moore
actor_1_facebook_likes                                                    1000
gross                                                              7.60506e+08
genres                                         Action|Adventure|Fantasy|Sci-Fi
actor_1_name                                                       CCH Pounder
movie_title                                                             Avatar
num_voted_users                                     

In [44]:
# label-based row selection on a custom (string) index:
# rebuild fruit_df with the four names as row labels, then pick one row by its label
fruit_df = pd.DataFrame(data, index=["john", "paul", "george", "ringo"])
print(fruit_df)  # print() because only a cell's last expression is displayed automatically
fruit_df.loc["paul"]

        apples  oranges
john         3        0
paul         2        3
george       0        7
ringo        1        2


apples     2
oranges    3
Name: paul, dtype: int64

In [47]:
# select row at location 3
# .iloc selects by 0-based integer position, regardless of the index labels:
# position 3 is the fourth row, which carries the label "ringo"
fruit_df.iloc[3]

apples     1
oranges    2
Name: ringo, dtype: int64

In [49]:
# select row with index 0 and director name column
# .loc[row_label, column_label] returns the single value stored in that cell
movies_df.loc[0, "director_name"]

'James Cameron'

In [50]:
# select row with index 10 and movie title column
# same pattern as any .loc[row_label, column_label] lookup: one cell, one scalar result
movies_df.loc[10, "movie_title"]

'Batman v Superman: Dawn of Justice'

In [51]:
# select all rows and column director name
# a bare ":" in the row position means "every row"; the result is the whole
# column as a Series
movies_df.loc[ :, "director_name"]

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4             Doug Walker
              ...        
4911          Scott Smith
4912                  NaN
4913     Benjamin Roberds
4914          Daniel Hsia
4915             Jon Gunn
Name: director_name, Length: 4916, dtype: object

In [57]:
# select director name, movie title, and imdb score for the rows labelled 0 through 10
# (.loc slices by label and includes BOTH endpoints), then rank them by imdb score,
# highest first
wanted_columns = ["director_name", "movie_title", "imdb_score"]
movies_df.loc[0:10, wanted_columns].sort_values("imdb_score", ascending=False)

Unnamed: 0,director_name,movie_title,imdb_score
3,Christopher Nolan,The Dark Knight Rises,8.5
0,James Cameron,Avatar,7.9
7,Nathan Greno,Tangled,7.8
8,Joss Whedon,Avengers: Age of Ultron,7.5
9,David Yates,Harry Potter and the Half-Blood Prince,7.5
1,Gore Verbinski,Pirates of the Caribbean: At World's End,7.1
4,Doug Walker,Star Wars: Episode VII - The Force Awakens,7.1
10,Zack Snyder,Batman v Superman: Dawn of Justice,6.9
2,Sam Mendes,Spectre,6.8
5,Andrew Stanton,John Carter,6.6


### `.value_counts()`

In [27]:
# get a count of all the directors
# value_counts tallies how many rows carry each distinct value, most frequent first
directors.value_counts()

Steven Spielberg         26
Woody Allen              22
Clint Eastwood           20
Martin Scorsese          20
Spike Lee                16
                         ..
John Michael McDonagh     1
Anthony Hickox            1
Sam Martin                1
Christian Sesma           1
Dan O'Bannon              1
Name: director_name, Length: 2397, dtype: int64

In [30]:
# size: the total number of elements in the Series (missing values are counted too)

directors.size

4916

In [31]:
# shape: for a Series this is a one-element tuple, because it has a single dimension
directors.shape

(4916,)

In [32]:
# len(): Python's built-in length; for a Series it is the number of rows
len(directors)

4916

How many distinct directors are in the dataset?

In [None]:
# unique()

What are the minimum, maximum, mean, median, and mode of the movie durations?

In [66]:
# Pull the column out once, then report each statistic on its own line.
durations = movies_df["duration"]
print(durations.min())
print(durations.max())
print(durations.mean())
print(durations.median())
print(durations.mode())  # mode() returns a Series, since several values can tie

7.0
511.0
107.0907977963681
103.0
0    90.0
dtype: float64


In [68]:
# describe: the common summary statistics in a single call.
# "count" only tallies non-missing values, so it can be smaller than the number of rows.
movies_df.duration.describe()

count    4901.000000
mean      107.090798
std        25.286015
min         7.000000
25%        93.000000
50%       103.000000
75%       118.000000
max       511.000000
Name: duration, dtype: float64

In [69]:
# Re-display the full frame as a reminder of its columns before filtering;
# pandas truncates the rendering ("..." marks the omitted rows and columns).
movies_df

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4911,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
4912,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
4913,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
4914,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


In [81]:
# Boolean mask: one True/False per row, True where the film was released in or
# before 2000. Rows with a missing title_year compare as False and are excluded.
mask = movies_df.title_year <= 2000

# Indexing the frame with the mask keeps only the True rows; len() counts them.
len(movies_df[mask])

1478

In [92]:
# Build one boolean mask per condition, then combine them element-wise with &.
# (Written inline, each comparison needs its own parentheses because & binds
# more tightly than == and >.)
directed_by_snyder = movies_df.director_name == "Zack Snyder"
scored_above_7 = movies_df.imdb_score > 7
zs_movies = movies_df[directed_by_snyder & scored_above_7]

# Show only the two columns of interest.
zs_movies[["duration", "movie_title"]]


Unnamed: 0,duration,movie_title
15,143.0,Man of Steel
164,215.0,Watchmen
701,117.0,300
1743,110.0,Dawn of the Dead


In [97]:
# Keep the films whose pipe-separated genres string mentions "Horror".
# na=False treats a missing genres value as "no match"; without it a NaN would
# end up in the mask and boolean indexing would raise. regex=False makes the
# search a plain substring test rather than a regular expression.
is_horror = movies_df.genres.str.contains("Horror", na=False, regex=False)
horror_movies = movies_df[is_horror]

# Average number of Facebook likes across the horror films.
horror_movies.movie_facebook_likes.mean()

6411.364312267658