In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Netflix Data

In [72]:
import pandas as pd

# Data source (Kaggle):
# https://www.kaggle.com/datasets/victorsoeiro/netflix-tv-shows-and-movies
# "titles.csv" is a relative path, so the file must sit in the working directory.
df = pd.read_csv("titles.csv")
# A fixed random_state makes the 5-row preview reproducible between runs.
print(df.sample(5, random_state=0))
# (number of rows, number of columns)
print(df.shape)

#             id                  title   type                                        description  release_year age_certification  runtime                                             genres production_countries  seasons     imdb_id  imdb_score  imdb_votes  tmdb_popularity  tmdb_score
# 1519   ts38761        Another Miss Oh   SHOW  Two women working in the same industry with th...          2016             TV-MA       69          ['drama', 'fantasy', 'romance', 'comedy']               ['KR']      1.0   tt5679572         7.9      1769.0           22.672         8.2
# 4942  ts225657                Halston   SHOW  American fashion designer Halston skyrockets t...          2021             TV-MA       47                                          ['drama']               ['US']      1.0  tt10920514         7.5     14040.0           21.349         7.3
# 895    tm34646             Sisterakas  MOVIE  A man takes revenge on his sister by hiring he...          2012               NaN      110                                ['drama', 'comedy']               ['PH']      NaN   tt2590214         5.2       286.0            2.552         4.9
# 5426  ts301609  Love Is Blind: Brazil   SHOW  The dating experiment comes to Brazil as local...          2021             TV-MA       56                             ['romance', 'reality']               ['BR']      1.0  tt15018224         6.1       425.0            5.109         6.4
# 2033   ts56038         Dave Chappelle   SHOW  Comedy icon Dave Chappelle makes his triumphan...          2017               NaN       60                        ['comedy', 'documentation']               ['US']      1.0   tt6963504         8.7      2753.0            2.962         7.6
# (5806, 15)

            id                  title   type  \
1519   ts38761        Another Miss Oh   SHOW   
4942  ts225657                Halston   SHOW   
895    tm34646             Sisterakas  MOVIE   
5426  ts301609  Love Is Blind: Brazil   SHOW   
2033   ts56038         Dave Chappelle   SHOW   

                                            description  release_year  \
1519  Two women working in the same industry with th...          2016   
4942  American fashion designer Halston skyrockets t...          2021   
895   A man takes revenge on his sister by hiring he...          2012   
5426  The dating experiment comes to Brazil as local...          2021   
2033  Comedy icon Dave Chappelle makes his triumphan...          2017   

     age_certification  runtime                                     genres  \
1519             TV-MA       69  ['drama', 'fantasy', 'romance', 'comedy']   
4942             TV-MA       47                                  ['drama']   
895                NaN      110       

# Anti-Pattern #1: Mutating instead of chaining

In [73]:
import pandas as pd

df = pd.read_csv("titles.csv")

# Mutation - DON'T DO THIS
# Each statement rebinds or mutates `df_bad`, so every intermediate state is
# overwritten and the steps cannot be read as a single pipeline.
df_bad = df.query("runtime > 30 & type == 'SHOW'")
# Adding a column to the filtered frame is the line pandas reports in the
# SettingWithCopyWarning shown in this cell's output.
df_bad["score"] = df_bad[["imdb_score", "tmdb_score"]].sum(axis=1)
df_bad = df_bad[["seasons", "score"]]
df_bad = df_bad.groupby("seasons").agg(["count", "mean"])
df_bad = df_bad.droplevel(axis=1, level=0)
df_bad = df_bad.query("count > 10")

# Chaining - DO THIS
# The same six steps as one expression: no intermediate names, nothing is
# mutated, and `df` itself is left untouched.
# fmt: off
df_good = (df
    .query("runtime > 30 & type == 'SHOW'")
    .assign(score=lambda df_: df_[["imdb_score", "tmdb_score"]].sum(axis=1))
    [["seasons", "score"]]
    .groupby("seasons")
    .agg(["count", "mean"])
    .droplevel(axis=1, level=0)
    .query("count > 10")
)
# fmt: on

print(df_good)
# assert_frame_equal returns None on success and raises AssertionError on a
# mismatch, so it is called for its effect; printing it only emits "None".
pd.testing.assert_frame_equal(df_bad, df_good)

# returns:
#          count       mean
# seasons
# 1.0        835  13.064671
# 2.0        189  14.109524
# 3.0         83  14.618072
# 4.0         41  14.887805
# 5.0         38  15.242105
# 6.0         16  15.962500

         count       mean
seasons                  
1.0        835  13.064671
2.0        189  14.109524
3.0         83  14.618072
4.0         41  14.887805
5.0         38  15.242105
6.0         16  15.962500
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_bad["score"] = df_bad[["imdb_score", "tmdb_score"]].sum(axis=1)


In [75]:
import pandas as pd

# Reload the raw titles so `df` has its original `production_countries` column.
df = pd.read_csv("titles.csv")


def split_production_countries(df_):
    """Replace `production_countries` with three single-country columns.

    Each cell of `production_countries` holds the text of a Python list of
    country codes (e.g. "['CA', 'JP', 'US']"). The first three entries are
    spread over `prod_country1`, `prod_country2` and `prod_country3`; rows with
    fewer than three countries get missing values in the remaining columns and
    any countries beyond the third are dropped.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame containing a `production_countries` column of list literals.

    Returns
    -------
    pandas.DataFrame
        A new frame without `production_countries` and with the three
        `prod_country*` columns appended. `df_` itself is not modified.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    import ast  # local import so the helper does not rely on another cell

    # literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute arbitrary code the way `eval` would.
    parsed = df_["production_countries"].apply(ast.literal_eval)

    # Build the helper frame on the caller's index; otherwise `join` aligns on
    # a fresh 0..n-1 index and mismatches rows of a filtered/re-indexed frame.
    dfc = pd.DataFrame(parsed.to_list(), index=df_.index)

    # Force exactly three columns: truncates longer lists and pads with missing
    # values when no row has three countries (the rename below needs 3 columns).
    dfc = dfc.reindex(columns=range(3))
    dfc.columns = ["prod_country1", "prod_country2", "prod_country3"]
    return df_.drop("production_countries", axis=1).join(dfc)

# `df.pipe(f)` calls `f(df)`, which lets a custom function sit inside a
# method chain like any built-in DataFrame method.
df_pipe = df.pipe(split_production_countries)

# Before: the raw column holds the text of a list of country codes.
print(df["production_countries"].sample(5, random_state=14))
# returns:
# 3052    ['CA', 'JP', 'US']
# 1962                ['US']
# 2229                ['GB']
# 2151          ['KH', 'US']
# 3623                ['ES']

# After: same random_state, so the same five rows; the last three columns are
# the new prod_country1..3 columns.
print(df_pipe.sample(5, random_state=14).iloc[:, -3:])
# returns:
#      prod_country1 prod_country2 prod_country3
# 3052            CA            JP            US
# 1962            US          None          None
# 2229            GB          None          None
# 2151            KH            US          None
# 3623            ES          None          None

3052    ['CA', 'JP', 'US']
1962                ['US']
2229                ['GB']
2151          ['KH', 'US']
3623                ['ES']
Name: production_countries, dtype: object
     prod_country1 prod_country2 prod_country3
3052            CA            JP            US
1962            US          None          None
2229            GB          None          None
2151            KH            US          None
3623            ES          None          None


# Anti-Pattern #2: Manipulating pandas dataframes with for loops

In [115]:
import pandas as pd

# Reload and split the country lists so `prod_country1` is available.
df = pd.read_csv("titles.csv").pipe(split_production_countries)

# obtain country ranks
# value_counts() orders countries from most to least frequent, so slicing its
# index gives the N most common primary production countries.
vcs = df["prod_country1"].value_counts()
top3 = vcs.index[:3]
top10 = vcs.index[:10]
top20 = vcs.index[:20]

# Looping - DON'T DO THIS
# Row-by-row Python loop that collects one label per row into `vals`.
# The branch order matters: top3 is contained in top10, which is contained in
# top20, so the narrowest tier has to be tested first.
vals = []
for ind, row in df.iterrows():
    country = row["prod_country1"]
    if country in top3:
        vals.append("top3")
    elif country in top10:
        vals.append("top10")
    elif country in top20:
        vals.append("top20")
    else:
        vals.append("other")
# In-place column assignment; `vals` has one entry per row of `df`.
df["prod_country_rank"] = vals

# df[col].apply() - DO THIS
def get_prod_country_rank(country):
    """Return the popularity tier label for one production country.

    Looks `country` up in the notebook-level `top3`, `top10` and `top20`
    collections and returns the label of the first one that contains it,
    or "other" when none of them does.
    """
    # Ordered narrowest to widest so a country that belongs to several tiers
    # is labelled with the narrowest one.
    tiers = (("top3", top3), ("top10", top10), ("top20", top20))
    for label, members in tiers:
        if country in members:
            return label
    return "other"

# Element-wise apply over a single column; this overwrites the
# `prod_country_rank` column produced by the iterrows loop earlier in the cell.
df["prod_country_rank"] = df["prod_country1"].apply(get_prod_country_rank)
# Preview the last four columns: prod_country1..3 plus the new rank column.
print(df.sample(5, random_state=14).iloc[:, -4:])
# returns:
#      prod_country1 prod_country2 prod_country3 prod_country_rank
# 3052            CA            JP            US             top10
# 1962            US          None          None              top3
# 2229            GB          None          None              top3
# 2151            KH            US          None             other
# 3623            ES          None          None             top10

     prod_country1 prod_country2 prod_country3 prod_country_rank
3052            CA            JP            US             top10
1962            US          None          None              top3
2229            GB          None          None              top3
2151            KH            US          None             other
3623            ES          None          None             top10


## Looping: `df.iterrows()`

In [77]:
import pandas as pd

# Fresh load + country split, so `df` has no `prod_country_rank` column yet.
df = pd.read_csv("titles.csv").pipe(split_production_countries)

In [80]:
%%time

# Countries ordered from most to least frequent; the slices below are the
# 3 / 10 / 20 most common primary production countries.
vcs = df["prod_country1"].value_counts()
top3 = vcs.index[:3]
top10 = vcs.index[:10]
top20 = vcs.index[:20]

# Timed approach: explicit Python loop over rows, one label appended per row.
# Narrowest tier is checked first because each tier contains the previous one.
vals = []
for ind, row in df.iterrows():
    country = row["prod_country1"]
    if country in top3:
        vals.append("top3")
    elif country in top10:
        vals.append("top10")
    elif country in top20:
        vals.append("top20")
    else:
        vals.append("other")
# `assign` returns a new frame with the extra column; `df` is rebound to it.
df = df.assign(prod_country_rank=vals)
# Fixed random_state so every approach in this notebook previews the same rows.
print(df["prod_country_rank"].sample(5, random_state=0))

# 1519    top10
# 4942     top3
# 895     top20
# 5426    top10
# 2033     top3
# Name: prod_country_rank, dtype: object
# CPU times: total: 141 ms
# Wall time: 139 ms

1519    top10
4942     top3
895     top20
5426    top10
2033     top3
Name: prod_rank, dtype: object
CPU times: total: 141 ms
Wall time: 139 ms


## `df[col].apply()`

In [None]:
import pandas as pd

# Fresh load + country split, so `df` has no `prod_country_rank` column yet.
df = pd.read_csv("titles.csv").pipe(split_production_countries)

In [81]:
%%time

# Countries ordered from most to least frequent. The three slices are read as
# globals by `get_prod_country_rank` defined just below.
vcs = df["prod_country1"].value_counts()
top3 = vcs.index[:3]
top10 = vcs.index[:10]
top20 = vcs.index[:20]

def get_prod_country_rank(country):
    """Map one production country to its popularity tier label.

    Checks the notebook-level `top3`, `top10` and `top20` collections in that
    order and returns the label of the first one containing `country`;
    returns "other" when the country is in none of them.
    """
    # Guard clauses, narrowest tier first: each tier contains the previous one.
    if country in top3:
        return "top3"
    if country in top10:
        return "top10"
    if country in top20:
        return "top20"
    return "other"

# Timed approach: element-wise `apply` on one column. The lambda receives the
# frame being assigned to, and `df` is rebound to the new frame.
df = df.assign(prod_country_rank=lambda df_: df_["prod_country1"].apply(get_prod_country_rank))
# Fixed random_state so every approach in this notebook previews the same rows.
print(df["prod_country_rank"].sample(5, random_state=0))

# 1519    top10
# 4942     top3
# 895     top20
# 5426    top10
# 2033     top3
# Name: prod_country_rank, dtype: object
# CPU times: total: 15.6 ms
# Wall time: 12 ms

1519    top10
4942     top3
895     top20
5426    top10
2033     top3
Name: prod_rank, dtype: object
CPU times: total: 15.6 ms
Wall time: 12 ms


## `np.select()`

In [82]:
import pandas as pd

# Fresh load + country split, so `df` has no `prod_country_rank` column yet.
df = pd.read_csv("titles.csv").pipe(split_production_countries)

In [83]:
%%time

def get_prod_country_rank(df_):
    """Label every row of `df_` with the popularity tier of `prod_country1`.

    Countries are ranked by how often they occur in `df_["prod_country1"]`.
    Rows whose country is among the 3 / 10 / 20 most frequent get "top3" /
    "top10" / "top20"; every other row (including missing countries) gets
    "other".

    Returns a NumPy array with one label per row, in row order.
    """
    countries = df_["prod_country1"]
    by_frequency = countries.value_counts().index

    # Insertion order is narrowest to widest; np.select takes the first
    # matching condition, so overlapping tiers resolve to the narrowest label.
    tier_sizes = {"top3": 3, "top10": 10, "top20": 20}
    membership = [
        countries.isin(by_frequency[:size]) for size in tier_sizes.values()
    ]
    return np.select(membership, list(tier_sizes), default="other")

# `assign` calls a callable value with the frame being assigned to, so the
# helper is handed over directly instead of being wrapped in a lambda.
df = df.assign(prod_country_rank=get_prod_country_rank)

# Fixed random_state so every approach in this notebook previews the same rows.
rank_preview = df["prod_country_rank"].sample(5, random_state=0)
print(rank_preview)

# 1519    top10
# 4942     top3
# 895     top20
# 5426    top10
# 2033     top3
# Name: prod_country_rank, dtype: object
# CPU times: total: 0 ns
# Wall time: 5.95 ms

1519    top10
4942     top3
895     top20
5426    top10
2033     top3
Name: prod_rank, dtype: object
CPU times: total: 0 ns
Wall time: 5.95 ms


## `np.where`

In [99]:
import pandas as pd

# Fresh load + country split, so `df` has no `adjusted_score` column yet.
df = pd.read_csv("titles.csv").pipe(split_production_countries)

In [106]:
# np.where(condition, value_if_true, value_if_false), evaluated element-wise:
# titles released after 2016 get their IMDb score lowered by one point, all
# other titles keep their IMDb score unchanged.
df = df.assign(
    adjusted_score=lambda df_: np.where(
        df_["release_year"] > 2016,
        df_["imdb_score"] - 1,
        df_["imdb_score"],
    )
)

In [107]:
# Show the frame to inspect the new `adjusted_score` column (last column).
df

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,prod_country1,prod_country2,prod_country3,adjusted_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,48,['documentation'],1.0,,,,0.600,,US,,,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,113,"['crime', 'drama']",,tt0075314,8.3,795222.0,27.612,8.2,US,,,8.3
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['comedy', 'fantasy']",,tt0071853,8.2,530877.0,18.216,7.8,GB,,,8.2
3,tm70993,Life of Brian,MOVIE,"Brian Cohen is an average young Jewish man, bu...",1979,R,94,['comedy'],,tt0079470,8.0,392419.0,17.505,7.8,GB,,,8.0
4,tm190788,The Exorcist,MOVIE,12-year-old Regan MacNeil begins to adapt an e...,1973,R,133,['horror'],,tt0070047,8.1,391942.0,95.337,7.7,US,,,8.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5801,tm1014599,Fine Wine,MOVIE,A beautiful love story that can happen between...,2021,,100,"['romance', 'drama']",,tt13857480,6.9,39.0,0.966,,NG,,,5.9
5802,tm1108171,Edis Starlight,MOVIE,Rising star Edis's career journey with ups and...,2021,,74,"['music', 'documentation']",,,,,1.036,8.5,,,,
5803,tm1045018,Clash,MOVIE,A man from Nigeria returns to his family in Ca...,2021,,88,"['family', 'drama']",,tt14620732,6.5,32.0,0.709,,NG,CA,,5.5
5804,tm1098060,Shadow Parties,MOVIE,A family faces destruction in a long-running c...,2021,,116,"['action', 'thriller']",,tt10168094,6.2,9.0,2.186,,,,,5.2
