In [38]:
# Use this cell to regroup all your imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import set_config; set_config(display='diagram')

# Load

❓ Load the training dataset in a DataFrame `data` and create your `X` and `y`. Inspect their shape

In [39]:
data = pd.read_csv('../data/AllMoviesDetailsCleaned.csv')
data.head()

Unnamed: 0,id,budget,genres,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,production_companies_number,production_countries_number,spoken_languages_number
0,2,0,Drama|Crime,tt0094675,fi,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,0.823904,Villealfa Filmproduction Oy,Finland,...,69.0,suomi,Released,,Ariel,7.1,40,2,1,2
1,3,0,Drama|Comedy,tt0092149,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",0.47445,Villealfa Filmproduction Oy,Finland,...,76.0,English,Released,,Shadows in Paradise,7.0,32,1,1,3
2,5,4000000,Crime|Comedy,tt0113101,en,Four Rooms,It's Ted the Bellhop's first night on the job....,1.698,Miramax Films,United States of America,...,98.0,English,Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,6.5,485,2,1,1
3,6,0,Action|Thriller|Crime,tt0107286,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",1.32287,Universal Pictures,Japan,...,110.0,English,Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,6.5,69,3,2,1
4,8,42000,Documentary,tt0825671,en,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,0.054716,inLoops,Austria,...,80.0,English,Released,A Megacities remix.,Life in Loops (A Megacities RMX),6.4,4,1,1,5


In [40]:
data['genres'].fillna('NA', inplace = True)

# separate all genres into one list, considering comma + space as separators
genre = data['genres'].str.split('|').tolist()

# flatten the list
flat_genre = [item for sublist in genre for item in sublist]

# convert to a set to make unique
set_genre = set(flat_genre)

# back to list
unique_genre = list(set_genre)
# remove NA
unique_genre.remove('NA')
unique_genre

['Foreign',
 'Mystery',
 'Action',
 'Western',
 'TV Movie',
 'Crime',
 'Romance',
 'Adventure',
 'Animation',
 'Drama',
 'Comedy',
 'Fantasy',
 'History',
 'Horror',
 'Family',
 'Science Fiction',
 'Documentary',
 'Music',
 'Thriller',
 'War']

In [41]:
data = data[data['revenue']!=0]
data = data[data['budget']!=0]

In [42]:
X = data.drop(columns='revenue')
y = data['revenue']

We know 2 main strategies to reduce the number of categorical features post-preprocessing:
- **[Remove](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection)** features that bring too little explanation to our model. This may require statistical analysis of feature importance 
- **[Ordinally encode](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)** (instead of one-hot-encode) categorical features into integers. However this forces a notion of "order" (1>2>3...) that can be detrimental if not set properly!

💡 As a starter, what about simply **removing** all features that have **7 unique values or more**, and one-hot-encode every other? Let's keep ordinal encoding and statistical feature selection for the next iteration of our pipeline.

❓ Store features names to OHE in a list `feat_categorical_small` below. How many features will be OHE?

In [43]:
X.columns

Index(['id', 'budget', 'genres', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'vote_average', 'vote_count',
       'production_companies_number', 'production_countries_number',
       'spoken_languages_number'],
      dtype='object')

In [44]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5986 entries, 2 to 328796
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5986 non-null   int64  
 1   budget                       5986 non-null   int64  
 2   genres                       5986 non-null   object 
 3   imdb_id                      5760 non-null   object 
 4   original_language            5985 non-null   object 
 5   original_title               5986 non-null   object 
 6   overview                     5866 non-null   object 
 7   popularity                   5986 non-null   float64
 8   production_companies         5986 non-null   object 
 9   production_countries         5986 non-null   object 
 10  release_date                 5901 non-null   object 
 11  runtime                      5906 non-null   float64
 12  spoken_languages             5908 non-null   object 
 13  status          

In [45]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30,random_state=6)

🧪 Test your code below (and clear the cell once it passed)

### 1.2 Baseline pipe

#### a) Preprocessing

❓ Let's code the basic preprocessing pipeline described below. Save it under `preproc_baseline`.

For categorical features
- Simple-Impute with most frequent values
- One-Hot-Encode features that have less than 7 unique values to start with
- Drop all others features


As for numerical features
- Simple-Impute with strategy 'mean'
- Min-Max Scale 


<details>
    <summary>ℹ️ Click here for a pro tip</summary>

If you are confident, you can try sklearn's shorter syntax `make_pipeline` or `make_column_transformer` instead of the longer syntax `Pipeline` or `ColumnTransformer` if you want to avoid giving names manually to every steps.
</details>

In [46]:
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge

In [47]:
num_transformer = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
num_col = make_column_selector(dtype_include=['float64','int64'])

preproc_basic = make_column_transformer(
    (num_transformer, ['budget','popularity','runtime',
       'vote_average',
       'vote_count', 'production_companies_number',
       'production_countries_number', 'spoken_languages_number']))

preproc_basic

❓ Look at the **shape** of your preprocessed dataframe and save it to `shape_preproc_baseline` variable

In [48]:
X_train_transformed = preproc_basic.fit_transform(X_train)
X_train_transformed

array([[-0.52353911,  0.57539402, -0.37493582, ..., -0.28304895,
        -0.36571617, -0.45069928],
       [ 0.20893086, -0.02346841,  0.2808931 , ...,  0.17841996,
        -0.36571617,  0.65558659],
       [-0.3719936 , -0.09887149, -0.70285028, ..., -0.28304895,
        -0.36571617, -0.45069928],
       ...,
       [ 1.57284047,  0.33750081,  0.17158828, ...,  0.63988887,
        -0.36571617,  0.65558659],
       [ 2.45685596,  0.37994135,  0.57237262, ...,  1.10135779,
        -0.36571617,  3.97444419],
       [-0.1951905 , -0.30108278, -0.37493582, ..., -0.74451786,
        -0.36571617, -0.45069928]])

In [49]:
pd.DataFrame(X_train_transformed)

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.523539,0.575394,-0.374936,1.040610,1.185800,-0.283049,-0.365716,-0.450699
1,0.208931,-0.023468,0.280893,0.616405,-0.030641,0.178420,-0.365716,0.655587
2,-0.371994,-0.098871,-0.702850,-0.171403,-0.282125,-0.283049,-0.365716,-0.450699
3,-0.698827,0.161924,-0.994330,0.192201,0.354038,-0.283049,-0.365716,-0.450699
4,-0.384622,0.333801,-0.520676,1.040610,0.668859,0.178420,0.900689,-0.450699
...,...,...,...,...,...,...,...,...
4185,-0.548797,0.795540,-0.083456,1.283012,2.242967,-0.283049,-0.365716,2.868158
4186,-0.697842,-0.446921,-0.484241,-3.625639,-0.540130,-0.744518,-0.365716,0.655587
4187,1.572840,0.337501,0.171588,-0.292604,0.364283,0.639889,-0.365716,0.655587
4188,2.456856,0.379941,0.572373,0.131600,1.504347,1.101358,-0.365716,3.974444


🧪 Test your code below

#### b) Add estimator

In [50]:
from sklearn.linear_model import LinearRegression

In [51]:
pipe_baseline = make_pipeline(preproc_basic, LinearRegression())
pipe_baseline

#### c) Cross-Validate

In [52]:
from sklearn.model_selection import cross_val_score

# Cross-validate Pipeline
score_baseline = cross_val_score(pipe_baseline, X_train, y_train, cv=5, scoring='r2').mean()
score_baseline

0.7166437798170513

In [215]:
# from sklearn.model_selection import GridSearchCV
# grid_search = GridSearchCV(
#     pipe_baseline, 
#     tree_para = {'criterion':['gini','entropy', 'log_loss'],'max_depth':[4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]},
#     cv=5,
#     scoring="rmsle_neg")

# grid_search.fit(X, y)

# grid_search.best_params_

❓5-fold cross-validate your `pipe_baseline` using this metric to get a first glance at your baseline performance.    

Store your mean score as `score_baseline`

#### d) Predict baseline

## 🏋️‍♀️ 2. ITERATIONS 

**Estimators**

- **Tree-based ensembles (must try today)**: Probably the best suited for problems with many categorical features
- Stacking !
- XGBoost !

**Preprocessing** (once your first ensemble model works)

- Ordinal Encoding of categorical features with a hidden notion of order in their values (e.g. "bad", "average", good")
- Statistical Feature Selection to remove useless features (avoid overfitting and reduce train time)
- Predict `log(SalePrice)` instead?
- ...

### 2.1 Preprocessing Iteration ♲ 

In [138]:
from sklearn.base import TransformerMixin, BaseEstimator

class GenreTranformer(TransformerMixin, BaseEstimator): 
    # BaseEstimator generates the get_params() and set_params() methods that all Pipelines require
    # TransformerMixin creates the fit_transform() method from fit() and transform()
    unique_genre_list = ['Foreign',
                 'Mystery',
                 'Action',
                 'Western',
                 'TV Movie',
                 'Crime',
                 'Romance',
                 'Adventure',
                 'Animation',
                 'Drama',
                 'Comedy',
                 'Fantasy',
                 'History',
                 'Horror',
                 'Family',
                 'Science Fiction',
                 'Documentary',
                 'Music',
                 'Thriller',
                 'War']
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self   
    
    def transform(self, X, y=None):
        genre_transformed = [] 
        for genres in X:
            genre_np = np.zeros((20,), dtype=int)
            for target_genre in genres[0].split('|'):
                if target_genre in unique_genre_list:
                        index = unique_genre_list.index(target_genre)
                        genre_np[index] = 1
            genre_transformed.append(list(genre_np))
        return np.array(genre_transformed)

In [139]:
num_transformer = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
num_col = make_column_selector(dtype_include=['float64','int64'])

genre_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), GenreTranformer())

preproc_genre = make_column_transformer(
    (num_transformer, ['budget','popularity','runtime',
       'vote_average',
       'vote_count', 'production_companies_number',
       'production_countries_number', 'spoken_languages_number']),(
genre_transformer,['genres']))

pipe_genre = make_pipeline(preproc_genre, LinearRegression())
pipe_genre

In [140]:
from sklearn.model_selection import cross_val_score

# Cross-validate Pipeline
score_genre = cross_val_score(pipe_genre, X_train, y_train, cv=5, scoring='r2').mean()
score_genre

0.7259675431822937

#### a) Ordinal Encoding (1h)

❓ Look at the following feature below. Couldn't it be encoded numerically in a wise manner?
```
ExterQual: Evaluates the quality of the material on the exterior 
		
       Ex	Excellent
       Gd	Good
       TA	Average/Typical
       Fa	Fair
       Po	Poor
```

💡 Luckily, the `OrdinalEncoder` and its argument `categories`  allows us to do just that. Check it out below and make sure to understand how this works

In [222]:
# Define specific order for features
# Note: if you change this order, it will change the output for .transform()
feature_A_sorted_values = ['bad', 'average', 'good'] 
feature_B_sorted_values = ['dirty', 'clean', 'new']

encoder = OrdinalEncoder(
    categories=[
        feature_A_sorted_values,
        feature_B_sorted_values
    ],
    handle_unknown="use_encoded_value",
    unknown_value=-1
)

# Just some random training data
XX = [
    ['good', 'dirty'],
    ['bad', 'new'],
    ['average', 'clean'],
]

encoder.fit(XX)

encoder.transform([
        ['bad', "dirty"],
        ["average", "clean"],
        ['good', 'new'],
        ['bad', 'oooops never seen this label before']
])

array([[ 0.,  0.],
       [ 1.,  1.],
       [ 2.,  2.],
       [ 0., -1.]])

---
❓ **Your turn**: split your categorical preprocessor into

- `preproc_ordinal` to ordinally encode **some features** of your choice
- `preproc_nominal` to one-hot encode the other ones


<details>
    <summary>Hints</summary>

- You won't be able to avoid hard-coding names and ordered values of features! Be tidy!
- It's a good practice to sort alphabetically your features to avoid bad surprises
</details>

In [223]:
from sklearn.preprocessing import OrdinalEncoder
feat_ordinal_dict = {
    # considers "missing" as "neutral"
    "BsmtCond": ['missing', 'Po', 'Fa', 'TA', 'Gd'],
    "BsmtExposure": ['missing', 'No', 'Mn', 'Av', 'Gd'],
    "BsmtFinType1": ['missing', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    "BsmtFinType2": ['missing', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    "BsmtQual": ['missing', 'Fa', 'TA', 'Gd', 'Ex'],
    "Electrical": ['missing', 'Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'],
    "ExterCond": ['missing', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    "ExterQual": ['missing', 'Fa', 'TA', 'Gd', 'Ex'],
    "Fence": ['missing', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
    "FireplaceQu": ['missing', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    "Functional": ['missing', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'],
    "GarageCond": ['missing', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    "GarageFinish": ['missing', 'Unf', 'RFn', 'Fin'],
    "GarageQual": ['missing', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    "HeatingQC": ['missing', 'Po', 'Fa', 'TA', 'Gd', 'Ex'],
    "KitchenQual": ['missing', 'Fa', 'TA', 'Gd', 'Ex'],
    "LandContour": ['missing', 'Low', 'Bnk', 'HLS', 'Lvl'],
    "LandSlope": ['missing', 'Sev', 'Mod', 'Gtl'],
    "LotShape": ['missing', 'IR3', 'IR2', 'IR1', 'Reg'],
    "PavedDrive": ['missing', 'N', 'P', 'Y'],
    "PoolQC": ['missing', 'Fa', 'Gd', 'Ex']
}

feat_ordinal = sorted(feat_ordinal_dict.keys()) # sort alphabetically
feat_ordinal_values_sorted = [feat_ordinal_dict[i] for i in feat_ordinal]

encoder_ordinal = OrdinalEncoder(
    categories=feat_ordinal_values_sorted,
    dtype= np.int64,
    handle_unknown="use_encoded_value",
    unknown_value=-1 # Considers unknown values as worse than "missing"
)

preproc_ordinal = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    encoder_ordinal,
    MinMaxScaler()
)

preproc_ordinal



In [224]:
feat_ordinal

['BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'Electrical',
 'ExterCond',
 'ExterQual',
 'Fence',
 'FireplaceQu',
 'Functional',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'HeatingQC',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotShape',
 'PavedDrive',
 'PoolQC']

In [225]:
feat_nominal = list(set(feat_categorical_small) - set(feat_ordinal))
feat_nominal

['LotConfig',
 'SaleCondition',
 'Heating',
 'CentralAir',
 'BldgType',
 'MasVnrType',
 'MiscFeature',
 'Foundation',
 'GarageType',
 'Utilities',
 'MSZoning',
 'RoofStyle',
 'Alley',
 'Street']

In [226]:

preproc_nominal = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(sparse=False,handle_unknown='ignore'))


In [227]:
num_transformer = make_pipeline(SimpleImputer(strategy='mean'), MinMaxScaler())
num_col = make_column_selector(dtype_include=['float64','int64'])

prepoc_nominal = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(sparse=False,handle_unknown='ignore'))

preproc_combined = make_column_transformer(
    (num_transformer, num_col),
    (preproc_ordinal, feat_ordinal),
    (prepoc_nominal, feat_nominal)
)

preproc_combined

#### b) Statistical Feature Selection (~30min)

Our goal is to remove the least interesting features, to limit overfitting and shorten training time.  

🔥 We will make use of sklearn's [feature selection](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_selection) transformers directly in your pipeline!

❗️ We recommend you to **try only Option 1 today to start with**. Option 2 and 3 will be corrected in Recap

##### (Option 1 - recommended) <font color=green>Univariate</font> feature selection based on their mutual information with target `y`

- Feel free to add a `SelectPercentile` filter at the end of your `preproc` pipeline.
- This will filter-out features that, - taken individually - least explain your target!
- The statistical test we recommend to pass to SelectPercentile is the `mutual_info_regression`

<details>
    <summary markdown='span'>🤔 What is mutual information? Click here!</summary>

- Mutual information is a *statistical* distance between two probability distributions.
- Correlation is a *linear* distance between two random variables.
- Mutual information is more general and measures the reduction of uncertainty in Y after observing X.
- On the other hand, if you already know you are working with variables that are smooth (like continuous numerical variables), sometimes correlation may tell you more about them, for instance if their relationship is monotonic.

See [animation](https://twitter.com/ari_seff/status/1409296508634152964)
</details>

In [248]:
from sklearn.feature_selection import mutual_info_regression
# test_feature = mutual_info_regression(X.select_dtypes(include=['float64','int64']), y)
test_feature = mutual_info_regression(X[['MasVnrArea']], y)
test_feature

ValueError: Input X contains NaN.

##### (option 2) <font color=green>Multivariate</font> feature selection based their combined relationship with target `y`

🤔 We want to remove features that, when in combination with all the others, do not really help predict our target.

1️⃣ To do so, remember that we can use feature [`permutation_importance`](https://scikit-learn.org/stable/modules/permutation_importance.html) metric in combination with an estimator! It trains one pipe per feature, so as to estimate which feature makes our performance score *decrease* the most when shuffling it randomly. These would be our most important features, which we don't want to remove. 

The best thing is, scikit-learn allows you to integrate this methodology directly into your `preproc` pipeline thanks to the [`SequentialFeatureSelector`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html) transformer: this will recursively remove least important features according to the `cross_val_score`.

However, this process can take extremely long to train when you have many features.

2️⃣ Alternatively, a faster way would be to make use of models that already output some measure of feature_importance when fitting them. For instance, Trees with gini-based `feature_importance_`, or Lasso regressions with L1 `coef_`. Again here, scikit-learn already has the [`SelectFromModel`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html) transformer to do just that.

In [229]:
# YOUR CODE HERE

##### (option 3) <font color=green>Unsupervised</font> selection: Filter based only on the properties of `X`? 

❓ A quick win is to remove features with the lowest variance. Think about it: a feature which only has one value is useless (and has a variance of 0).  
- Feel free to add a [`VarianceThreshold`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html) to the end of your pipeline

In [230]:
# YOUR CODE HERE

❓ Additionally, we can check for correlation between our **numerical features** only

- Use [Pearson's correlation](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient) combined with a heatmap to check visually whether some **numerical** features almost entirely correlate with others. 
- Use `VIF` from statsmodels to check for features that have the highest multicollinearity

In [231]:
# YOUR CODE HERE

❓ For **ordinal features**, we can use [Spearman's rank correlation](https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient) instead to check whether some **ordinally encoded** features are almost entirely "ordered" similarly to others. Feel free to plot a heatmap again

In [232]:
# YOUR CODE HERE

❓ Now, feel free to create a "filter" in your pipeline that removes any feature you want beyond a given (Spearman + Pearson) correlation threshold. You'll need a custom transformer class

In [233]:
# YOUR CODE HERE

#### e) Target engineering (15 min)

❓ We are asked to minimize the RMS**L**E. Why don't we transform our target to directly predict its `log`?
- Check out the histogram of the target `y`.
- Normally distributed variables should be easier to predict with linear or parametric models. 
- Create `y_log` and your new performance metrics
- Don't forget to take the exponent of your predictions at the end!

In [234]:
y_transform = np.log2(y)
y_transform

0       17.669688
1       17.469610
2       17.769915
3       17.095067
4       17.931569
          ...    
1455    17.416995
1456    17.680030
1457    18.023776
1458    17.116801
1459    17.170355
Name: SalePrice, Length: 1460, dtype: float64

### 2.2 Model Iteration ♻

#### a ) Final version of the preproc pipeline
❓ We advise you to start with a fresh definition below so you can quickly update it as need be.

In [235]:
from xgboost import XGBRegressor
xgb_reg = XGBRegressor(max_depth=6, n_estimators=300, learning_rate=0.05)

pipe_updated = make_pipeline(preproc_combined, xgb_reg)
pipe_updated

In [236]:
cv_score = cross_val_score(pipe_updated, X, data['SalePrice'], cv=5, scoring=rmsle)
score_updated = cv_score.mean()
score_updated

0.13063435636992934

In [237]:
y_pred_final=pipe_updated.fit(X,data['SalePrice']).predict(test_data.drop(columns="Id"))

y_pred_final

array([123245.91, 157634.97, 178806.44, ..., 170818.48, 114780.  ,
       216121.89], dtype=float32)

In [238]:
data_to_df_final = [test_data['Id'],y_pred_final]

In [239]:
pd.DataFrame({"Id":test_data.Id,"SalePrice":y_pred_final}).to_csv('./data/submission_final_new1.csv',index=False)

# 🏅FINAL SUBMISSION (submit at least 30 min before Recap)

Discover your real test score by submitting on Kaggle! 

👉 Write down your test score on the [result spreadsheet here](https://docs.google.com/spreadsheets/d/1ZEBKwa_k1Ytb0WCOh-Nopq3eaezwBNu1SAqKXEXRguc/edit#gid=0) (pick the correct batch!)

In [None]:
# YOUR CODE HERE