In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import os

from pathlib import Path
from src.data import make_dataset
from src.visualization import visualize 
from src.features import build_features

In [2]:
# Resolve the project root as the parent of the notebook's working directory
# and derive the data sub-folders from it.
project_dir = Path(os.getcwd()).parent.absolute()
data_dir = project_dir / "data"
raw_dir, external_dir, processed_dir = (
    data_dir / subfolder for subfolder in ("raw", "external", "processed")
)

# Create the folders on first run. data_dir is listed first because it is the
# parent of the other three and mkdir is called without parents=True.
for folder in (data_dir, raw_dir, external_dir, processed_dir):
    folder.mkdir(exist_ok=True)

In [3]:
# Parameters
# Input locations: the raw file named by make_dataset.NETFLIX_DATA and the
# processed training split.
raw_data = raw_dir.joinpath(make_dataset.NETFLIX_DATA)
train_data = processed_dir.joinpath('netflix_views_raw_train.pq')


In [4]:
# Load the processed training split; the bare `df` renders a preview
# (pandas truncates long frames in the notebook display).
df = pd.read_parquet(train_data)
df

Unnamed: 0,title,available_globally,release_date,hours_viewed
12832,Crocodiles – The Private Life of Primeaval Rep...,No,,200000
2173,Peter Rabbit: Season 1,No,,9300000
7776,The Amazing Race: Season 5,No,,1100000
6869,Baaghi // बागी,No,,1400000
12768,"Caution, Hazardous Wife: シーズン1 // 奥様は、取り扱い注意: ...",No,,200000
...,...,...,...,...
11290,Top of the Lake: Season 1,No,,400000
11970,Mommy Issues,Yes,,300000
5396,Whisper of the Heart // 耳をすませば,No,,2500000
866,Black Mirror: Season 4,Yes,2017-12-29 00:00:00,24200000


In [5]:
# Build the (not yet fitted) pipeline from the project's build_features module.
pipe = build_features.get_pipeline()
pipe  # bare expression so the notebook renders the pipeline's repr

In [6]:
# Fit the pipeline on the training frame and transform it in one call.
data_transformed = pipe.fit_transform(df)

In [7]:
# Wrap the pipeline output in a DataFrame via the project helper.
# NOTE(review): later cells read the column 'target_scale__hours_viewed' from
# this frame, so the helper presumably names columns "<transformer>__<column>";
# confirm in build_features.
df_transformed = build_features.output_to_df(pipe, data_transformed)

In [13]:
# Histogram of the raw hours_viewed column, before any scaling.
visualize.col_histogram(df, 'hours_viewed')

plotly.graph_objs._figure.Figure

The target (`hours_viewed`) is heavily right-skewed, with a very long tail.

In [9]:
# Histogram of the same target after the pipeline's "target_scale" step, for
# comparison with the raw distribution.
visualize.col_histogram(df_transformed, 'target_scale__hours_viewed')

In [10]:
# Round-trip check: invert the fitted "target_scale" transformer and compare
# the recovered values with the raw target.
target_scale = pipe.named_transformers_["target_scale"]
untransformed = build_features.output_to_df(target_scale, target_scale.inverse_transform(
    df_transformed.rename(columns={"target_scale__hours_viewed": "hours_viewed"})
))

# Compare by row position, not by index label. Series subtraction aligns on
# the index, so if `df` and `untransformed` carry different indexes (e.g. a
# shuffled train-split index vs. a fresh RangeIndex) the difference would pair
# up unrelated rows. Resetting both sides makes the comparison independent of
# whatever index output_to_df produces.
# NOTE(review): assumes the pipeline keeps rows in input order (no filtering
# or shuffling) - confirm in build_features.
assert len(untransformed) == len(df), "inverse transform must return one row per input row"
round_trip_error = (
    df.hours_viewed.reset_index(drop=True)
    - untransformed.hours_viewed.reset_index(drop=True)
)
round_trip_error.describe()

count             14752.0
mean        460601.952278
std       27707842.342493
min     -778299999.999996
25%            -1700000.0
50%              100000.0
75%             2400000.0
max           812000000.0
Name: hours_viewed, dtype: double[pyarrow]

In [11]:
# Per-row round-trip error of the target scaler, stored on `df` for plotting.
# Both operands have their index reset so the subtraction is positional, and
# the result is assigned through `.array` (no index) so the column is also
# written by position. Assigning a Series would re-align it on `df`'s index,
# which puts values on the wrong rows (or NaN) when that index is not a
# default RangeIndex.
# NOTE(review): assumes `untransformed` rows are in the same order as `df`.
target_scale_error = (
    df.hours_viewed.reset_index(drop=True)
    - untransformed.hours_viewed.reset_index(drop=True)
)
# Raises ValueError on a length mismatch instead of silently padding with NaN.
df['target_scale_error'] = target_scale_error.array
visualize.col_histogram(df, 'target_scale_error')