In [1]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


In [2]:
_ = pd.read_csv("/kaggle/input/playground-series-s5e4/sample_submission.csv", index_col="id")
_

Unnamed: 0_level_0,Listening_Time_minutes
id,Unnamed: 1_level_1
750000,45.437
750001,45.437
750002,45.437
750003,45.437
750004,45.437
...,...
999995,45.437
999996,45.437
999997,45.437
999998,45.437


In [3]:
train_df = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv", index_col="id")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e4/test.csv", index_col="id")
train_df.head()

Unnamed: 0_level_0,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 750000 non-null  object 
 1   Episode_Title                750000 non-null  object 
 2   Episode_Length_minutes       662907 non-null  float64
 3   Genre                        750000 non-null  object 
 4   Host_Popularity_percentage   750000 non-null  float64
 5   Publication_Day              750000 non-null  object 
 6   Publication_Time             750000 non-null  object 
 7   Guest_Popularity_percentage  603970 non-null  float64
 8   Number_of_Ads                749999 non-null  float64
 9   Episode_Sentiment            750000 non-null  object 
 10  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), object(6)
memory usage: 68.7+ MB


In [5]:
test_df.describe()

Unnamed: 0,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads
count,221264.0,250000.0,201168.0,250000.0
mean,419.2987,59.716491,52.192796,1.355852
std,166854.5,22.880028,28.445034,4.274399
min,2.47,2.49,0.0,0.0
25%,35.78,39.25,28.32,0.0
50%,63.97,59.9,53.36,1.0
75%,94.15,79.39,76.56,2.0
max,78486260.0,117.76,116.82,2063.0


In [6]:
print(train_df.shape)
print(test_df.shape)

(750000, 11)
(250000, 10)


In [7]:
train_df.isnull().sum()

Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

In [8]:
test_df.isnull().sum()

Podcast_Name                       0
Episode_Title                      0
Episode_Length_minutes         28736
Genre                              0
Host_Popularity_percentage         0
Publication_Day                    0
Publication_Time                   0
Guest_Popularity_percentage    48832
Number_of_Ads                      0
Episode_Sentiment                  0
dtype: int64

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu
from tensorflow.keras.regularizers import L2
from tensorflow.keras.models import Sequential

model = Sequential(
    [
        Dense(500, activation="relu", kernel_regularizer = L2(0.01)),
        Dense(1000, activation="relu", kernel_regularizer=L2(0.01)),
        Dense(1000, activation="relu", kernel_regularizer=L2(0.01)),
        Dense(1, activation="linear")
    ]
)

model.compile(
    loss = tf.keras.losses.mse,
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
)

In [10]:
from sklearn.model_selection import train_test_split


train_df.dropna(axis=0, subset=['Listening_Time_minutes'], inplace=True)
y = train_df.Listening_Time_minutes
train_df.drop(['Listening_Time_minutes'], axis=1, inplace=True)

numerical_data = [i for i in train_df.columns if train_df[i].dtype in ["int64", "float64"]]
categorical_data = [i for i in train_df.columns if train_df[i].dtype == "object" and train_df[i].nunique() < 10]

X = train_df[numerical_data + categorical_data].copy()
X_train, X_valid, y_train, y_valid = train_test_split(X,y,train_size=0.8,
                                                      test_size = 0.2, random_state=0)
X_test = test_df[numerical_data + categorical_data].copy()

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

numerical_transformer = SimpleImputer(strategy="median")
categorical_transformer = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_data),
    ("cat", categorical_transformer, categorical_data)
])

my_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

In [12]:
my_pipeline.fit(X, y)

preds = my_pipeline.predict(X_valid)

[1m23438/23438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m467s[0m 20ms/step - loss: 190.0401
[1m4688/4688[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 5ms/step


In [13]:
preds = my_pipeline.predict(X_test)

[1m7813/7813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5ms/step


In [14]:
preds.flatten


<function ndarray.flatten>

In [15]:
output = pd.DataFrame({"id":X_test.index, "Listening_Time_minutes":preds[:,0]})
output

Unnamed: 0,id,Listening_Time_minutes
0,750000,53.964993
1,750001,17.388163
2,750002,48.628448
3,750003,79.731026
4,750004,49.219902
...,...,...
249995,999995,9.929546
249996,999996,57.871349
249997,999997,5.284143
249998,999998,74.480614


In [16]:
output.to_csv("submission.csv", index=False)