In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from pandas.api.types import CategoricalDtype

from sklearn.impute import SimpleImputer
from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from pathlib import Path
from scipy.stats import skew 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge


# Preprocessing

In [None]:
def clean(df):
    """Replace the free-text episode title with a numeric episode number.

    The first run of digits found in ``Episode_Title`` becomes the float
    column ``Episode_Number`` (NaN when the title is missing or has no
    digits); ``Episode_Title`` itself is then dropped.

    Args:
        df: DataFrame containing an ``Episode_Title`` string column.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    cleaned = df.copy()
    # expand=False yields a Series, so the assignment is a plain
    # column-to-column copy rather than DataFrame-to-column.
    episode_number = cleaned['Episode_Title'].str.extract(r'(\d+)', expand=False)
    cleaned['Episode_Number'] = episode_number.astype(float)
    return cleaned.drop(columns='Episode_Title')

def encode(df):
    """Cast the categorical columns to pandas categorical dtypes.

    Nominal (unordered) columns get a plain ``category`` dtype with an extra
    ``"None"`` category appended when it is not already present. Ordinal
    columns get an ordered dtype whose levels are ``"None"`` followed by the
    levels listed below; values outside those levels become missing.

    Args:
        df: DataFrame holding the nominal and ordinal columns named below.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    # The nominal (unordered) categorical features
    nominal_features = ['Podcast_Name', 'Genre', 'Publication_Day']

    # The ordinal (ordered) categorical features, lowest level first
    ordinal_levels = {
        'Episode_Sentiment': ['Negative', 'Neutral', 'Positive'],
        'Publication_Time': ['Morning', 'Afternoon', 'Evening', 'Night'],
    }

    encoded = df.copy()

    for name in nominal_features:
        column = encoded[name].astype("category")
        if "None" not in column.cat.categories:
            column = column.cat.add_categories("None")
        encoded[name] = column

    for name, levels in ordinal_levels.items():
        # "None" is prepended so it sorts below every real level.
        ordered_dtype = CategoricalDtype(["None"] + levels, ordered=True)
        encoded[name] = encoded[name].astype(ordered_dtype)

    return encoded

def impute_upgraded(df):
    """Fill missing values: median for numeric columns, mode for categoricals.

    Statistics are computed over the frame that is passed in. A categorical
    column with no observed values has no mode and is left as-is instead of
    raising.

    Args:
        df: DataFrame with numeric and/or ``category`` columns.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    imputed = df.copy()

    for name in imputed.select_dtypes("number").columns:
        imputed[name] = imputed[name].fillna(imputed[name].median())

    for name in imputed.select_dtypes("category").columns:
        modes = imputed[name].mode()
        if modes.empty:
            # Every value is missing, so there is nothing to impute with.
            continue
        imputed[name] = imputed[name].fillna(modes.iloc[0])

    return imputed

def load_data(data_dir='input/'):
    """Read the train/test CSVs and return them preprocessed.

    Both files are read with ``id`` as the index, concatenated so that
    ``clean``, ``encode`` and ``impute_upgraded`` see one frame, and then
    split again by index labels.

    Because the splits are concatenated before imputation, any column present
    in only one file (for example a target column missing from ``test.csv``)
    exists in both returned frames, with the gaps filled by the imputer.

    Args:
        data_dir: Directory containing ``train.csv`` and ``test.csv``
            (e.g. ``"/kaggle/input/playground-series-s5e4/"`` on Kaggle).

    Returns:
        Tuple ``(df_train, df_test)`` of preprocessed DataFrames.
    """
    data_dir = Path(data_dir)

    df_train = pd.read_csv(data_dir / 'train.csv', index_col="id")
    df_test = pd.read_csv(data_dir / 'test.csv', index_col="id")

    # Merge the splits so we can preprocess them together
    df = pd.concat([df_train, df_test])

    # Preprocessing
    df = clean(df)
    df = encode(df)
    df = impute_upgraded(df)

    # Re-form the splits from the original index labels
    df_train = df.loc[df_train.index, :]
    df_test = df.loc[df_test.index, :]

    return df_train, df_test



# Feature Engineering

In [None]:
def label_encode(df):
    """Return a copy of ``df`` with every categorical column replaced by its integer codes.

    Missing values in a categorical column become ``-1`` (the pandas code for
    "no category"). Non-categorical columns are copied through unchanged.
    """
    encoded = df.copy()
    categorical_columns = encoded.select_dtypes(include=["category"]).columns
    for column in categorical_columns:
        encoded[column] = encoded[column].cat.codes
    return encoded

# Numeric columns passed to cluster_labels() when building the k-means feature.
cluster_features = ["Episode_Length_minutes", "Host_Popularity_percentage",
                    "Guest_Popularity_percentage", "Episode_Number"]

def cluster_labels(df, features, n_clusters=20):
    """Assign each row a k-means cluster id computed on standardised features.

    Args:
        df: DataFrame containing the columns listed in ``features``.
        features: Column names to cluster on.
        n_clusters: Number of k-means clusters.

    Returns:
        One-column DataFrame ``Cluster`` carrying the same index as ``df``,
        so it can be joined back onto ``df`` by index.
    """
    selected = df.loc[:, features]

    # A constant column has zero standard deviation; dividing by 1 instead
    # keeps it at 0 after centring rather than turning it into NaN.
    spread = selected.std(axis=0).replace(0, 1)
    scaled = (selected - selected.mean(axis=0)) / spread

    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)  # TUNING
    labels = kmeans.fit_predict(scaled)

    # Keep df's index: a default RangeIndex would only line up in an
    # index-based join if df's labels happened to be 0..n-1.
    return pd.DataFrame({"Cluster": labels}, index=df.index)

def pca_inspired(df):
    """Build four hand-crafted interaction features, indexed like ``df``.

    Columns returned:
        TopFeaturesCombined: episode length * host popularity * episode number.
        GuestImpact: guest popularity * episode length.
        HostImpact: host popularity * episode length.
        ContentDensity: episode length / (number of ads + 1).
    """
    length = df["Episode_Length_minutes"]
    host = df["Host_Popularity_percentage"]
    guest = df["Guest_Popularity_percentage"]

    return pd.DataFrame({
        "TopFeaturesCombined": length * host * df["Episode_Number"],
        "GuestImpact": guest * length,
        "HostImpact": host * length,
        # +1 keeps the ratio finite for episodes without ads.
        "ContentDensity": length / (df["Number_of_Ads"] + 1),
    })

def create_features_categorical(df):
    """Add an ``is_weekend`` flag derived from ``Publication_Day``.

    Args:
        df: DataFrame with a ``Publication_Day`` column.

    Returns:
        A new DataFrame with an extra integer column ``is_weekend`` that is 1
        for ``'Saturday'``/``'Sunday'`` and 0 otherwise (including missing
        days). The frame passed in is left untouched.
    """
    featured = df.copy()
    weekend = featured['Publication_Day'].isin(['Saturday', 'Sunday'])
    featured['is_weekend'] = weekend.astype('int64')
    return featured


    
def elm(df_train, df_test=None):
    """Return ``Episode_Length_minutes`` rounded to 0, 1 and 2 decimals.

    The result has the columns ``ELm_r0``, ``ELm_r1`` and ``ELm_r2`` and the
    same index as ``df_train``. ``df_test`` is accepted but not used.
    """
    lengths = df_train['Episode_Length_minutes']
    rounded = {f'ELm_r{decimals}': lengths.round(decimals) for decimals in range(3)}
    return pd.DataFrame(rounded, index=df_train.index)


def combination_features(df, df_test=None):
    """Build categorical interaction features by concatenating column values.

    For every combination listed in ``selected_comb`` the member columns are
    cast to strings and joined with ``'_'``; the new column is named after
    the joined member names (e.g.
    ``'Episode_Length_minutes_Host_Popularity_percentage'``).

    Fixes relative to the previous version, which raised on every call:
      * the stray ``train = train.copy()`` / ``test = test.copy()`` lines
        (UnboundLocalError) are gone;
      * combinations use ``Episode_Number``, the column ``clean`` creates,
        instead of the non-existent ``Episode_Num``;
      * ``ELm_r<k>`` inputs (``Episode_Length_minutes`` rounded to ``k``
        decimals, as built by ``elm``) are derived on the fly when the frame
        does not already contain them;
      * when ``df_test`` is given, both outputs share one set of categories,
        so category codes mean the same thing in train and test.

    Args:
        df: DataFrame holding the base columns named below.
        df_test: Optional second DataFrame with the same base columns.

    Returns:
        DataFrame of ``category`` columns indexed like ``df``; or, when
        ``df_test`` is given, a tuple ``(train_features, test_features)``.
    """
    length = 'Episode_Length_minutes'
    number = 'Episode_Number'
    host = 'Host_Popularity_percentage'
    guest = 'Guest_Popularity_percentage'
    ads = 'Number_of_Ads'
    sentiment = 'Episode_Sentiment'
    day = 'Publication_Day'
    time_of_day = 'Publication_Time'
    genre = 'Genre'
    podcast = 'Podcast_Name'
    r1, r2 = 'ELm_r1', 'ELm_r2'

    selected_comb = [
        # 2-interaction
        [length, host],
        [length, guest],
        [length, ads],
        [number, host],
        [number, guest],
        [number, ads],
        [host, guest],
        [host, ads],
        [host, sentiment],
        [length, podcast],
        [number, podcast],
        [guest, podcast],
        [r1, number],
        [r1, host],
        [r1, guest],
        [r2, number],
        [r2, sentiment],
        [r2, day],

        # 3-interaction
        [length, number, host],
        [length, number, guest],
        [length, number, ads],
        [length, number, sentiment],
        [length, number, day],
        [length, host, guest],
        [length, host, ads],
        [length, host, sentiment],
        [length, host, day],
        [length, host, time_of_day],
        [length, guest, ads],
        [length, guest, day],
        [length, guest, time_of_day],
        [length, ads, sentiment],
        [length, ads, day],
        [length, sentiment, time_of_day],
        [number, host, guest],
        [number, host, ads],
        [number, host, sentiment],
        [number, host, day],
        [number, host, time_of_day],
        [number, host, genre],
        [number, guest, ads],
        [number, guest, sentiment],
        [number, guest, day],
        [number, guest, time_of_day],
        [number, guest, genre],
        [number, ads, sentiment],
        [host, guest, ads],
        [host, guest, sentiment],
        [host, guest, day],
        [host, guest, time_of_day],
        [host, ads, day],
        [guest, ads, sentiment],
        [guest, ads, genre],
        [r1, ads, sentiment],
        [r2, ads, podcast],

        # 4-interaction
        [length, number, host, guest],
        [length, number, host, ads],
        [length, number, host, sentiment],
        [length, number, host, day],
        [length, number, host, time_of_day],
        [length, number, host, genre],
        [length, number, guest, ads],
        [length, number, guest, sentiment],
        [length, number, guest, day],
        [length, number, guest, time_of_day],
        [length, number, ads, sentiment],
        [length, number, ads, day],
        [length, number, ads, time_of_day],
        [length, number, day, time_of_day],
        [length, number, day, genre],
        [length, host, guest, ads],
        [length, host, guest, sentiment],
        [length, host, guest, day],
        [length, host, guest, time_of_day],
        [length, host, ads, sentiment],
        [length, host, ads, day],
        [length, host, day, time_of_day],
        [length, host, day, genre],
        [length, guest, ads, sentiment],
        [length, guest, ads, day],
        [length, guest, ads, time_of_day],
        [length, guest, ads, genre],
        [length, number, time_of_day, podcast],

        [number, host, guest, ads],
        [number, host, guest, sentiment],
        [number, host, ads, day],
        [number, host, ads, time_of_day],
        [number, host, sentiment, day],
        [number, host, sentiment, time_of_day],
        [number, host, sentiment, genre],
        [number, host, day, time_of_day],
        [number, host, time_of_day, genre],
        [number, guest, ads, sentiment],
        [number, guest, ads, genre],
        [number, host, sentiment, podcast],
        [host, ads, sentiment, podcast],
        [host, ads, day, podcast],
        [host, ads, time_of_day, podcast],
    ]

    # Rounded-length helper columns: name -> number of decimals.
    rounded_lengths = {f'ELm_r{decimals}': decimals for decimals in range(3)}

    def _as_strings(frame, column):
        """Return ``column`` of ``frame`` as strings, deriving ELm_r<k> if absent."""
        if column not in frame.columns and column in rounded_lengths:
            values = frame[length].round(rounded_lengths[column])
        else:
            values = frame[column]
        return values.astype(str)

    def _combine(frame):
        """Build every combination column for ``frame`` as plain strings."""
        as_text = {}   # each base column is stringified only once
        combined = {}
        for comb in selected_comb:
            for column in comb:
                if column not in as_text:
                    as_text[column] = _as_strings(frame, column)
            joined = as_text[comb[0]]
            for column in comb[1:]:
                joined = joined + '_' + as_text[column]
            combined['_'.join(comb)] = joined
        # Assemble in one go; inserting ~100 columns one by one fragments the frame.
        return pd.DataFrame(combined)

    train_combined = _combine(df)
    if df_test is None:
        return train_combined.astype('category')

    test_combined = _combine(df_test)

    # Categorise train and test together so both share the same categories.
    n_train = len(train_combined)
    stacked = pd.concat([train_combined, test_combined]).astype('category')
    return stacked.iloc[:n_train], stacked.iloc[n_train:]

In [None]:
def create_features(df, df_test=None):
    """Run the feature-engineering pipeline on the training (and test) frame.

    The target column is removed, train and test rows are stacked so every
    transform sees both, the engineered features are joined on by index, all
    categoricals are integer-coded, and the rows are split again by index
    label.

    Args:
        df: Training DataFrame; must contain ``Listening_Time_minutes``.
        df_test: Optional test DataFrame. It may or may not contain the
            target column. Its index labels must not overlap ``df``'s,
            because the final split is label-based.

    Returns:
        The engineered training features, or ``(X_train, X_test)`` when
        ``df_test`` is given.
    """
    target = "Listening_Time_minutes"

    X = df.drop(columns=[target])

    if df_test is not None:
        # A raw test set has no target column, so do not insist on one.
        X_test = df_test.drop(columns=[target], errors="ignore")
        X = pd.concat([X, X_test])

    X = create_features_categorical(X)
    # X = X.join(elm(X))  # optional rounded-length features (disabled)
    X = X.join(combination_features(X))
    X = X.join(cluster_labels(X, cluster_features, n_clusters=20))
    X = X.join(pca_inspired(X))

    X = label_encode(X)

    if df_test is None:
        return X

    # Re-form the splits from the original index labels
    X_test = X.loc[df_test.index, :]
    X_train = X.drop(index=df_test.index)
    return X_train, X_test

# Model Evaluation

In [None]:
def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSE of ``model`` on ``(X, y)``.

    The value is the square root of the mean fold MSE, not the mean of the
    per-fold RMSEs.
    """
    neg_mse_per_fold = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_error"
    )
    # The scorer reports negated MSE; flip the sign before taking the root.
    mean_mse = -1 * neg_mse_per_fold.mean()
    return np.sqrt(mean_mse)

def rmse(y_true, y_pred):
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def score_dataset_Kfold(X, y, X_test, model=None):
    """Fit ``model`` on 5 shuffled folds, report RMSE, and average test predictions.

    Args:
        X: Training features (DataFrame).
        y: Training target (Series, positionally aligned with ``X``).
        X_test: Test features with the same columns as ``X``.
        model: Estimator whose ``fit`` accepts ``eval_set`` and ``verbose``
            keyword arguments. The same object is refitted on every fold.
            Required despite the ``None`` default.

    Returns:
        1-D array with the mean of the per-fold predictions on ``X_test``.

    Raises:
        ValueError: If ``model`` is not supplied.
    """
    if model is None:
        raise ValueError("score_dataset_Kfold requires a model; pass model=<estimator>.")

    X = X.copy()
    X_test = X_test.copy()

    # Integer-code categoricals in BOTH frames, using the training categories
    # so that a given code means the same value in train and test.
    for colname in X.select_dtypes("category"):
        categories = X[colname].cat.categories
        if colname in X_test.columns and not pd.api.types.is_numeric_dtype(X_test[colname]):
            X_test[colname] = pd.Categorical(X_test[colname], categories=categories).codes
        X[colname] = X[colname].cat.codes

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    test_preds = np.zeros(len(X_test))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Training fold {fold + 1}/{n_splits}...")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)

        score = rmse(y_val, model.predict(X_val))
        scores.append(score)
        # Each fold contributes an equal share to the final test prediction.
        test_preds += model.predict(X_test) / n_splits
        print(f"Fold {fold + 1} RMSE: {score:.4f}")

    print(f'Optimized Cross-validated RMSE score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'Max RMSE score: {np.max(scores):.5f}')
    print(f'Min RMSE score: {np.min(scores):.5f}')

    return test_preds

def stacking_regression_with_rmse(base_models, meta_model, X_train, y_train, X_test, n_splits=5):
    """Stack ``base_models`` under ``meta_model`` and report a cross-validated RMSE.

    Each base model produces out-of-fold (OOF) predictions for the training
    rows and fold-averaged predictions for the test rows. The meta-model is
    then scored with K-fold CV on the OOF matrix, using a fresh copy per
    fold, and finally fitted on the full OOF matrix to predict the test set.

    The previous version scored the already-fitted meta-model on the very
    rows it had been trained on, so its "cross-validated" RMSE was a training
    error.

    Args:
        base_models: Sequence of estimators with ``fit``/``predict``.
        meta_model: Estimator fitted on the base-model predictions. It is
            left fitted on the full OOF matrix when the function returns.
        X_train: Training features (DataFrame).
        y_train: Training target (Series, positionally aligned with ``X_train``).
        X_test: Test features (DataFrame).
        n_splits: Number of folds for both stages.

    Returns:
        1-D array of meta-model predictions for ``X_test``.
    """
    from copy import deepcopy

    n_train, n_test = X_train.shape[0], X_test.shape[0]
    n_models = len(base_models)

    base_predictions_train = np.zeros((n_train, n_models))
    base_predictions_test = np.zeros((n_test, n_models))

    # A fixed random_state makes every kf.split(...) call yield the same folds.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for i, model in enumerate(base_models):
        test_fold_predictions = np.zeros((n_test, n_splits))

        for j, (train_idx, valid_idx) in enumerate(kf.split(X_train)):
            X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[valid_idx]
            y_tr = y_train.iloc[train_idx]

            model.fit(X_tr, y_tr)
            # OOF prediction: each training row is predicted by a model that
            # did not see it.
            base_predictions_train[valid_idx, i] = model.predict(X_val)
            test_fold_predictions[:, j] = model.predict(X_test)

        base_predictions_test[:, i] = test_fold_predictions.mean(axis=1)

    # Evaluate the meta-model with real held-out folds.
    scores = []
    for fold, (train_idx, valid_idx) in enumerate(kf.split(base_predictions_train)):
        fold_meta_model = deepcopy(meta_model)
        fold_meta_model.fit(base_predictions_train[train_idx], y_train.iloc[train_idx])
        val_pred = fold_meta_model.predict(base_predictions_train[valid_idx])
        score = rmse(y_train.iloc[valid_idx], val_pred)
        scores.append(score)
        print(f"Fold {fold+1} RMSE: {score:.4f}")

    print(f'Optimized Cross-validated RMSE score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'Max RMSE score: {np.max(scores):.5f}')
    print(f'Min RMSE score: {np.min(scores):.5f}')

    # Final meta-model: trained on all OOF predictions.
    meta_model.fit(base_predictions_train, y_train)
    final_prediction = meta_model.predict(base_predictions_test)

    return final_prediction

def score_stacking_Kfold(X, y, X_test, base_models, meta_model):
    """K-fold evaluation of a stacked model, returning fold-averaged test predictions.

    In each of 5 shuffled folds the base models are fitted on the fold's
    training part, their predictions become the meta-features, the meta-model
    is fitted on the training meta-features and scored on the validation
    meta-features, and its test prediction is added to the running average.

    Args:
        X: Training features (DataFrame).
        y: Training target (Series, positionally aligned with ``X``).
        X_test: Test features (DataFrame).
        base_models: Sequence of estimators with ``fit``/``predict``; they are
            refitted in every fold.
        meta_model: Estimator fitted on the base-model predictions; it is
            refitted in every fold.

    Returns:
        1-D array with the mean of the per-fold meta-model predictions on
        ``X_test``.
    """
    X = X.copy()
    X_test = X_test.copy()

    n_splits = 5
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    scores = []
    test_preds = np.zeros(len(X_test))

    for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"Training fold {fold + 1}/{n_splits}...")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # 1. Build the meta-features for the training, validation and test rows
        X_train_meta = np.zeros((len(X_train), len(base_models)))
        X_val_meta = np.zeros((len(X_val), len(base_models)))
        X_test_meta = np.zeros((len(X_test), len(base_models)))

        for i, model in enumerate(base_models):
            model.fit(X_train, y_train)
            # NOTE(review): these are in-sample predictions, so the meta-model
            # trains on more optimistic inputs than it sees at validation and
            # test time; consider out-of-fold predictions here.
            X_train_meta[:, i] = model.predict(X_train)
            X_val_meta[:, i] = model.predict(X_val)
            # Full-scale predictions: the meta-model must see test features on
            # the same scale as its training features. Averaging over folds is
            # done on the meta-model output below, not on its inputs.
            X_test_meta[:, i] = model.predict(X_test)

        # 2. Train the meta-model
        meta_model.fit(X_train_meta, y_train)

        # 3. Predict and score on the validation fold
        val_pred = meta_model.predict(X_val_meta)
        score = rmse(y_val, val_pred)
        scores.append(score)

        # 4. Accumulate this fold's share of the test prediction
        test_preds += meta_model.predict(X_test_meta) / n_splits

        print(f"Fold {fold + 1} RMSE: {score:.4f}")

    print(f'Optimized Cross-validated RMSE score: {np.mean(scores):.5f} +/- {np.std(scores):.5f}')
    print(f'Max RMSE score: {np.max(scores):.5f}')
    print(f'Min RMSE score: {np.min(scores):.5f}')

    return test_preds

In [None]:
# Load the preprocessed splits, engineer the features, and pull out the target.
df_train, df_test = load_data()
X_train, X_test = create_features(df_train, df_test)
y_train = df_train['Listening_Time_minutes']

In [None]:
# Candidate regressors. All use random_state=0 so repeated runs are comparable.

# Gradient-boosted trees (XGBoost), primary configuration.
xgb_model = XGBRegressor(
    n_estimators=565,
    learning_rate=0.04222221,
    max_depth=14,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=0,
    n_jobs=-1,
)

# Gradient-boosted trees (XGBoost), alternative configuration with a gamma penalty.
xgb_model_2 = XGBRegressor(
    n_estimators=565,
    learning_rate=0.05858702616823876,
    max_depth=12,
    subsample=0.9356075676850377,
    colsample_bytree=0.7895819265828284,
    gamma=1.7903575391246762,
    random_state=0,
)

# Gradient-boosted trees (LightGBM).
# NOTE(review): `n_iter` is not a named LGBMRegressor argument; it is forwarded
# as an extra LightGBM parameter, presumably the alias for the number of
# boosting iterations. Confirm against the installed version.
lgbm_model = LGBMRegressor(
    n_iter=1000,
    learning_rate=0.03,
    max_depth=-1,
    num_leaves=1024,
    max_bin=1024,
    colsample_bytree=0.7,
    objective='l2',
    random_state=0,
    verbosity=-1,
)

# Bagged trees (random forest).
random_forest_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=25,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    random_state=0,
    n_jobs=-1,
)

In [None]:
# 5-fold cross-validated RMSE of the random forest on the engineered features.
print(
    "Final Score : "
    f"{score_dataset(X_train, y_train, model=random_forest_model):.5f} RMSE"
)

In [None]:
# K-fold evaluation of the primary XGBoost model; keeps the fold-averaged test predictions.
pred = score_dataset_Kfold(
    X=X_train,
    y=y_train,
    X_test=X_test,
    model=xgb_model,
)

# Training Model

In [None]:
# Without K-fold: fit once on the full training set, then predict the test set.

xgb = xgb_model
y_pred_1 = xgb.fit(X_train, y_train).predict(X_test)

In [None]:
# With K-Fold: average the test predictions of the model refitted on each fold.

xgb = xgb_model
# The model must be passed explicitly; score_dataset_Kfold defaults to model=None.
y_pred_2 = score_dataset_Kfold(X_train, y_train, X_test, model=xgb)

In [None]:
# Stacking Model: three tree ensembles blended by a ridge regression.

base_models = [xgb_model, random_forest_model, lgbm_model]
meta_model = Ridge(random_state=0)

y_pred_3 = stacking_regression_with_rmse(
    base_models=base_models,
    meta_model=meta_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    n_splits=5,
)


In [None]:
# K-Fold Stacking Model: refit the whole stack per fold and average the test predictions.

final_test_prediction = score_stacking_Kfold(
    X=X_train,
    y=y_train,
    X_test=X_test,
    base_models=base_models,
    meta_model=meta_model,
)

# Submission

In [None]:
def make_submisson(predictions=None, ids=None, filename='my_submission6.csv'):
    """Write a submission CSV with ``id`` and ``Listening_Time_minutes`` columns.

    Called without arguments it behaves as before: it writes the notebook
    globals ``y_pred_1`` (predictions) and ``X_test.index`` (ids) to
    ``my_submission6.csv``.

    Args:
        predictions: Predicted values, one per id. Defaults to the global
            ``y_pred_1``.
        ids: Row identifiers. Defaults to the global ``X_test.index``.
        filename: Destination path of the CSV file.
    """
    # Globals are looked up only when needed, so the function also works
    # before those notebook cells have been run.
    if predictions is None:
        predictions = y_pred_1
    if ids is None:
        ids = X_test.index

    output = pd.DataFrame({'id': ids, 'Listening_Time_minutes': predictions})
    output.to_csv(filename, index=False)
    print("Your submission was successfully saved!")

In [None]:
# make_submisson()