In [None]:
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display
from pandas.api.types import CategoricalDtype

from category_encoders import MEstimateEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBRegressor

In [None]:
# Local alternative: data_dir = Path('./input/')
# NOTE(review): this directory is named after the House Prices competition, but the
# columns used further down (Podcast_Name, Listening_Time_minutes, ...) look like a
# different dataset -- confirm the path before running.
data_dir = Path("../input/house-prices-advanced-regression-techniques/")
# Join path components with `/`; `Path + str` raises TypeError.
X_train = pd.read_csv(data_dir / 'train.csv', index_col="id")
X_test = pd.read_csv(data_dir / 'test.csv', index_col="id")

In [None]:
# Preview the raw training frame read from train.csv.
X_train

## EDA (Exploratory Data Analysis)

hasil :
### missing values :
Missing values:
- Guest_Popularity_percentage    (146030) (19%)
- Episode_Length_minutes          (87093) (12%)
- Number_of_Ads                       (1)

In [None]:
def explore_data(df):
    """Print a quick structural overview of ``df``.

    Reports, in order: the shape, the column names, the ``df.info()`` summary,
    ``df.describe(include='all')``, the 20 columns with the most missing
    values, and the number of fully duplicated rows.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"DataFrame shape: {df.shape}")
    print(f"DataFrame columns: {df.columns.tolist()}")
    # df.info() writes its report itself and returns None, so interpolating it
    # into an f-string printed the report *before* the header, followed by a
    # literal "None". Print the header first, then let info() write the body.
    print("DataFrame info:")
    df.info()
    print(f"DataFrame description:\n{df.describe(include='all')}")
    missing_per_column = df.isnull().sum().sort_values(ascending=False)
    print(f"Missing values:\n{missing_per_column.head(20)}")
    print(f"Duplicate rows: {df.duplicated().sum()}")

In [None]:
# Print the structural overview of the raw training split.
explore_data(X_train)

In [None]:
# Show the first rows of the raw training split.
X_train.head()

## Preprocessing

Beberapa function untuk preprocessing:

- clean() - potensi regex
- encode() 
- impute() - masih bisa dikembangin


In [None]:
# This is where the Episode_Number feature is created.
def clean(df):
    """Derive a numeric ``Episode_Number`` column and drop ``Episode_Title``.

    The first run of digits found in each title is converted to float; titles
    that are missing or contain no digits become NaN.

    Args:
        df: DataFrame containing an ``Episode_Title`` column.

    Returns:
        A new DataFrame with ``Episode_Number`` appended and ``Episode_Title``
        removed. The frame passed in is left unmodified.

    Raises:
        KeyError: if ``Episode_Title`` is not a column of ``df``.
    """
    # expand=False yields a Series (not a one-column DataFrame) for the single group.
    episode_number = (
        df['Episode_Title'].str.extract(r'(\d+)', expand=False).astype(float)
    )
    # assign() returns a new frame, so the caller's frame does not silently gain a column.
    return df.assign(Episode_Number=episode_number).drop(columns='Episode_Title')

    

In [None]:
def encode(df):
    """Cast the categorical columns of ``df`` to pandas categorical dtypes.

    Nominal columns (``Podcast_Name``, ``Genre``, ``Publication_Day``) become
    unordered categoricals with an extra ``"None"`` category. Ordinal columns
    (``Episode_Sentiment``, ``Publication_Time``) become ordered categoricals
    whose lowest level is ``"None"``; values outside the listed levels become
    missing.

    Args:
        df: DataFrame containing all five columns named above.

    Returns:
        A new DataFrame with the converted columns. The frame passed in is
        left unmodified.
    """
    # The nominative (unordered) categorical features
    features_nom = ['Podcast_Name', 'Genre', 'Publication_Day']

    # Ordinal features, levels listed from lowest to highest
    ordered_levels = {
        'Episode_Sentiment': ['Negative', 'Neutral', 'Positive'],
        'Publication_Time': ['Morning', 'Afternoon', 'Evening', 'Night'],
    }
    # Add a None level for missing values
    ordered_levels = {
        name: ["None"] + levels for name, levels in ordered_levels.items()
    }

    # Work on a copy so the caller's frame keeps its original dtypes.
    df = df.copy()

    # Nominal categories
    for name in features_nom:
        column = df[name].astype("category")
        # Add a None category for missing values
        if "None" not in column.cat.categories:
            column = column.cat.add_categories("None")
        df[name] = column

    # Ordinal categories
    for name, levels in ordered_levels.items():
        df[name] = df[name].astype(CategoricalDtype(levels, ordered=True))

    return df
    

In [None]:
def impute(df):
    """Fill missing values: 0 for numeric columns, ``"None"`` for categoricals.

    Args:
        df: DataFrame to impute. Every numeric column is filled, whatever it
            represents.

    Returns:
        A new DataFrame without missing values in its numeric and categorical
        columns. The frame passed in is left unmodified.
    """
    df = df.copy()
    for name in df.select_dtypes("number"):
        df[name] = df[name].fillna(0)
    for name in df.select_dtypes("category"):
        column = df[name]
        # fillna() raises when the fill value is not an existing category, so
        # register "None" first if the column does not have it yet.
        if "None" not in column.cat.categories:
            column = column.cat.add_categories("None")
        df[name] = column.fillna("None")
    return df


## Load Data

- load_data()
- panggil load_data()

In [None]:
def load_data(data_dir="../input/house-prices-advanced-regression-techniques/"):
    """Read train.csv / test.csv and run them through the preprocessing steps.

    Both splits are concatenated, passed through ``clean``, ``encode`` and
    ``impute`` together, and then separated again.

    Args:
        data_dir: Directory (str or Path) holding ``train.csv`` and
            ``test.csv``, each with an ``id`` column used as the index.
            NOTE(review): the default is named after the House Prices
            competition while the preprocessing expects podcast columns --
            confirm the path.

    Returns:
        Tuple ``(df_train, df_test)`` of preprocessed DataFrames.
    """
    # Read data
    data_dir = Path(data_dir)
    # Join with `/`; `Path + str` raises TypeError.
    df_train = pd.read_csv(data_dir / 'train.csv', index_col="id")
    df_test = pd.read_csv(data_dir / 'test.csv', index_col="id")

    # Merge the splits so we can preprocess them together
    n_train = len(df_train)
    df = pd.concat([df_train, df_test])

    # Preprocessing
    df = clean(df)
    df = encode(df)
    df = impute(df)

    # Reform splits by position: train rows come first in the concatenation,
    # so this stays correct even if the two files share id values.
    df_train = df.iloc[:n_train].copy()
    df_test = df.iloc[n_train:].copy()

    return df_train, df_test

## Baseline

- score_dataset()
- cek liat score

In [None]:
# Load and preprocess both splits.
df_train, df_test = load_data()

In [None]:
def score_dataset(X, y, model=None):
    """Score a feature matrix with 5-fold cross-validation.

    Categorical columns are replaced by their integer codes before fitting.

    Args:
        X: Feature DataFrame; it is not modified.
        y: Target values aligned with ``X``.
        model: Estimator passed to ``cross_val_score``. When omitted, a fresh
            ``XGBRegressor()`` is created for this call.

    Returns:
        The square root of the mean per-fold MSE.
    """
    # Build the default per call instead of once at definition time.
    if model is None:
        model = XGBRegressor()

    # Label-encode on a copy so the caller's frame keeps its categorical columns.
    X = X.copy()
    for colname in X.select_dtypes("category"):
        X[colname] = X[colname].cat.codes

    # The scorer returns negated MSE per fold; flip the sign before averaging.
    fold_mse = -1 * cross_val_score(
        model, X, y,
        cv=5,
        scoring="neg_mean_squared_error",
    )
    # The reported metric is RMSE.
    return np.sqrt(fold_mse.mean())


# Split the preprocessed training data into target and features.
y = df_train["Listening_Time_minutes"].copy()
X = df_train.drop(columns="Listening_Time_minutes")

# Cross-validated score of the untouched feature set, kept as the reference.
baseline_score = score_dataset(X, y)
print(f"Baseline score: {baseline_score:.5f} RMSE")


## Train & Predict

In [None]:
def label_encode(df):
    """Return a copy of ``df`` whose categorical columns hold integer codes.

    Each column of dtype ``category`` is replaced by its ``.cat.codes``;
    all other columns are carried over unchanged. ``df`` itself is not
    modified.
    """
    encoded = df.copy()
    categorical_columns = encoded.select_dtypes(["category"]).columns
    for column in categorical_columns:
        codes = encoded[column].cat.codes
        encoded[column] = codes
    return encoded

In [None]:
def create_features(df, df_test=None):
    """Build label-encoded feature matrices from the preprocessed splits.

    Args:
        df: Training DataFrame including the ``Listening_Time_minutes`` target.
        df_test: Optional test DataFrame. A ``Listening_Time_minutes`` column
            is dropped if present and is not required.

    Returns:
        ``X`` when ``df_test`` is None, otherwise the tuple ``(X, X_test)``.
        The target column is removed from every returned frame, and neither
        input frame is modified.

    Raises:
        KeyError: if ``df`` has no ``Listening_Time_minutes`` column.
    """
    target = "Listening_Time_minutes"
    X = df.drop(columns=target)

    if df_test is None:
        return label_encode(X)

    # errors="ignore": a test split read straight from disk has no target column.
    X_test = df_test.drop(columns=target, errors="ignore")

    # Encode both splits together, then split them again by position so the
    # result stays correct even when the two frames share index labels.
    n_train = len(X)
    combined = label_encode(pd.concat([X, X_test]))
    X = combined.iloc[:n_train].copy()
    X_test = combined.iloc[n_train:].copy()

    return X, X_test


In [None]:
df_train, df_test = load_data()
X_train, X_test = create_features(df_train, df_test)
# Take the target from this df_train instead of relying on the `y` left over
# from the baseline cell.
y = df_train["Listening_Time_minutes"]


xgb = XGBRegressor()

xgb.fit(X_train, y)
y_pred = xgb.predict(X_test)


## Submission


In [None]:
def make_submisson(ids=None, predictions=None, path='my_submission.csv'):
    """Write the predictions to a two-column submission CSV.

    Args:
        ids: Row identifiers. Defaults to the index of the notebook-level
            ``X_test``.
        predictions: Predicted values aligned with ``ids``. Defaults to the
            notebook-level ``y_pred``.
        path: Destination CSV file.

    Returns:
        None. Writes ``path`` and prints a confirmation message.
    """
    if ids is None:
        ids = X_test.index
    if predictions is None:
        predictions = y_pred

    # The input files name this column "id" (it is read with index_col="id"),
    # so reuse the index name rather than the previously hard-coded "Id".
    # NOTE(review): confirm the expected header against the sample submission.
    id_column = getattr(ids, "name", None) or 'id'

    output = pd.DataFrame({id_column: ids, 'Listening_Time_minutes': predictions})
    output.to_csv(path, index=False)
    print("Your submission was successfully saved!")

In [None]:
# Write my_submission.csv from the notebook-level X_test and y_pred.
make_submisson()