In [40]:
import pandas as pd


# Load the raw training and test tables from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called bare; wrapping it in print() only adds a stray "None" line.
train_df.info()
print(train_df.columns)

test_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 12 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Podcast_Name                 750000 non-null  object 
 2   Episode_Title                750000 non-null  object 
 3   Episode_Length_minutes       662907 non-null  float64
 4   Genre                        750000 non-null  object 
 5   Host_Popularity_percentage   750000 non-null  float64
 6   Publication_Day              750000 non-null  object 
 7   Publication_Time             750000 non-null  object 
 8   Guest_Popularity_percentage  603970 non-null  float64
 9   Number_of_Ads                749999 non-null  float64
 10  Episode_Sentiment            750000 non-null  object 
 11  Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(6)
memory usage: 68.7+ MB


In [41]:
# Count the missing values in every training column.
print(train_df.isna().sum())

id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       1
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64


In [42]:
# Integer code for each podcast name, numbered in order of first appearance
# in the training data. Built with enumerate so no counter or loop variable
# is left behind in the notebook namespace.
podcast_name_dict = {
    name: code
    for code, name in enumerate(train_df['Podcast_Name'].unique())
}

print(podcast_name_dict)

{'Mystery Matters': 0, 'Joke Junction': 1, 'Study Sessions': 2, 'Digital Digest': 3, 'Mind & Body': 4, 'Fitness First': 5, 'Criminal Minds': 6, 'News Roundup': 7, 'Daily Digest': 8, 'Music Matters': 9, 'Sports Central': 10, 'Melody Mix': 11, 'Game Day': 12, 'Gadget Geek': 13, 'Global News': 14, 'Tech Talks': 15, 'Sport Spot': 16, 'Funny Folks': 17, 'Sports Weekly': 18, 'Business Briefs': 19, 'Tech Trends': 20, 'Innovators': 21, 'Health Hour': 22, 'Comedy Corner': 23, 'Sound Waves': 24, 'Brain Boost': 25, "Athlete's Arena": 26, 'Wellness Wave': 27, 'Style Guide': 28, 'World Watch': 29, 'Humor Hub': 30, 'Money Matters': 31, 'Healthy Living': 32, 'Home & Living': 33, 'Educational Nuggets': 34, 'Market Masters': 35, 'Learning Lab': 36, 'Lifestyle Lounge': 37, 'Crime Chronicles': 38, 'Detective Diaries': 39, 'Life Lessons': 40, 'Current Affairs': 41, 'Finance Focus': 42, 'Laugh Line': 43, 'True Crime Stories': 44, 'Business Insights': 45, 'Fashion Forward': 46, 'Tune Time': 47}


In [43]:
# Integer code for each genre, numbered in order of first appearance in the
# training data. Built with enumerate so no counter or loop variable is left
# behind in the notebook namespace.
genre_dict = {
    genre: code
    for code, genre in enumerate(train_df['Genre'].unique())
}
print(genre_dict)


{'True Crime': 0, 'Comedy': 1, 'Education': 2, 'Technology': 3, 'Health': 4, 'News': 5, 'Music': 6, 'Sports': 7, 'Business': 8, 'Lifestyle': 9}


In [44]:
# Ordinal code for each weekday name: Monday is 0, Sunday is 6.
sorted_days_dict = {
    day: position
    for position, day in enumerate([
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday',
    ])
}

In [45]:
# Ordinal code for each publication-time label, from Morning (0) to Night (3).
publication_time_dict = dict(
    zip(('Morning', 'Afternoon', 'Evening', 'Night'), range(4))
)

In [46]:
# Ordinal code for each sentiment label: Negative < Neutral < Positive.
sentiment_dict = dict(Negative=0, Neutral=1, Positive=2)

In [51]:
def process_data(df, is_train=True):
    print('geldi')
    df['Podcast_Name'] = df['Podcast_Name'].map(podcast_name_dict)
    df['Genre'] = df['Genre'].map(genre_dict)
    df['Publication_Day'] = df['Publication_Day'].map(sorted_days_dict)
    df['Publication_Time'] = df['Publication_Time'].map(publication_time_dict)
    df['Episode_Sentiment'] = df['Episode_Sentiment'].map(sentiment_dict)

    df.drop(columns=['Episode_Title'], inplace=True)

    


    episode_length_avg = train_df["Episode_Lenght_minutes"].mean()
    guest_popularity_avg =  train_df['Guest_Popularity_percentage'].mean()
    number_of_ads_mode = train_df['Number_of_Ads'].mode()

    df['Episode_Lenght_minutes'] = df['Episode_Lenght_minutes'].fillna(episode_length_avg)
    df['Guest_Popularity_percentage'] = df['Guest_Popularity_percentage'].fillna(guest_popularity_avg)
    df['Number_of_Ads'] = df['Number_of_Ads'].fillna(number_of_ads_mode)

    if is_train:
        y = df['Listening_Time_minutes']
        df.drop(columns=['Listening_Time_minutes', 'id'], inplace=True)
        return df, y
    else:
        ids = df['id']
        df.drop(columns=['id'], inplace=True)
        return df, ids

    
    
    


In [52]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Turn the raw frames into model-ready features plus the target / test ids.
X_train, y_train = process_data(train_df, is_train=True)
X_test, ids = process_data(test_df, is_train=False)


scaler = StandardScaler()

# Columns you want to scale (names must match the CSV header exactly).
columns_to_scale = [
    "Episode_Length_minutes",
    "Guest_Popularity_percentage",
    "Number_of_Ads",
    "Host_Popularity_percentage",
]

# Fit the scaler on the training data only, then reuse it for the test data.
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

regressor = xgb.XGBRegressor()

param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [5, 7, 9],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
}

grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=5,
    # The target is continuous, so a regression metric is required;
    # 'accuracy' is a classification score and cannot rank these models.
    # scikit-learn maximises scores, hence the negated RMSE.
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# The prediction column carries the name of the target being predicted.
submission = pd.DataFrame({
    "id": ids,
    "Listening_Time_minutes": y_pred
})
submission.to_csv("submission.csv", index=False)

















KeyError: "['Episode_Title'] not found in axis"