In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
import numpy as np


In [2]:
encoded_df = pd.read_csv('../Data/processed/encoded_df.csv')
enriched_df = pd.read_csv('../Data/processed/enriched_df.csv')

In [3]:
X_encoded = encoded_df.drop(['Listening_Time_minutes', 'id'], axis=1)
y= encoded_df['Listening_Time_minutes']
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [None]:
# Training the model with encoded df (take several minutes)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Model Fitting
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=14, min_samples_leaf=5, max_features="sqrt", n_jobs=1)
rf.fit(X_train_encoded, y_train_encoded)

# Predictions
y_pred = rf.predict(X_test_encoded)

# Evaluation
print("MAE:", mean_absolute_error(y_test_encoded, y_pred))
print("R²:", r2_score(y_test_encoded, y_pred))

# Save the model
joblib.dump(rf, "../Models/random_forest_model_encoded.pkl", compress=3)

# Variables importance
importances = rf.feature_importances_
for feat, imp in zip(X_train_encoded.columns, importances):
    print(feat, imp)


MAE: 9.563420674113454
R²: 0.7695386439346017
Episode_Title 0.016256666874967917
Episode_Length_minutes 0.8794608337956058
Host_Popularity_percentage 0.024604736016382474
Guest_Popularity_percentage 0.018043411310565242
Number_of_Ads 0.016431688908110947
Episode_Sentiment 0.0035922922004748593
Guest_Popularity_missing 0.0014341063965997288
Genre_encoded 0.007392003293689498
Time_sin 0.003889106893044404
Time_cos 0.002748189079251405
Day_sin 0.006027257125289488
Day_cos 0.004084582008843231
Podcast_Encoded 0.016035126097174997


#### Model Results

MAE: 9.178606591906666

R²: 0.7787670204048296

Episode_Title 0.035188158193404276

Episode_Length_minutes 0.7755349868931583

Host_Popularity_percentage 0.04671164752805849

Guest_Popularity_percentage 0.03932785599254617

Number_of_Ads 0.010075566930163973

Episode_Sentiment 0.007535109071634484

Guest_Popularity_missing 0.00241642686649575

Genre_encoded 0.015080642882308414

Time_sin 0.008085297029639341

Time_cos 0.005304105746444449

Day_sin 0.012865945271237508

Day_cos 0.008212169071671991

Podcast_Encoded 0.03366208852323687

In [5]:
X_enriched = enriched_df.drop(['Listening_Time_minutes', 'id'], axis=1)
X_train_enriched, X_test_enriched, y_train_enriched, y_test_enriched = train_test_split(X_enriched, y, test_size=0.2, random_state=42)

In [6]:
"""
# Training the model with enriched df (take several minutes)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Model Fitting
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train_enriched, y_train_enriched)

# Predictions
y_pred_enriched = rf.predict(X_test_enriched)

# Evaluation
print("MAE:", mean_absolute_error(y_test_enriched, y_pred_enriched))
print("R²:", r2_score(y_test_enriched, y_pred_enriched))

# Save the model
joblib.dump(rf, "../Models/random_forest_model_enriched.pkl")

# Variables importance
importances = rf.feature_importances_
for feat, imp in zip(X_train_enriched.columns, importances):
    print(feat, imp)
"""

'\n# Training the model with enriched df (take several minutes)\n\nfrom sklearn.ensemble import RandomForestRegressor\nfrom sklearn.metrics import mean_absolute_error, r2_score\n\n# Model Fitting\nrf = RandomForestRegressor(n_estimators=200, random_state=42)\nrf.fit(X_train_enriched, y_train_enriched)\n\n# Predictions\ny_pred_enriched = rf.predict(X_test_enriched)\n\n# Evaluation\nprint("MAE:", mean_absolute_error(y_test_enriched, y_pred_enriched))\nprint("R²:", r2_score(y_test_enriched, y_pred_enriched))\n\n# Save the model\njoblib.dump(rf, "../Models/random_forest_model_enriched.pkl")\n\n# Variables importance\nimportances = rf.feature_importances_\nfor feat, imp in zip(X_train_enriched.columns, importances):\n    print(feat, imp)\n'

#### Model Results

MAE: 9.178683020708752

R²: 0.778708588884982

Episode_Title 0.03521217498111948

Episode_Length_minutes 0.7754719587168192

Host_Popularity_percentage 0.0467416417498653

Guest_Popularity_percentage 0.039314073382994176

Number_of_Ads 0.010076853290742182

Episode_Sentiment 0.007528249364584272

Guest_Popularity_missing 0.0024199408683772843

Genre_encoded 0.015090304776617549

Time_sin 0.008084012654665532

Time_cos 0.005302286836509516

Day_sin 0.012876149906139901

Day_cos 0.008222757719399154

Podcast_Encoded 0.03365959575216653

In [7]:
model = joblib.load('../Models/random_forest_model_encoded.pkl')

In [8]:
encoded_df['prediction'] = model.predict(X_encoded)

In [9]:
encoded_df

Unnamed: 0,id,Episode_Title,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Guest_Popularity_missing,Genre_encoded,Listening_Time_minutes,Time_sin,Time_cos,Day_sin,Day_cos,Podcast_Encoded,prediction
0,202844,30,119.97,64.21,17.93,0.0,-1,0,44.913404,57.67601,0.258819,0.965926,-0.974928,-0.222521,43.980881,89.909619
1,110926,57,62.57,27.22,50.84,1.0,1,0,45.440905,48.82398,0.707107,-0.707107,-0.433884,-0.900969,45.628020,42.865424
2,200533,93,99.12,82.95,93.62,1.0,-1,0,44.371825,82.95502,0.707107,-0.707107,0.781831,0.623490,43.996397,75.642707
3,285483,30,91.05,99.51,13.44,3.0,0,0,44.913404,71.68635,-0.866025,0.500000,0.433884,-0.900969,41.702596,62.705810
4,79612,26,61.42,23.48,5.69,1.0,-1,0,45.440905,50.45880,-0.707107,-0.707107,0.000000,1.000000,45.239931,46.078881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599993,259180,86,63.41,20.34,67.59,2.0,0,0,46.533255,32.06549,0.258819,0.965926,-0.781831,0.623490,44.461635,37.896294
599994,365840,80,61.19,38.87,0.83,0.0,0,0,46.533255,53.73476,0.707107,-0.707107,-0.781831,0.623490,47.007739,46.896396
599995,131933,55,19.40,39.77,52.85,1.0,-1,0,45.819181,13.09288,0.707107,-0.707107,-0.781831,0.623490,45.873295,12.849851
599996,671157,69,17.51,92.29,57.35,0.0,0,0,44.371825,13.36094,-0.866025,0.500000,-0.433884,-0.900969,45.814585,12.836799


## DF for Visualization

### Decoding variables

In [10]:
# Decode Days of the week
day_angle = np.arctan2(encoded_df['Day_sin'], encoded_df['Day_cos']) % (2*np.pi)
uncoded_day = (day_angle * 7 / (2 * np.pi)).astype(int)

In [11]:
time_angle = np.arctan2(encoded_df['Time_sin'], encoded_df['Time_cos']) % (2*np.pi)
uncoded_time = round(time_angle * 24 / (2*np.pi)).astype(int)

In [12]:
# Decoding the df
sentiment_map = {
    -1: 'Negative',
    0: 'Neutral',
    1: 'Positive'
}

day_map = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

time_map = {
    1: 'Night',
    9: 'Morning',
    15: 'Afternoon',
    20: 'Evening'
}

genre_encoding_map = round(joblib.load('../artifacts/genre_target_mean.pkl'), 2)
# Inverting genre map to decode
genre_map = {v: k for k, v in genre_encoding_map.items()}

# Importing Podcast Name encoding map
podcast_names = joblib.load('../artifacts/podcast_names_list.pkl')

In [13]:
df = encoded_df.copy()
df

Unnamed: 0,id,Episode_Title,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Guest_Popularity_missing,Genre_encoded,Listening_Time_minutes,Time_sin,Time_cos,Day_sin,Day_cos,Podcast_Encoded,prediction
0,202844,30,119.97,64.21,17.93,0.0,-1,0,44.913404,57.67601,0.258819,0.965926,-0.974928,-0.222521,43.980881,89.909619
1,110926,57,62.57,27.22,50.84,1.0,1,0,45.440905,48.82398,0.707107,-0.707107,-0.433884,-0.900969,45.628020,42.865424
2,200533,93,99.12,82.95,93.62,1.0,-1,0,44.371825,82.95502,0.707107,-0.707107,0.781831,0.623490,43.996397,75.642707
3,285483,30,91.05,99.51,13.44,3.0,0,0,44.913404,71.68635,-0.866025,0.500000,0.433884,-0.900969,41.702596,62.705810
4,79612,26,61.42,23.48,5.69,1.0,-1,0,45.440905,50.45880,-0.707107,-0.707107,0.000000,1.000000,45.239931,46.078881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599993,259180,86,63.41,20.34,67.59,2.0,0,0,46.533255,32.06549,0.258819,0.965926,-0.781831,0.623490,44.461635,37.896294
599994,365840,80,61.19,38.87,0.83,0.0,0,0,46.533255,53.73476,0.707107,-0.707107,-0.781831,0.623490,47.007739,46.896396
599995,131933,55,19.40,39.77,52.85,1.0,-1,0,45.819181,13.09288,0.707107,-0.707107,-0.781831,0.623490,45.873295,12.849851
599996,671157,69,17.51,92.29,57.35,0.0,0,0,44.371825,13.36094,-0.866025,0.500000,-0.433884,-0.900969,45.814585,12.836799


In [14]:
df['Genre'] = df['Genre_encoded'].round(2).map(genre_map)
#-------------------------------------------------------------------------
df['Episode_Sentiment'] = df['Episode_Sentiment'].map(sentiment_map)
#-------------------------------------------------------------------------
df['Publication_day_number'] = uncoded_day
df['Publication_day'] = df['Publication_day_number'].map(day_map)
#-------------------------------------------------------------------------
df['Publication_time_hour'] = uncoded_time
df['Publication_time'] = df['Publication_time_hour'].map(time_map)
#-------------------------------------------------------------------------
df['Podcast_Name'] = podcast_names

In [15]:
# Removing encoded columns
columns_to_remove = ['Podcast_Encoded', 'Day_cos', 'Day_sin', 'Time_cos', 'Time_sin', 'Genre_encoded']
df = df.drop(columns_to_remove, axis=1)

In [None]:
df.to_csv('./Data/processed/Viz_df.csv', index=False)

OSError: Cannot save file into a non-existent directory: 'Data\processed'

: 

In [None]:
df

Unnamed: 0,id,Episode_Title,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Guest_Popularity_missing,Listening_Time_minutes,prediction,Genre,Publication_day_number,Publication_day,Publication_time_hour,Publication_time,Podcast_Name
0,202844,30,119.97,64.21,17.93,0.0,Negative,0,57.67601,99.824983,Sports,5,Saturday,1,Night,Game Day
1,110926,57,62.57,27.22,50.84,1.0,Positive,0,48.82398,47.413588,Lifestyle,4,Friday,9,Morning,Lifestyle Lounge
2,200533,93,99.12,82.95,93.62,1.0,Negative,0,82.95502,76.563204,News,1,Tuesday,9,Morning,World Watch
3,285483,30,91.05,99.51,13.44,3.0,Neutral,0,71.68635,70.170154,Sports,3,Thursday,20,Evening,Sports Central
4,79612,26,61.42,23.48,5.69,1.0,Negative,0,50.45880,50.319279,Lifestyle,0,Monday,15,Afternoon,Fashion Forward
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599993,259180,86,63.41,20.34,67.59,2.0,Neutral,0,32.06549,37.200416,Music,6,Sunday,1,Night,Sound Waves
599994,365840,80,61.19,38.87,0.83,0.0,Neutral,0,53.73476,50.997901,Music,6,Sunday,9,Morning,Tune Time
599995,131933,55,19.40,39.77,52.85,1.0,Negative,0,13.09288,12.820011,Health,6,Sunday,9,Morning,Fitness First
599996,671157,69,17.51,92.29,57.35,0.0,Neutral,0,13.36094,12.357595,News,4,Friday,20,Evening,Daily Digest
