In [2]:
!pip install pandas numpy scikit-learn



In [8]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

In [9]:
# Locations of the competition CSV files (relative to the working directory)
train_file_path = "playground-series-s5e4/train.csv"
test_file_path = "playground-series-s5e4/test.csv"
submission_file_path = "playground-series-s5e4/sample_submission.csv"

# Read the training and test tables into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

# Preview the first rows of the training data
train_df.head()

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531
3,3,Digital Digest,Episode 45,67.17,Technology,57.22,Monday,Morning,78.7,2.0,Positive,46.27824
4,4,Mind & Body,Episode 86,110.51,Health,80.07,Monday,Afternoon,58.68,3.0,Neutral,75.61031


In [10]:
# Columns removed from both frames before modelling: the row id, the podcast
# name and the episode title.
cols_to_drop = ["id", "Podcast_Name", "Episode_Title"]

# Keep the test ids around -- they are needed again for the final submission
test_ids = test_df["id"]

train_df = train_df.drop(columns=cols_to_drop)
test_df = test_df.drop(columns=cols_to_drop)

In [12]:
# Median imputation for the numerical columns: the imputer is re-fit on each
# training column, and the median learned there is reused for the test column.
imputer = SimpleImputer(strategy="median")

for col in ("Episode_Length_minutes", "Guest_Popularity_percentage"):
    train_filled = imputer.fit_transform(train_df[[col]])
    test_filled = imputer.transform(test_df[[col]])
    train_df[col] = train_filled
    test_df[col] = test_filled

# Missing values in Number_of_Ads are replaced with 0 in both frames
for frame in (train_df, test_df):
    frame["Number_of_Ads"] = frame["Number_of_Ads"].fillna(0)

In [15]:
# Encode the categorical feature columns as integer codes.
# OrdinalEncoder is scikit-learn's encoder for feature columns (LabelEncoder is
# documented as intended for the target). For a single column it assigns codes
# by sorted category order, and with handle_unknown="use_encoded_value" a
# category that appears only in the test set is mapped to -1 instead of raising
# ValueError at transform time.
# NOTE(review): an integer dtype cannot represent NaN, so this assumes these
# columns contain no missing values -- confirm against the data.
label_encoders = {}

for col in ["Genre", "Publication_Day", "Publication_Time", "Episode_Sentiment"]:
    le = OrdinalEncoder(
        dtype=np.int64,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )
    # The encoder works on 2-D input; flatten the (n, 1) result back to a column.
    train_df[col] = le.fit_transform(train_df[[col]]).ravel()
    test_df[col] = le.transform(test_df[[col]]).ravel()  # same mapping as the training set
    label_encoders[col] = le  # keep the fitted encoder, keyed by column name

In [16]:
# Separate the training frame into the target vector y and the feature matrix X
target_col = "Listening_Time_minutes"
y = train_df[target_col]
X = train_df.drop(columns=[target_col])

In [18]:
# Select a random subset of at most 100,000 rows for training.
# DataFrame.sample raises ValueError when asked for more rows than the frame
# holds (sampling without replacement), so the request is capped at the number
# of rows actually available.
sample_size = min(100000, len(train_df))
train_sampled = train_df.sample(n=sample_size, random_state=42)  # fixed seed -> reproducible subset

# Features and target of the sampled rows
X_sampled = train_sampled.drop(columns=["Listening_Time_minutes"])
y_sampled = train_sampled["Listening_Time_minutes"]

In [20]:
# Hold out 20% of the sampled rows for validation; the seed fixes the split
X_train, X_val, y_train, y_val = train_test_split(
    X_sampled,
    y_sampled,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Create a linear regression model and fit it on the training split
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [22]:
# Predict the held-out validation rows
y_pred = model.predict(X_val)

# RMSE = square root of the mean squared error
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("Validation RMSE:", rmse)

Validation RMSE: 13.22441607445841


In [23]:
# Run the fitted model on the preprocessed test features
test_predictions = model.predict(X=test_df)

In [25]:
# Build the submission table: one row per test id with its predicted target
submission_df = pd.DataFrame({"id": test_ids}).assign(
    Listening_Time_minutes=test_predictions
)

# Write it to CSV without the index column
submission_df.to_csv("submission.csv", index=False)

# Preview the first rows of the submission file
submission_df.head()

Unnamed: 0,id,Listening_Time_minutes
0,750000,55.896594
1,750001,20.028373
2,750002,50.87677
3,750003,82.05138
4,750004,50.67637
