In [48]:
import pandas as pd
import pandas.plotting as pd_plotting
import matplotlib.pyplot as plt
import numpy as np
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [49]:
# --- Configuration: every value a reader may want to tune lives here ---
FILEPATH = 'Spotify_Youtube.csv'  # CSV file loaded by this notebook
TARGET_COLUMN = 'Views' # Target for regression task
TEST_SIZE = 0.2  # Passed to train_test_split as test_size
RANDOM_STATE = 42  # Passed to train_test_split as random_state

# Read through FILEPATH rather than repeating the literal, so changing the
# constant above is enough to point the notebook at a different file.
music_df = pd.read_csv(FILEPATH)
# Per-column count of missing values in the raw file.
print(music_df.isnull().sum())

Unnamed: 0            0
Artist                0
Url_spotify           0
Track                 0
Album                 0
Album_type            0
Uri                   0
Danceability          2
Energy                2
Key                   2
Loudness              2
Speechiness           2
Acousticness          2
Instrumentalness      2
Liveness              2
Valence               2
Tempo                 2
Duration_ms           2
Url_youtube         470
Title               470
Channel             470
Views               470
Likes               541
Comments            569
Description         876
Licensed            470
official_video      470
Stream              576
dtype: int64


In [50]:
# Load the dataset into the working frame `df` and show a first overview.
print(f"--- 3: Loading Data from {FILEPATH} ---")
df = pd.read_csv(FILEPATH)
print(f"Data loaded successfully. Shape: {df.shape}")

print("\n--- Initial Data Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("\n--- Data Head ---")
print(df.head())

--- 3: Loading Data from Spotify_Youtube.csv ---
Data loaded successfully. Shape: (20718, 28)

--- Initial Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20718 entries, 0 to 20717
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        20718 non-null  int64  
 1   Artist            20718 non-null  object 
 2   Url_spotify       20718 non-null  object 
 3   Track             20718 non-null  object 
 4   Album             20718 non-null  object 
 5   Album_type        20718 non-null  object 
 6   Uri               20718 non-null  object 
 7   Danceability      20716 non-null  float64
 8   Energy            20716 non-null  float64
 9   Key               20716 non-null  float64
 10  Loudness          20716 non-null  float64
 11  Speechiness       20716 non-null  float64
 12  Acousticness      20716 non-null  float64
 13  Instrumentalness  20716 non-null  float64
 14  Liveness     

In [51]:
# Cleaning stage: remove the leftover index column and every row that has at
# least one missing value, then report how many rows were lost.
print("\n--- 4: Cleaning Data ---")

# --- 4a: Drop Unnamed Column ---
first_column = df.columns[0]
if first_column.startswith('Unnamed:'):
    print(f"Dropping the first unnamed column: {first_column}")
    # Positional slice: keep every column except the first one.
    df = df.iloc[:, 1:]
else:
    print("First column is not unnamed, not dropping.")

# --- 4b: Handle Missing Values ---
# NOTE(review): this drops a row when ANY column is missing, including
# 'Description', which the feature-selection cell removes later anyway.
# Consider restricting dropna to the columns that are actually kept - confirm.
initial_rows = len(df)
# Rebind instead of using inplace=True: `df` may be the result of the iloc
# slice above, and in-place modification of a slice is what triggers pandas'
# SettingWithCopyWarning. The resulting rows are the same.
df = df.dropna()
rows_dropped = initial_rows - len(df)
if rows_dropped > 0:
    print(f"Dropped {rows_dropped} rows with missing values.")
else:
    print("No rows with missing values found.")
print(f"DataFrame shape after cleaning: {df.shape}")

# --- 4c: Placeholder for Outlier Handling ---
# TODO: no outlier handling is implemented yet; this only prints a reminder.
print("\nPlaceholder: Outlier detection and handling needed.")

# --- 4d: Placeholder for Incorrectly Labeled Points ---
# TODO: no label-quality handling is implemented yet; this only prints a reminder.
print("Placeholder: Handling of incorrectly labeled points needed.")



--- 4: Cleaning Data ---
Dropping the first unnamed column: Unnamed: 0
Dropped 1548 rows with missing values.
DataFrame shape after cleaning: (19170, 27)

Placeholder: Outlier detection and handling needed.
Placeholder: Handling of incorrectly labeled points needed.


In [52]:
# Encoding stage: replace selected text columns of `df` with the codes
# produced by a per-column LabelEncoder.
print("\n--- 5: Encoding Categorical Features ---")
# Fitted encoders keyed by column name, kept so they can be reused later.
label_encoders = {}
# Object columns treated as categorical; extend this list after inspecting
# the dataset if other object columns should be encoded as well.
cols_to_encode = ['Track', 'Artist', 'Album']

for col in cols_to_encode:
    # Guard 1: the column has to exist in the frame.
    if col not in df.columns:
        print(f"Warning: Column '{col}' not found for encoding.")
        continue
    # Guard 2: only object-typed columns are encoded.
    if not (df[col].dtype == 'object'):
        print(f"Skipping encoding for non-object column: {col}")
        continue

    print(f"Encoding categorical column: {col}")
    le = LabelEncoder()
    encoded_values = le.fit_transform(df[col])
    df[col] = encoded_values
    label_encoders[col] = le

print(f"DataFrame shape after encoding: {df.shape}")


--- 5: Encoding Categorical Features ---
Encoding categorical column: Track
Encoding categorical column: Artist
Encoding categorical column: Album
DataFrame shape after encoding: (19170, 27)


In [53]:
# Feature-selection stage: remove the listed columns from `df` and show
# which columns remain.
print("\n--- 6: Feature Selection ---")

# Columns judged not to add value for ML predictions.
columns_to_drop = ['Url_spotify', 'Uri', 'Url_youtube',
                   'Description', 'Licensed', 'official_video']

# axis=1 selects columns; the result is a new frame bound back to `df`.
df = df.drop(columns_to_drop, axis=1)
print(f"Dropped columns: {columns_to_drop}")
print(f"DataFrame shape after feature selection: {df.shape}")

# Show what is left, header first and then the list of column names.
print("\nRemaining columns:", list(df.columns), sep="\n")


--- 6: Feature Selection ---
Dropped columns: ['Url_spotify', 'Uri', 'Url_youtube', 'Description', 'Licensed', 'official_video']
DataFrame shape after feature selection: (19170, 21)

Remaining columns:
['Artist', 'Track', 'Album', 'Album_type', 'Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Duration_ms', 'Title', 'Channel', 'Views', 'Likes', 'Comments', 'Stream']


In [54]:
# Separate the regression target (TARGET_COLUMN) from the feature columns.
print("\n--- 7: Splitting Data into Features (X) and Target (y) ---")
if TARGET_COLUMN not in df.columns:
    # Raise instead of calling sys.exit(1): `sys` is not imported by the
    # notebook's import cell, so that call would fail with NameError, and an
    # exception stops the cell with a clear message.
    raise ValueError(f"Error: Target column '{TARGET_COLUMN}' not found in DataFrame.")

# X: every column except the target; y: the target column alone.
X = df.drop(columns=TARGET_COLUMN)
y = df[TARGET_COLUMN]
print(f"Features shape (X): {X.shape}")
print(f"Target shape (y): {y.shape}")


--- 7: Splitting Data into Features (X) and Target (y) ---
Features shape (X): (19170, 20)
Target shape (y): (19170,)


In [55]:
# Hold out a test set; TEST_SIZE and RANDOM_STATE come from the config cell.
print("\n--- 8: Splitting Data into Training and Testing Sets ---")
split_options = {"test_size": TEST_SIZE, "random_state": RANDOM_STATE}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
print(f"Data split into training and testing sets (test_size={TEST_SIZE}, random_state={RANDOM_STATE}).")



--- 8: Splitting Data into Training and Testing Sets ---
Data split into training and testing sets (test_size=0.2, random_state=42).


In [56]:
# Report the shape of each of the four pieces produced by the split.
print("\n--- 9: Final Shapes ---")
shape_report = (
    ("Training features shape (X_train):", X_train),
    ("Testing features shape (X_test):", X_test),
    ("Training target shape (y_train):", y_train),
    ("Testing target shape (y_test):", y_test),
)
for label, part in shape_report:
    print(label, part.shape)
print("\n--- Preprocessing Script Complete ---")


--- 9: Final Shapes ---
Training features shape (X_train): (15336, 20)
Testing features shape (X_test): (3834, 20)
Training target shape (y_train): (15336,)
Testing target shape (y_test): (3834,)

--- Preprocessing Script Complete ---
