In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [4]:
# Load the track table from CSV; the path is relative to the notebook's working directory.
df_tiktok = pd.read_csv('tiktok_predicted_genres_with_clusters.csv')

In [5]:
# Sanity-check the load: list every column with its non-null count and dtype.
df_tiktok.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   track_name        263 non-null    object 
 1   artist_name       263 non-null    object 
 2   artist_pop        263 non-null    int64  
 3   album             263 non-null    object 
 4   track_pop         263 non-null    int64  
 5   danceability      263 non-null    float64
 6   energy            263 non-null    float64
 7   loudness          263 non-null    float64
 8   mode              263 non-null    int64  
 9   key               263 non-null    int64  
 10  speechiness       263 non-null    float64
 11  acousticness      263 non-null    float64
 12  instrumentalness  263 non-null    float64
 13  liveness          263 non-null    float64
 14  valence           263 non-null    float64
 15  tempo             263 non-null    float64
 16  time_signature    263 non-null    int64  
 1

In [6]:
# Cast the genre labels to pandas' categorical dtype (modifies df_tiktok in place).
df_tiktok['predicted_genre'] = df_tiktok['predicted_genre'].astype('category')

In [7]:
# Model inputs: the numeric audio descriptors plus the genre label.
# The genre is the only non-numeric entry and is one-hot encoded before fitting.
features = [
    "energy",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
    "danceability",
    "predicted_genre",
]

# Column the regressor learns to predict.
target = "track_pop"

In [8]:
# Separate the predictor columns from the popularity target.
X = df_tiktok[features]
y = df_tiktok[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
categorical_features = ['predicted_genre']

# One-hot encode the genre and pass the remaining (numeric) columns through unchanged.
# handle_unknown='ignore' encodes a genre that was absent from the training split as an
# all-zero group instead of raising in transform(); with only 263 rows a rare genre can
# end up solely in the test split, and hand-typed samples may use an unseen label.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)

# Read the raw rows from X through the split's index labels rather than from
# X_train / X_test: those two names are overwritten below with the encoded matrices,
# so using them as input would make this cell fail when it is run a second time.
# y_train / y_test keep the original row labels, so this selects the same rows in
# the same order as the split produced.
X_train = preprocessor.fit_transform(X.loc[y_train.index])
X_test = preprocessor.transform(X.loc[y_test.index])

In [10]:
# Random forest with 100 trees; the fixed seed makes the fitted model repeatable.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
# Fit on the encoded training matrix against the popularity target.
rf_model.fit(X_train, y_train)

In [11]:
# Predict popularity for the held-out test rows.
y_pred = rf_model.predict(X_test)

In [12]:
# Hand-built example track to score with the fitted model.
# The keys mirror the entries of `features`, in the same order.
predict_data_values = {
    # audio descriptors
    "energy": 0.8,
    "speechiness": 0.1,
    "acousticness": 0.2,
    "instrumentalness": 0.5,
    "liveness": 0.3,
    "valence": 0.7,
    "tempo": 120,
    "danceability": 0.6,
    # genre label, one-hot encoded by the preprocessor
    "predicted_genre": "rap",
}

In [13]:
# Build a one-row frame for the sample. Passing columns=features lays the columns out
# in the same order as the training frame (X = df_tiktok[features]) no matter how the
# dict's keys are ordered; a key missing from the dict shows up as NaN in its column.
new_data = pd.DataFrame([predict_data_values], columns=features)

In [14]:
# Encode the sample with the preprocessor fitted on the training data (transform only, no refit).
X_new = preprocessor.transform(new_data)

In [15]:
# Predict popularity for the encoded sample; the result holds one value per input row.
predictions = rf_model.predict(X_new)

In [16]:
# Show the predicted popularity for the hand-built sample.
print("Predicted Popularity:", predictions)

Predicted Popularity: [54.89]


In [17]:
# Recompute the test-set predictions used by the metrics below. This repeats the call
# already made in the In [11] cell; neither rf_model nor X_test changes in between.
y_pred = rf_model.predict(X_test)

In [18]:
# Test-set error: squared error weights large misses more heavily,
# absolute error is in the same units as track_pop.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [19]:
# Report the test-set errors held in `mse` and `mae`.
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")

Mean Squared Error (MSE): 594.7968641509434
Mean Absolute Error (MAE): 19.649811320754715


In [25]:
# Score the model on the rows it was fitted on, as a baseline for the test-set errors.
y_train_pred = rf_model.predict(X_train)

train_mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
train_mae = mean_absolute_error(y_true=y_train, y_pred=y_train_pred)

print(f"Training Mean Squared Error (MSE): {train_mse}")
print(f"Training Mean Absolute Error (MAE): {train_mae}")

# The matching test-set figures are held in `mse` and `mae` from the earlier
# evaluation cells; compare against those to judge how well the model generalizes.

Training Mean Squared Error (MSE): 97.72954952380952
Training Mean Absolute Error (MAE): 7.6863809523809525
