In [None]:
import math as math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the raw dataset; the path is resolved relative to the working directory
DATA_PATH = "group_9.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Quick sanity check: (rows, columns) followed by the first five records
dataset_shape = df.shape
print("Dataset dimension:", dataset_shape)
display(df.head(n=5))

Graphic settings

In [None]:
# Apply seaborn's "whitegrid" theme to every figure below.
# Plotting context and default figure size are left at their defaults.
sns.set_theme(style="whitegrid")

Dataset info

In [None]:
# Print pandas' summary of the frame (DataFrame.info) to inspect the loaded columns
df.info()

DESCRIPTIVE STATISTICS
 Basic statistics + skewness and kurtosis

In [None]:
# Shape statistics are computed on the numeric columns only
numeric_only = df.select_dtypes(include=[np.number])

# One row per described column: describe() output plus skewness and kurtosis
num_desc = df.describe().T.assign(
    skew=numeric_only.skew(),
    kurtosis=numeric_only.kurtosis(),
)
display(num_desc.head(10))

UNIVARIATE ANALYSIS 
    (Distribution of individual features)

In [None]:
# Columns plotted as numeric variables (the regression target is included)
numeric_columns = [
    'time_signature', 'key_mode', 'artist_song_count', 'album_freq',
    'movement_index', 'intensity_level', 'verbal_density', 'purity_score',
    'positivity_index', 'activity_rate', 'loudness_intensity', 'happy_dance',
    'acoustics_instrumental', 'artists_avg_popularity', 'tempo_vs_genre',
    'energy_rank_pct', 'loud_energy_ratio', 'mood_pca', 'mood_cluster',
    'acoustic_valence_mood_cluster', 'signal_strength', 'focus_factor',
    'ambient_level', 'key_sin', 'key_cos', 'duration_log', 'duration_log_z',
    'loudness_yeo', 'temp_zscore', 'resonance_factor', 'timbre_index',
    'distorted_movement', 'signal_power', 'target_regression',
]

# Three panels per row, with as many rows as the column count requires
n = len(numeric_columns)
ncols = 3
nrows = math.ceil(n / ncols)
fig, axes = plt.subplots(nrows, ncols, figsize=(4 * ncols, 3 * nrows))
fig.suptitle('Distribution of Numerical Variables', fontsize=16)

# subplots() returns a bare Axes for a 1x1 grid; normalise to a flat sequence
if isinstance(axes, np.ndarray):
    axes_flat = axes.flatten()
else:
    axes_flat = [axes]

# One histogram with a KDE overlay per column
for ax, col in zip(axes_flat, numeric_columns):
    sns.histplot(df[col], kde=True, bins=20, ax=ax)
    ax.set_title(f'{col} Distribution')

# Panels beyond the last column stay empty, so hide them
for spare_ax in axes_flat[n:]:
    spare_ax.set_visible(False)

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()

In [None]:
# Columns plotted as categorical variables (the classification target is included)
categorical_columns = [
    'duration_1', 'duration_2', 'duration_3', 'duration_4', 'duration_5',
    'loudness_level', 'popularity_level', 'tempo_class',
    'explicit', 'mode_indicator', 'time_signature_class_boolean',
    'is_instrumental', 'is_dance_hit', 'echo_constant',
    'target_class',
]

# Three panels per row
n_cols = 3
n_rows = math.ceil(len(categorical_columns) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 4))
fig.suptitle('Distribution of Categorical Variables', fontsize=16)

# Flat view of the grid so axes can be paired with columns
axes = axes.flatten()

for ax, col in zip(axes, categorical_columns):
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel("")                        # the title already names the column
    ax.tick_params(axis='x', rotation=45)    # keep long category labels legible

# Hide whatever panels were not used
for spare_ax in axes[len(categorical_columns):]:
    spare_ax.set_visible(False)

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()



BIVARIATE ANALYSIS (Correlation between features and the different target variables)

In [None]:
# Target plotted on the y-axis of every panel
target_column = 'target_regression'

# Numeric features plotted on the x-axis, one panel each
numeric_columns = [
    'time_signature', 'key_mode', 'artist_song_count', 'album_freq',
    'movement_index', 'intensity_level', 'verbal_density', 'purity_score',
    'positivity_index', 'activity_rate', 'loudness_intensity', 'happy_dance',
    'acoustics_instrumental', 'artists_avg_popularity', 'tempo_vs_genre',
    'energy_rank_pct', 'loud_energy_ratio', 'mood_pca', 'mood_cluster',
    'acoustic_valence_mood_cluster', 'signal_strength', 'focus_factor',
    'ambient_level', 'key_sin', 'key_cos', 'duration_log', 'duration_log_z',
    'loudness_yeo', 'temp_zscore', 'resonance_factor', 'timbre_index',
    'distorted_movement', 'signal_power',
]

# Three panels per row
n_cols = 3
n_rows = math.ceil(len(numeric_columns) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 6, n_rows * 4))
fig.suptitle(f'Scatter Plots of Numerical Variables vs {target_column}', fontsize=16)

# Flat view of the grid so axes can be paired with columns
axes = axes.flatten()

for ax, col in zip(axes, numeric_columns):
    sns.scatterplot(x=col, y=target_column, data=df, ax=ax)
    ax.set_title(f'{col} vs {target_column}')
    ax.set_xlabel(col)
    ax.set_ylabel(target_column)

# Hide whatever panels were not used
for spare_ax in axes[len(numeric_columns):]:
    spare_ax.set_visible(False)

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()


In [None]:
# Target whose distribution is compared across categories
target_column = 'target_regression'

# Grouping variables, one box-plot panel each
categorical_columns = [
    'duration_1',
    'duration_2',
    'duration_3',
    'duration_4',
    'duration_5',
    'loudness_level',
    'popularity_level',
    'tempo_class',
    'explicit',
    'mode_indicator',
    'time_signature_class_boolean',
    'is_instrumental',
    'is_dance_hit',
    'echo_constant',
    'target_class',
]

# Three panels per row
n_cols = 3
n_rows = math.ceil(len(categorical_columns) / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 6, n_rows * 4))
fig.suptitle(f'Box Plots of Categorical Variables vs {target_column}', fontsize=16)

# Flat view of the grid so axes can be paired with columns
axes = axes.flatten()

for ax, col in zip(axes, categorical_columns):
    sns.boxplot(x=col, y=target_column, data=df, ax=ax)
    ax.set_title(f'{col} vs {target_column}')
    ax.tick_params(axis='x', rotation=45)  # keep long category labels legible

# Hide whatever panels were not used
for spare_ax in axes[len(categorical_columns):]:
    spare_ax.set_visible(False)

plt.tight_layout(rect=[0, 0, 1, 0.97])
plt.show()


Best Feature for Regression

In [None]:
# Rank every candidate feature by the in-sample fit of a one-variable linear
# regression against the target.

target_column = 'target_regression'  # Target variable
features = [
    'duration_1','duration_2','duration_3','duration_4','duration_5','loudness_level',
    'popularity_level','tempo_class','time_signature','key_mode','artist_song_count',
    'album_freq','movement_index','intensity_level','verbal_density','purity_score',
    'positivity_index','activity_rate','loudness_intensity','happy_dance',
    'acoustics_instrumental','artists_avg_popularity','tempo_vs_genre','energy_rank_pct',
    'loud_energy_ratio','mood_pca','mood_cluster','acoustic_valence_mood_cluster',
    'explicit','signal_strength','mode_indicator','focus_factor','ambient_level',
    'key_sin','key_cos','duration_log','duration_log_z','time_signature_class_boolean',
    'loudness_yeo','is_instrumental','is_dance_hit','temp_zscore','resonance_factor',
    'timbre_index','echo_constant','distorted_movement','signal_power'
]


def _as_numeric(series):
    """Return `series` as floats, accepting decimal commas in text columns.

    Numeric and boolean columns are cast directly. Any other column is read as
    text, has ',' replaced by '.', and is then parsed; values that still cannot
    be parsed become NaN.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series.astype(float)
    as_text = series.astype(str).str.replace(',', '.', regex=False)
    return pd.to_numeric(as_text, errors='coerce')


# Convert into a separate frame: `df` keeps its dtypes for the other cells and
# this cell gives the same result every time it is re-run.
regression_data = df[features + [target_column]].apply(_as_numeric)

results = []
skipped_features = []

# Fit one simple linear regression per feature
for feature in features:
    # LinearRegression rejects NaN, so each fit uses only the rows where both
    # the feature and the target are present (the row set can differ by feature).
    pair = regression_data[[feature, target_column]].dropna()
    if len(pair) < 2:
        # Nothing to fit, e.g. a text column with no numeric values
        skipped_features.append(feature)
        continue

    X = pair[[feature]]
    y = pair[target_column]

    lr = LinearRegression()
    lr.fit(X, y)
    pred = lr.predict(X)

    mae = mean_absolute_error(y, pred)
    mse = mean_squared_error(y, pred)
    r2 = r2_score(y, pred)

    results.append([feature, mae, mse, r2])

if skipped_features:
    print("Skipped (fewer than 2 usable numeric rows):", skipped_features)

# Collect the scores and sort best-first
results_df = pd.DataFrame(results, columns=['Feature', 'MAE', 'MSE', 'R2'])
results_df = results_df.sort_values(by='R2', ascending=False)
print("Top 10 features by R2 score:")
display(results_df.head(10))

# Bar chart of the ten best R² scores
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='R2', y='Feature', data=results_df.head(10), ax=ax)  # Plot top 10
ax.set_title('Top 10 Features Ranked by R² (Simple Linear Regression)')
ax.set_xlabel('R² Score')
ax.set_ylabel('Feature')
plt.show()


Simple Linear Regression

In [None]:
# Simple linear regression of target_regression on artists_avg_popularity.
feature_column = 'artists_avg_popularity'
target_column = 'target_regression'

# Replace decimal commas with dots and convert to float; values that cannot be
# parsed become NaN. Re-running this on already-numeric columns is a no-op.
for column in (feature_column, target_column):
    df[column] = pd.to_numeric(
        df[column].astype(str).str.replace(',', '.', regex=False),
        errors='coerce',
    )

# LinearRegression rejects NaN, so fit only on rows where both values are present
valid_rows = df[[feature_column, target_column]].dropna()
if valid_rows.empty:
    raise ValueError(
        f"No rows with numeric values in both {feature_column!r} and {target_column!r}"
    )

x = valid_rows[[feature_column]]
y = valid_rows[[target_column]]
lr = LinearRegression()
lr_model = lr.fit(x, y)

# y is a one-column frame, so predictions has shape (n_rows, 1)
predictions = lr_model.predict(x)

# Index-aligned assignment: rows left out of the fit receive NaN
df['slr_result'] = pd.Series(predictions.ravel(), index=x.index)

# Residual = actual - predicted, stored as a Series aligned to df's index
slr_error = y[target_column] - predictions.ravel()
df['slr_error'] = slr_error

print ('Slope: ', lr_model.coef_)
print ('Intercept: ',lr_model.intercept_)

print("Mean absolute error: %.2f" % np.mean(np.absolute(predictions - y.values)))
# The printed value is the mean of the squared residuals, i.e. the MSE
print("Mean squared error (MSE): %.2f" % np.mean((predictions - y.values) ** 2))
print("R2-score: %.2f" % r2_score(y.values , predictions) )

In [None]:
# Three diagnostics for the simple linear regression fitted in the previous cell:
# actual vs. predicted scatter, density comparison, and per-row prediction error.
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Panel 1: actual and predicted target values against the feature
axes[0].plot(x['artists_avg_popularity'], y, 'bo', label='Actual Values')
axes[0].plot(x['artists_avg_popularity'], predictions, 'go', label='Predicted Values')
axes[0].set_title("Scatter plot: Actual Vs. Predicted Values")
axes[0].set_xlabel("artists_avg_popularity")
axes[0].set_ylabel("target_regression")
axes[0].legend()

# Panel 2: density of actual vs. predicted values.
# sns.distplot is deprecated; kdeplot draws the same KDE-only curve.
# Both inputs are flattened to 1-D so each is drawn as a single curve.
sns.kdeplot(x=np.ravel(y), color="g", label="Actual Values", ax=axes[1])
sns.kdeplot(x=np.ravel(predictions), color="r", label="Predicted Values", ax=axes[1])
axes[1].set_title("Dist plot: Actual Vs. Predicted Values")
axes[1].legend()

# Panel 3: residual per row. Index and values both come from the stored error
# column (NaN rows removed) so the two vectors always have the same length.
errors = df['slr_error'].dropna()
sns.scatterplot(x=errors.index, y=errors.values, color="r", ax=axes[2])
axes[2].set_title("Prediction Error")
axes[2].set_xlabel("Row index")
axes[2].set_ylabel("Prediction Error")

fig.tight_layout()
plt.show()

Based on the analysis performed across all features, the variable artists_avg_popularity achieved the highest R² score.

This indicates that artists_avg_popularity is the best single predictor of target_regression, explaining the largest proportion of its variance among all tested features.

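In other words, among the features tested, the average popularity of an artist has the strongest linear relationship with `target_regression`, making it the most informative single variable in this simple predictive model. Note that R² is computed here on the same data used to fit each model, so it measures in-sample linear association rather than causal influence or out-of-sample performance.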