In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # linear algebra
import pandas as pd

In [None]:
# Preview the raw TikTok songs CSV straight from the Kaggle input directory.
# The result is only displayed here; it is not stored in a variable.
pd.read_csv('/kaggle/input/tiktok/TikToksongs2022.csv')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor

<h1 id="1.-Exploratory-Data-Analysis"><p style="background-color:#A378C6;color:white;font-size:100%;text-align:center;border-radius:100px 100px;">1. Exploratory Data Analysis</p><a class="anchor-link" href="#1.-Exploratory-Data-Analysis">&#182;</a></h1><p>In this section, I will explore the dataset and try to answer the different questions asked above.</p>

In [None]:
# Load the TikTok songs dataset, report its dimensions, then preview the first rows.
df = pd.read_csv("/kaggle/input/tiktok/TikToksongs2022.csv")
n_rows, n_cols = df.shape
print("There are {} observations for {} predictors.".format(n_rows, n_cols))
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

This shows that there are no missing values in the dataset.

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric columns.
df.describe(include='all')

In [None]:
# Distribution of track popularity: box plot on the left, histogram with a
# KDE curve on the right. Each panel is drawn once on its own axes so that
# two differently styled copies of the same plot do not overlap.
fig, (ax_box, ax_hist) = plt.subplots(1, 2, figsize=(15, 5))

sns.boxplot(data=df, x='track_pop', color='purple', ax=ax_box)
ax_box.set_title('Track popularity (box plot)')

sns.histplot(data=df, x='track_pop', kde=True, color='purple', edgecolor='yellow', ax=ax_hist)
ax_hist.set_title('Track popularity (histogram)')

plt.show()

<blockquote><p>📌 The average popularity is 65 and the scores go between 0 and 100.</p>
</blockquote>

<h1><span style="color:purple">What are the most popular tracks?</span></h1>
</div>

In [None]:
# The five tracks with the highest popularity score.
# Rows are sorted by `track_pop` in descending order (grouping on the score
# would order the result ascending and return the least popular tracks), and
# repeated track/artist pairs are collapsed so each song is listed only once.
(
    df[["track_name", "artist_name", "track_pop"]]
    .sort_values("track_pop", ascending=False)
    .drop_duplicates(subset=["track_name", "artist_name"])
    .head(5)
)

<blockquote><p>📌 The most famous tracks are "Without You", "Write This Down (Instrumental)", "Aesthetic", "Positions" and "WHATS POPPIN".</p>
</blockquote>

<h1><span style="color:purple">Who are the most popular artists?</span></h1>

In [None]:
# Names of the five artists with the highest artist popularity:
# sort the rows by `artist_pop` (descending) and keep the first five distinct names.
artists_by_popularity = df.sort_values('artist_pop', ascending=False)["artist_name"]
top_artists = artists_by_popularity.unique()[:5]
top_artists

In [None]:
# Group every artist outside the top five under "Other" on a *copy* of the
# two columns needed for the plot. `df` keeps its original artist names, so
# later cells that count or group by artist still see the real values.
top_artist_pop = df[["artist_name", "artist_pop"]].copy()
top_artist_pop.loc[~top_artist_pop["artist_name"].isin(top_artists), "artist_name"] = "Other"

# Mean artist popularity per group (seaborn's barplot aggregates with the mean).
plt.figure(figsize=(10,5))
sns.barplot(data=top_artist_pop, x='artist_name', y='artist_pop', color='purple', edgecolor='black')
plt.xlabel('Artist')
plt.ylabel('Artist popularity')
plt.show()

In [None]:
# Names of the five artists that appear most often in the dataset.
artist_track_counts = df[["artist_name"]].value_counts().sort_values(ascending=False)
most_artists = artist_track_counts.iloc[:5].reset_index()["artist_name"]
most_artists

In [None]:
# Group every artist outside the five most frequent ones under "Other" on a
# *copy* of the artist column, leaving the artist names in `df` untouched for
# the cells that follow.
artist_track_groups = df[["artist_name"]].copy()
artist_track_groups.loc[~artist_track_groups["artist_name"].isin(most_artists), "artist_name"] = "Other"

# Number of tracks per artist group.
plt.figure(figsize=(12,6))
sns.countplot(data=artist_track_groups, x="artist_name", color='purple')
plt.xlabel('Artist')
plt.ylabel('Number of tracks')
plt.show()

<h3>📌 Some artists seem to have more tracks in the most popular tracks. But the most popular artists are Drake, The Weeknd, Harry Styles, Ed Sheeran, Justin Bieber. </h3>

<h1><span style="color:purple">How do the different mood descriptions of the track influence the popularity?</span></h1>

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the mood descriptors and track popularity.
selected_columns = ['danceability', 'energy', 'speechiness', 'liveness', 'valence', 'track_pop']
correlation_matrix = df.loc[:, selected_columns].corr()

# Annotated heatmap on a diverging palette centred on zero and fixed to [-1, 1].
fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    vmin=-1.0,
    vmax=1.0,
    linewidths=1,
)
ax.set_title('Correlation Heatmap of Mood Descriptors and Track Popularity')
plt.show()

📌 It seems that the liveness and the valence are the most linearly correlated with the popularity. There are also some correlations between the predictors, such as between valence and danceability.


From the heatmap, we can observe the following:

- "Danceability" has a moderate positive correlation (0.56) with track popularity, suggesting that more danceable tracks are generally more popular.
- "Energy" has a weak positive correlation (0.17) with track popularity.
- "Speechiness" has a weak negative correlation (-0.13) with track popularity.
- "Liveness" has a very weak negative correlation (-0.07) with track popularity.
- "Valence" has a very weak negative correlation (-0.08) with track popularity.



<h1><span style="color:purple">Are the different acoustic characteristics of the track important for popularity?</span></h1>

In [None]:
# Pairwise correlations between the acoustic characteristics and track popularity.
acoustic_columns = ['loudness', 'mode', 'key', 'tempo', 'time_signature', 'acousticness','track_pop']
acoustic_correlation_matrix = df[acoustic_columns].corr()

# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of variables is shown only once.
mask = np.triu(np.ones_like(acoustic_correlation_matrix, dtype=bool))

# Annotated heatmap on a diverging palette centred on zero and fixed to [-1, 1].
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    acoustic_correlation_matrix,
    mask=mask,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='RdBu_r',
    center=0,
    vmin=-1.0,
    vmax=1.0,
    linewidths=1,
)
ax.set_title('Correlation Heatmap of Acoustic Characteristics and Track Popularity')
plt.show()

The heatmap above shows the correlation coefficients between different acoustic characteristics ("loudness," "mode," "key," "tempo," "time_signature") and track popularity ("track_pop").

Here's how to interpret the correlation values:

- A value close to 1 indicates a strong positive correlation.
- A value close to -1 indicates a strong negative correlation.
- A value close to 0 indicates no correlation.

From the heatmap, we can observe the following:

- "Loudness" has a weak positive correlation (0.25) with track popularity, suggesting that louder tracks may be slightly more popular.
- "Mode" has a very weak negative correlation (-0.06) with track popularity.
- "Key" has a very weak negative correlation (-0.02) with track popularity.
- "Tempo" has a very weak negative correlation (-0.06) with track popularity.
- "Time_signature" has a very weak positive correlation (0.05) with track popularity.

Here are the correlations between different acoustic characteristics and track popularity:

**Loudness:** $0.0957$

**Acousticness:** $0.0311$

**Instrumentalness:** $-0.0644$

**Liveness:** $-0.1274$

From the correlations:

Loudness and Acousticness have positive correlations with track popularity, suggesting tracks that are louder and more acoustic in nature tend to be slightly more popular. Instrumentalness has a negative correlation, indicating tracks with more instrumental content might be slightly less popular on TikTok. Liveness has the most negative correlation among these characteristics. This suggests tracks that sound more like they're performed live (or have audience sounds) might be less popular.



<h1><span style="color:purple">Does the duration of the track influence its popularity?</span></h1>

In [None]:
# Scatter plot of track popularity against duration with a fitted regression line.
sns.regplot(
    data=df,
    x="duration_ms",
    y="track_pop",
    scatter_kws={"color": "purple"},
    line_kws={"color": "pink"},
)
plt.show()

📌 There is a slight link between the duration of the track and the popularity.

<h1><span style="color:purple">General importance of every predictor:</span></h1>

In [None]:
def make_mi_scores(X, y):
    """Return the mutual information of every column of ``X`` with ``y``.

    Object and categorical columns are integer-encoded on a copy of ``X``
    (the caller's frame is not modified). Columns with an integer dtype are
    passed to ``mutual_info_regression`` as discrete features.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by column name and sorted in
        descending order.
    """
    encoded = X.copy()
    categorical_columns = encoded.select_dtypes(["object", "category"]).columns
    for column in categorical_columns:
        # factorize() returns (codes, uniques); only the integer codes are kept.
        encoded[column] = encoded[column].factorize()[0]

    # After encoding, an integer dtype marks a discrete feature.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    raw_scores = mutual_info_regression(encoded, y, discrete_features=is_discrete, random_state=0)

    scores = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return scores.sort_values(ascending=False)

In [None]:
# Separate the target (`track_pop`) from the predictors, then score every
# predictor by its mutual information with the target.
y = df["track_pop"].copy()
X = df.drop(columns="track_pop")

mi_scores = make_mi_scores(X, y)

In [None]:
# make_mi_scores already returns the scores sorted in descending order;
# they are sorted again here before being printed.
print(mi_scores.sort_values(ascending=False))

📌 For the rest of the analysis, predictors with an MI score equal to zero will not be included.

<h1 id="2.-Predictions"><p style="background-color:#A378C6;color:white;font-size:100%;text-align:center;border-radius:100px 100px;">2. Predictions</p></h1>

Now for the fun part :) I will try to predict the popularity with the given dataset.

In [None]:
def load_data(path="/kaggle/input/tiktok/TikToksongs2022.csv", test_size=0.3, random_state=1):
    """Load the TikTok songs CSV and return a train/test split for modelling.

    The selected audio features are kept as they are, ``artist_name`` is
    one-hot encoded (first level dropped) and ``track_pop`` is the target.

    Parameters
    ----------
    path : str
        Location of the CSV file. Defaults to the Kaggle input path.
    test_size : float
        Fraction of the rows held out for testing.
    random_state : int
        Seed for the shuffled split.

    Returns
    -------
    tuple
        ``(xtrain, xtest, ytrain, ytest)``.
    """
    columns = ["artist_name", "danceability", "energy", "speechiness", "acousticness",
               "liveness", "mode", "tempo", "time_signature", "track_pop"]
    # The first CSV column is used as the row index.
    songs = pd.read_csv(path, index_col=0)[columns]

    # One-hot encode the artist; drop_first avoids a redundant (collinear) column.
    artist_dummies = pd.get_dummies(songs["artist_name"], drop_first=True)
    encoded = pd.concat([songs.drop(columns=["artist_name"]), artist_dummies], axis=1)

    y = encoded["track_pop"]
    X = encoded.drop(columns=["track_pop"])

    xtrain, xtest, ytrain, ytest = train_test_split(
        X, y, random_state=random_state, test_size=test_size, shuffle=True
    )
    return xtrain, xtest, ytrain, ytest

In [None]:
# Build the train/test split used by the models below (load_data holds out 30 % of the rows for testing).
xtrain,xtest,ytrain,ytest = load_data()

**Define the best model:**

In [None]:
def stackblend_reg(x_train,y_train,x_test,models,code,N=20,final_layer=LinearRegression()):
    """Train a two-level stacked ensemble and return its predictions for ``x_test``.

    Level 1: the training rows are shuffled (fixed seed) and cut into ``N``
    contiguous folds. For every fold, each estimator in ``models`` is fitted
    on the remaining rows and predicts the held-out fold, which yields one
    out-of-fold prediction per training row and model. Each estimator is then
    refitted on the full, unshuffled training set to predict ``x_test``.

    Level 2: ``final_layer`` is fitted on the out-of-fold predictions against
    the shuffled target and applied to the level-1 predictions for ``x_test``.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series or single-column pandas.DataFrame
        Training target; it is joined to ``x_train`` column-wise.
    x_test : pandas.DataFrame
        Features to predict.
    models : list
        Estimators exposing ``fit``/``predict``. They are fitted in place.
    code : list of str
        Column names for the level-1 predictions; ``code[i]`` labels ``models[i]``.
    N : int
        Number of folds. Must satisfy ``2 <= N <= number of training rows``.
    final_layer : estimator
        Level-2 estimator. The default instance is created once, when the
        function is defined, and is refitted on every call.

    Returns
    -------
    The output of ``final_layer.predict`` on the level-1 test predictions.

    Raises
    ------
    ValueError
        If ``N`` is outside the valid range.
    """
    # Shuffle features and target together so that the folds are random.
    shuffled = pd.concat([x_train, y_train], axis=1)
    shuffled = shuffled.sample(frac=1, random_state=1).reset_index(drop=True)
    fold_x = shuffled.iloc[:, :-1]
    fold_y = shuffled.iloc[:, -1:]  # the target stays a one-column frame (2-D)

    n_rows = len(shuffled)
    if not 2 <= N <= n_rows:
        raise ValueError(
            "N must be between 2 and the number of training rows ({}); got {}".format(n_rows, N)
        )

    # Fold boundaries: N - 1 folds of equal size, the last fold takes the remainder.
    fold_size = n_rows // N
    edges = [fold_size * i for i in range(N)] + [n_rows]

    def _flatten(values):
        # predict() output may be 1-D or (n, 1); reduce both to a flat list.
        flat = []
        for value in values:
            if isinstance(value, list):
                flat.extend(value)
            else:
                flat.append(value)
        return flat

    def _level_one(train_x, train_y, predict_x):
        # Fit every base model and collect its predictions, one column per entry of `code`.
        predictions = []
        for model in models:
            model.fit(train_x, train_y)
            predictions.append(_flatten(model.predict(predict_x).tolist()))

        frame = pd.DataFrame()
        for position, column in enumerate(code):
            frame[column] = predictions[position]
        return frame

    # Out-of-fold predictions, in shuffled row order so they line up with fold_y.
    oof_frames = []
    for lo, hi in zip(edges[:-1], edges[1:]):
        # After reset_index the labels equal the positions, so dropping the
        # held-out labels keeps every other row in its original order.
        held_out = fold_x.index[lo:hi]
        oof_frames.append(
            _level_one(fold_x.drop(index=held_out), fold_y.drop(index=held_out), fold_x.iloc[lo:hi])
        )
    # Concatenate once; this avoids starting from an empty object-dtype frame.
    final_df = pd.concat(oof_frames, ignore_index=True)

    # Refit the base models on the full training set to predict the test set.
    final_test = _level_one(x_train, y_train, x_test)

    final_layer.fit(final_df, fold_y)
    return final_layer.predict(final_test)

In [None]:
# Candidate regressors, keyed by display name; each entry wraps the estimator under "model".
models = {
    "KNeighborsRegressor": {"model": KNeighborsRegressor(n_neighbors=50)},
    "CatBoostRegressor": {"model": CatBoostRegressor(logging_level ='Silent',iterations=100)},
    "LinearRegression": {"model": LinearRegression()},
    "XGBRegressor": {"model": XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1)},
}

In [None]:
def score_dataset(X, y, model):
    """Cross-validate ``model`` on ``(X, y)`` and return ``(mean R², mean MAE)``.

    Categorical columns are replaced by their integer codes on a copy of
    ``X``, so the caller's frame is left unchanged. Scoring uses 5-fold
    cross-validation.

    Returns
    -------
    tuple of float
        Mean R² and mean absolute error over the folds (MAE as a positive number).
    """
    X = X.copy()
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes

    scores = cross_validate(model, X, y, cv=5, scoring=["r2","neg_mean_absolute_error"])
    score_r2 = scores["test_r2"].mean()
    # scikit-learn reports the negated MAE; flip the sign back.
    score_mae = -scores["test_neg_mean_absolute_error"].mean()

    return score_r2, score_mae

In [None]:
def get_scores(models,xtrain,ytrain):
    """Fit every model on the training data and print its cross-validated scores.

    ``models`` maps a display name to a dict holding the estimator under the
    key "model". For each entry the estimator is fitted on
    ``(xtrain, ytrain)``, then its mean R² and MAE from ``score_dataset`` are
    printed.
    """
    for name, entry in models.items():
        estimator = entry["model"]
        estimator.fit(xtrain, ytrain)

        score_r2, score_mae = score_dataset(xtrain, ytrain, model=estimator)
        report_lines = [
            "--- "+name+" ---",
            "Score r2: {}".format(score_r2),
            "Score MAE: {}".format(score_mae),
            "\n",
        ]
        for line in report_lines:
            print(line)

In [None]:
# Print the 5-fold cross-validated R² and MAE of every candidate model on the training split.
get_scores(models,xtrain,ytrain)

In [None]:
# Stack XGBoost, k-NN and CatBoost as base models (10 folds) and blend their
# predictions with a k-NN regressor to predict the test set.
base_models = [
    XGBRegressor(n_jobs=5,learning_rate=0.1,max_depth=10,random_state=1),
    KNeighborsRegressor(n_neighbors=50),
    CatBoostRegressor(logging_level ='Silent',iterations=500,random_state=1),
]
stack_pred = stackblend_reg(
    xtrain,
    ytrain,
    xtest,
    models=base_models,
    code=['xgb_reg','knn_reg','cat_reg'],
    N=10,
    final_layer=KNeighborsRegressor(n_neighbors=50),
)

In [None]:
# Evaluate the stacked model on the held-out test set.
score_r2 = r2_score(ytest, stack_pred)
score_mae = mean_absolute_error(ytest, stack_pred)

# Actual vs. predicted popularity for the test set.
fig, ax = plt.subplots(figsize=(10,10))
ax.set_title("Track popularity: actual vs. predicted", fontsize=20)
ax.set_xlabel('Test actual popularity', fontsize=12)
ax.set_ylabel('Test predicted popularity', fontsize=12)
ax.scatter(ytest, stack_pred)

# Place the metrics in axes-relative coordinates so they stay inside the
# plot whatever the range of the data.
ax.text(0.05, 0.95, '$ R^{2} $ = ' + str(round(score_r2, 4)),
        transform=ax.transAxes, va='top', fontsize=20)
ax.text(0.05, 0.88, 'MAE = ' + str(round(score_mae, 2)),
        transform=ax.transAxes, va='top', fontsize=20)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictors and target for the model comparison.
features = ['danceability', 'energy', 'speechiness', 'liveness', 'valence', 'loudness', 'mode', 'key', 'tempo', 'time_signature']
target = 'track_pop'

# 80/20 train/test split with a fixed seed.
X, y = df[features], df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features; the scaler is fitted on the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Regressors to compare, keyed by display name.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'Support Vector Machines': SVR(),
    'Neural Network': MLPRegressor(random_state=42, max_iter=500)
}

# Column name -> metric function, in the order the columns are displayed.
metric_functions = {
    'Mean Absolute Error': mean_absolute_error,
    'Mean Squared Error': mean_squared_error,
    'R2 Score': r2_score,
}

# One list per column; each fitted model appends one entry to every list.
evaluation_metrics = {'Model': [], **{metric_name: [] for metric_name in metric_functions}}

for model_name, model in models.items():
    # Fit on the scaled training data, then score the scaled test data.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    evaluation_metrics['Model'].append(model_name)
    for metric_name, metric_function in metric_functions.items():
        evaluation_metrics[metric_name].append(metric_function(y_test, y_pred))

# Table of test-set metrics, one row per model.
evaluation_df = pd.DataFrame(evaluation_metrics)

evaluation_df


In [None]:
# Extra model evaluated with the same split and scaling as the models above.
additional_model = {
    'K-Nearest Neighbors': KNeighborsRegressor()
}

# Collect the new rows separately instead of appending to
# `evaluation_metrics`, so that re-running this cell cannot add duplicate rows.
additional_rows = []
for model_name, model in additional_model.items():
    # Fit on the scaled training data, then score the scaled test data.
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    additional_rows.append({
        'Model': model_name,
        'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
        'Mean Squared Error': mean_squared_error(y_test, y_pred),
        'R2 Score': r2_score(y_test, y_pred),
    })

# Full comparison table: the earlier models followed by the additional one.
evaluation_df = pd.concat(
    [pd.DataFrame(evaluation_metrics), pd.DataFrame(additional_rows)],
    ignore_index=True,
)

evaluation_df


In [None]:
# Compare the actual popularity with the predictions of every fitted model.
fitted_models = {**models, **additional_model}
# One distinct colour per model so the legend entries can be told apart.
model_colors = sns.color_palette('Dark2', len(fitted_models))

plt.figure(figsize=(16, 10))
sample_positions = range(len(y_test))
plt.scatter(sample_positions, y_test, label='Actual', alpha=0.6, color='blue')

for (model_name, model), model_color in zip(fitted_models.items(), model_colors):
    y_pred = model.predict(X_test_scaled)
    plt.scatter(sample_positions, y_pred, label=f'Predicted ({model_name})', alpha=0.4, color=model_color)

plt.title('Comparison of Actual and Predicted Track Popularity')
plt.xlabel('Test Samples')
plt.ylabel('Track Popularity')
plt.legend()
plt.show()


The prediction is not totally accurate, but with more historical data, more work on the artists and albums, and an NLP study of the titles, the final score could be increased.

# <p style="background-color:#8E3277;color:white;font-size:100%;text-align:center;border-radius:10px 10px;">Take away points</p>

1. The top 5 tracks are Without You, Write This Down (Instrumental), Aesthetic, positions and WHATS POPPIN.

2. The top 5 artists are Drake, The Weeknd, Harry Styles, Ed Sheeran, Justin Bieber.

3. We can try to predict the popularity based on the mood of the song and its acoustic characteristics.

Here are some extra visualizations I wanted to throw in:

1. **Most Popular Artists**: Ranking Top 10 artists based on their average track popularity.
2. **Influence of Mood on Popularity**: Examining how different mood descriptors (like danceability, energy, valence, etc.) correlate with track popularity.
3. **Influence of Track Duration**: We'll analyze if the duration of the track has any correlation with its popularity.

Let's start with the first question: identifying the most popular artists based on their average track popularity.

In [None]:
# Ten artists with the highest mean track popularity.
popular_artists = (
    df.groupby('artist_name')['track_pop']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

# Horizontal bar chart; bars are sorted ascending so the top artist is drawn at the top.
fig, ax = plt.subplots(figsize=(12, 6))
popular_artists.sort_values().plot(kind='barh', color='purple', ax=ax)
ax.set_title('Top 10 Most Popular Artists on TikTok in 2022')
ax.set_xlabel('Average Track Popularity')
ax.set_ylabel('Artist Name')
ax.grid(axis='x', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

Here's the visualization showcasing the top 10 most popular artists on TikTok in 2022 based on their average track popularity.

**Next, we'll examine how different mood descriptors (danceability, energy, valence, etc.) correlate with track popularity.**

In [None]:
# Correlation of each mood descriptor with track popularity.
mood_descriptors = ['danceability', 'energy', 'valence', 'speechiness', 'mode']
mood_correlation_matrix = df[[*mood_descriptors, 'track_pop']].corr()
# Keep only the `track_pop` column, restricted to the descriptor rows.
mood_correlation = mood_correlation_matrix.loc[mood_descriptors, 'track_pop']

mood_correlation

In [None]:
# Correlation of each mood descriptor with track popularity.
mood_descriptors = ['danceability', 'energy', 'valence', 'speechiness', 'mode']
mood_correlation_matrix = df[[*mood_descriptors, 'track_pop']].corr()
mood_correlation = mood_correlation_matrix.loc[mood_descriptors, 'track_pop']

# Bar chart of the coefficients, sorted from most negative to most positive.
fig, ax = plt.subplots(figsize=(12, 6))
mood_correlation.sort_values().plot(kind='bar', color='purple', ax=ax)
ax.set_title('Correlation of Mood Descriptors with Track Popularity')
ax.set_ylabel('Correlation Coefficient')
ax.set_xlabel('Mood Descriptors')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()



Finally, let's explore how the duration of the track influences its popularity.

In [None]:
# Correlation between track duration and track popularity, kept as a one-element Series.
duration_correlation_matrix = df[['duration_ms', 'track_pop']].corr()
duration_correlation = duration_correlation_matrix.loc[['duration_ms'], 'track_pop']

duration_correlation

In [None]:
# Scatter plot of track popularity against track duration.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(df['duration_ms'], df['track_pop'], alpha=0.5, color='purple')
ax.set_title('Relationship between Track Duration and Popularity')
ax.set_xlabel('Track Duration (in milliseconds)')
ax.set_ylabel('Track Popularity')
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
fig.tight_layout()
plt.show()

The correlation between track duration and track popularity is 0.2251.

As observed from the plot, there's a general trend suggesting longer tracks tend to be more popular, aligning with the positive correlation we calculated earlier.

This positive correlation suggests that longer tracks tend to be more popular on TikTok. However, it's essential to understand that while correlation indicates a relationship, it doesn't necessarily imply causation. There might be other factors influencing track popularity.

**To sum up:**

Valence, loudness, acousticness, and track duration are positively correlated with track popularity. Danceability, energy, speechiness, instrumentalness, and liveness tend to be negatively correlated with popularity, but these correlations are relatively mild.