In [1]:
# --- Standard library ---
import os
import random
import warnings

# --- Third-party ---
import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# --- Notebook-wide configuration ---
plt.style.use("fivethirtyeight")
# NOTE: this silences every warning for the rest of the notebook,
# including deprecation warnings raised by the modelling libraries.
warnings.filterwarnings('ignore')

# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [3]:
# Relative locations of the competition CSV files.
train_path = '../input/song-popularity-prediction/train.csv'
test_path = '../input/song-popularity-prediction/test.csv'

# `train` and `test` are the raw frames every later cell works on.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [4]:
# Preview the first rows of the training frame.
train.head()

In [5]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [6]:
# Summary statistics of the numeric training columns.
train.describe()

# Missing Values 

In [7]:
# Number of missing values in each training column.
train.isnull().sum()

In [8]:
def plot_missing_share(df, dataset_name):
    """Draw, per column of ``df``, the share of missing vs. present values.

    Each column becomes one horizontal bar filled to 100 %, split by whether
    the cell is missing (``hue="missing"``).

    ``sns.displot`` is a figure-level function that creates its own figure,
    so no ``plt.figure(...)`` call is made beforehand (that would only emit
    an extra empty figure).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose missing values are visualised.
    dataset_name : str
        Label inserted into the plot title (e.g. ``'Train'`` or ``'Test'``).
    """
    sns.displot(
        data=df.isna().melt(value_name="missing"),
        y="variable",
        hue="missing",
        multiple="fill",
        aspect=3,
        palette='copper',
    )
    plt.title(
        f'Bar plot showing Non-Missing Values in {dataset_name} data',
        weight='bold', size=20, color='blue',
    )
    plt.xlabel(" Percentage ")
    plt.ylabel(" Columns ")
    plt.xticks(size=12, weight='bold', color='maroon')
    plt.yticks(size=12, weight='bold', color='maroon')


# One plot per dataset; the title now names the frame actually plotted.
plot_missing_share(train, 'Train')
plot_missing_share(test, 'Test');

# Features With Missing Values: 8 Columns
- song_duration_ms
- acousticness
- danceability 
- energy
- instrumentalness 
- key
- liveness 
- loudness

In [9]:
# Feature columns: everything except the row identifier and the target.
useful_cols = [col for col in train.columns if col not in ['id', 'song_popularity']]
# Features drawn as density curves; the three excluded ones are drawn as count plots.
cols_dist = [col for col in useful_cols if col not in ['key', 'audio_mode', 'time_signature']]
color_ = [ '#9D2417', '#AF41B4', '#003389' ,'#3C5F41',  '#967032', '#2734DE']
cmap_ = ['magma', 'copper', 'crest']


plt.figure(figsize=(16, 18))
for i, col in enumerate(useful_cols):
    plt.subplot(5, 3, i + 1)
    if col in cols_dist:
        # Pick a random colour from the palette list itself, so the draw
        # stays valid if the list is ever lengthened or shortened.
        rand_col = random.sample(color_, 1)[0]
        # `fill` is a boolean switch; the colour is passed through `color`.
        sns.kdeplot(train[col], color=rand_col, fill=True)
        plt.title(col, weight='bold', color=rand_col)
    else:
        sns.countplot(data=train, x=col, palette=random.sample(cmap_, 1)[0])
        plt.title(col, weight='bold', color='black')
    plt.ylabel(" ")
    plt.xlabel(" ")

# Extra panel: instrumentalness on a log scale, placed in the first free slot
# after the per-feature panels.
log_col = random.sample(color_, 1)[0]
# The logarithm is only defined for strictly positive values; select them
# explicitly instead of relying on NaN / -inf results being discarded.
positive_instrumentalness = train.loc[train['instrumentalness'] > 0, 'instrumentalness']
plt.subplot(5, 3, len(useful_cols) + 1)
sns.kdeplot(np.log(positive_instrumentalness), color=log_col, fill=True)
plt.title('instrumentalness (log transformed)', weight='bold', color=log_col, size=17)
plt.ylabel(" ")
plt.xlabel(" ")

# Lay out the whole grid once, after every panel has been drawn.
plt.tight_layout()
plt.show();

In [10]:
# Pairwise correlations between the columns of the training frame.
corr_matrix = train.corr()

# A wide canvas keeps the annotated cells legible.
plt.figure(figsize=(16, 6))
# The colour scale is pinned to the full correlation range [-1, 1] and every
# cell is annotated with its value. The returned Axes is kept in `heatmap`
# so further decoration (such as the title) can be applied to it.
heatmap = sns.heatmap(
    corr_matrix,
    annot=True,
    vmin=-1,
    vmax=1,
)
# `pad` is the distance between the title and the top of the heatmap.
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

# Handling Missing Values

In [11]:
def percent_missing_values(df):
    """Return the percentage of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series
        Series named ``'percent_missing'``, indexed by column name and
        sorted from the highest percentage to the lowest.
    """
    null_share = df.isnull().sum() * 100 / len(df)
    ranked = null_share.rename('percent_missing').sort_values(ascending=False)
    return ranked
# Display the per-column missing percentages of the train and test frames as a tuple.
percent_missing_values(train) , percent_missing_values(test)

In [12]:
# Columns whose missing entries are replaced by the training-set mean.
mean_list = ['song_duration_ms' ,'liveness' ,'danceability', 'acousticness' ,'instrumentalness' ,'energy' , 'loudness']
for col_name in mean_list:
    # Take the mean once, before `train` is modified, and reuse that exact
    # value for both frames so the test set is filled with the training
    # statistic rather than its own.
    train_mean = train[col_name].mean()
    train[col_name] = train[col_name].fillna(train_mean)  # Train Data
    test[col_name] = test[col_name].fillna(train_mean)    # Test Data

In [13]:
# Missing `key` entries are set to the constant 0 in both frames.
# NOTE(review): 0 may also be a genuine key value, in which case imputed rows
# become indistinguishable from real ones — confirm this is intended.
for frame in (train, test):
    key_is_missing = frame['key'].isna()
    frame.loc[key_is_missing, 'key'] = 0

In [14]:
# Select the features by name instead of by column position, so train and
# test are guaranteed to supply the same features in the same order.
# NOTE(review): assumes every non-id, non-target column of `train` is a
# feature that also exists in `test` — a missing one raises KeyError here.
feature_cols = [c for c in train.columns if c not in ('id', 'song_popularity')]

X_train = train[feature_cols].values
y_train = train.song_popularity

X_test = test[feature_cols].values

# Guard against a train/test feature mismatch before any model is fitted.
assert X_train.shape[1] == X_test.shape[1], "train and test feature counts differ"

X_train.shape , y_train.shape , X_test.shape

In [15]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from numpy import mean
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from imblearn.ensemble import BalancedBaggingClassifier


# Best: 0.635508 using {'class_weight': None, 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 300}
# 0.628425 (0.002461) with: {'class_weight': 'balanced', 'max_depth': 30, 'max_features': 'auto', 'n_estimators': 100}


# model_RF = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

# model   = RandomForestClassifier(class_weight='balanced_subsample',max_depth= 10, max_features= 'auto', n_estimators=500, random_state=1) #Best Param & Best Score 500>300
# model_LR   = LogisticRegression()
# model_XGB  = XGBClassifier()
# model_SVC  = SVC(probability=True)

# Balanced random forest with the best parameters found so far (500 trees beat 300).
# `max_features='sqrt'` replaces the former `'auto'` alias: for classifiers
# the two select the same number of features, but `'auto'` is deprecated and
# rejected by recent scikit-learn releases.
model = BalancedRandomForestClassifier(
    max_depth=10,
    max_features='sqrt',
    n_estimators=500,
    random_state=1,
)  # Best Param & Best Score 500>300


# # Voting Classifiers : 
# model = VotingClassifier(
# estimators=[('lr', model_LR), ('rf', model_RF), ('xgb', model_XGB), ('svc', model_SVC)], voting='soft') #
# model.fit(X_train, y_train)

 
    
# define model
# model = BalancedBaggingClassifier(DecisionTreeClassifier(), n_estimators=500,max_samples=100, bootstrap=True, n_jobs=-1,
#                                   sampling_strategy = 0.7 ) 
# Evaluation protocol: stratified 10-fold CV, repeated 3 times, fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# Score the model by ROC AUC on every split, using all available cores.
scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
# Report the average over all 30 splits.
print('Mean ROC AUC: %.3f' % np.mean(scores))


# unique, counts = np.unique(song_popularity, return_counts=True)
# dict(zip(unique, counts))

In [16]:
#Scores : 
# AUC : 0.571   RF(n=500)  >>  RF(n=500) With Weight 0.59538 
# AUC : 0.572   BalancedRandomForestClassifier 

# AUC : 0.561   BalancedBaggingClassifier : Majority     
# AUC : BalancedBaggingClassifier : (Strategy=0.7) >> Cost A lot of Time    

In [17]:
## model_RF = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1) 
# Fit the selected model on the full training set.
model.fit(X_train, y_train)

# Class probabilities for the test rows; column 1 is what gets submitted.
predict_proba = model.predict_proba(X_test)
# Hard class labels; they are not used by the submission built below.
predictions = model.predict(X_test)
song_popularity = predictions

# Assemble the submission: one row per test id with its column-1 probability.
submission_columns = {
    'id': test['id'],
    'song_popularity': predict_proba[:, 1],
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import RepeatedStratifiedKFold

# model = RandomForestClassifier()


# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)


# model_RF   = RandomForestClassifier(max_depth= 10, max_features= 'auto', n_estimators=300, random_state=1) #Best Param
# param_grid = [
#   {     'class_weight' :['balanced', None ], 
#         'n_estimators' : [10,100,300],
#         'max_depth' : [10,20,30],
#         'max_features' : ['auto',None],
#   } ]

# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy' , verbose = 4)
# grid_result = grid_search.fit(X_train, y_train)

In [None]:
# summarize results
# Best: 0.635508 using {'class_weight': None, 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 300}
# 0.628425 (0.002461) with: {'class_weight': 'balanced', 'max_depth': 30, 'max_features': 'auto', 'n_estimators': 100}



# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))