Dependencies installation:

In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Metrics used across models
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, accuracy_score, roc_curve, auc
from sklearn.model_selection import train_test_split



#Dependencies used in Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer



#Dependencies used in KNN
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

#Random Forest
from sklearn.ensemble import RandomForestClassifier



Now we are going to read our dataset. Notice that the dataset being loaded is the test.csv & train.csv and not JobApplicants.csv - the reason for that is because we have been working on different computers.
The split dataset guarantees that the testrainsplit is consistent as it is not run on both computers.

For more information on how the training set was created please visit traintestspilit.ipynb

In [78]:
# Load the training data
df_train = pd.read_csv('train_modified_no_columns.csv')

# Load the test data
df_test = pd.read_csv('test_modified_no_columns.csv')

#Check headers
df_train.head()

Unnamed: 0,artists,duration_ms,explicit,danceability,key,loudness,speechiness,acousticness,instrumentalness,valence,time_signature,track_genre,popularity
0,Seu Jorge,358733,False,0.641,11,-6.401,0.0604,0.151,0.000761,0.423,4,0,41
1,Chyi Chin,231520,False,0.668,5,-9.71,0.0353,0.795,0.0,0.432,3,1,52
2,Babyboomboom,98386,False,0.786,9,-16.516,0.573,0.679,0.0,0.658,4,2,11
3,Sidhu Moose Wala;DIVINE,232173,False,0.709,0,-5.817,0.245,0.0698,0.0,0.654,4,3,61
4,Rumbavana,360320,False,0.786,0,-6.742,0.0456,0.511,0.0,0.696,4,4,37


In [79]:
df_test.head()

Unnamed: 0,ID,artists,album_name,track_name,duration_ms,explicit,danceability,key,loudness,speechiness,acousticness,instrumentalness,valence,time_signature,track_genre
0,113186,1,No Other Name,No Other Name,440247,False,0.369,7,-6.984,0.0304,0.00511,0.0,0.0466,4,1
1,42819,2,Grieving Birth,Failed Organum,93933,False,0.171,7,-3.586,0.118,0.00521,0.801,0.0294,4,2
2,59311,3,Noise A Noise 20.4-1,"Save the Trees, Pt. 1",213578,False,0.173,9,-10.071,0.144,0.613,0.00191,0.0887,3,3
3,90417,4,A Thousand Stars,It's Only Make Believe,146706,False,0.419,9,-13.438,0.0322,0.32,0.0,0.462,4,4
4,61000,5,バレッタ TypeD,月の大きさ,236293,False,0.555,9,-3.294,0.0481,0.484,0.0,0.813,4,5


# Classification

## Random Forest Classifier

In [80]:
#Drop the artists column
df_train = df_train.drop(columns = ['artists'])
df_test = df_test.drop(columns = ['artists'])

In [81]:
# Creating 'popularity_category' in the training data
df_train['popularity_category'] = pd.qcut(df_train['popularity'], q=3, labels=False)


In [82]:
categorical_cols = ['explicit']
numerical_cols = ['duration_ms', 'danceability', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'valence', 'time_signature']

X_train = df_train[categorical_cols + numerical_cols]  # Input features
y_train = df_train['popularity_category']  # Target variable
X_test = df_test[categorical_cols + numerical_cols]
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [83]:
# Step 3: Create a ColumnTransformer to transform the categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

In [88]:
# Step 4: Create a pipeline that includes the preprocessor and the Random Forest model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, max_features='sqrt', n_jobs=-1))
])

In [89]:
# Step 6: Train the pipeline on the training data
pipeline.fit(X_train_split, y_train_split)

# Step 7: Make predictions on the test data
y_pred = pipeline.predict(X_test)

In [90]:
# Step 8: Evaluate the model
y_pred_val = pipeline.predict(X_val)

# Evaluate the model's performance on the validation set
accuracy_val = accuracy_score(y_val, y_pred_val)
f1_val = f1_score(y_val, y_pred_val, average='macro')  # Adjust 'average' as appropriate for your scenario
precision_val = precision_score(y_val, y_pred_val, average='macro')
recall_val = recall_score(y_val, y_pred_val, average='macro')

# Print the evaluation metrics for the validation set
print(f"Validation Accuracy: {accuracy_val}")
print(f"Validation F1 Score: {f1_val}")
print(f"Validation Precision: {precision_val}")
print(f"Validation Recall: {recall_val}")

Validation Accuracy: 0.6492324561403509
Validation F1 Score: 0.6493169396689212
Validation Precision: 0.6507969007196577
Validation Recall: 0.6484633078659062


### Tuned Model

In [91]:
"""# Define the hyperparameters and their possible values, including 'max_features'
param_grid = {
    'classifier__n_estimators': [85, 100, 115],
    'classifier__max_depth': [7, 10, 13],
    'classifier__min_samples_split': [10, 12, 15],
    'classifier__min_samples_leaf': [4, 6, 8],
    'classifier__max_features': ['auto', 'sqrt', 'log2']  # Including max_features in the grid
}

# Create a GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)


grid_search.fit(X_train_split, y_train_split) 

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Print the best parameters and model
print(f"Best Parameters: {best_params}")
print(f"Best Model: {best_model}")"""

  warn(


Best Parameters: {'classifier__max_depth': 13, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 115}
Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['duration_ms',
                                                   'danceability', 'key',
                                                   'loudness', 'speechiness',
                                                   'acousticness',
                                                   'instrumentalness',
                                                   'valence',
                                                   'time_signature']),
                                                 ('cat', OneHotEncoder(),
                                                  ['explicit'])])),
                ('classifier',
                 RandomForestClas

In [None]:
"""c:\Users\Simon\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\ensemble\_forest.py:424: FutureWarning: `max_features='auto'` has been deprecated in 1.1 and will be removed in 1.3. To keep the past behaviour, explicitly set `max_features='sqrt'` or remove this parameter as it is also the default value for RandomForestClassifiers and ExtraTreesClassifiers.
  warn(
Best Parameters: {'classifier__max_depth': 13, 'classifier__max_features': 'auto', 'classifier__min_samples_leaf': 4, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 115}
Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', 'passthrough',
                                                  ['duration_ms',
                                                   'danceability', 'key',
                                                   'loudness', 'speechiness',
                                                   'acousticness',
                                                   'instrumentalness',
                                                   'valence',
                                                   'time_signature']),
                                                 ('cat', OneHotEncoder(),
                                                  ['explicit'])])),
                ('classifier',
                 RandomForestClassifier(max_depth=13, max_features='auto',
                                        min_samples_leaf=4,
                                        min_samples_split=10, n_estimators=115,
                                        n_jobs=-1, random_state=42))])
"""

### Retraining random forest classifier with best parameters

In [92]:
best_model.fit(X_train_split, y_train_split)

  warn(


In [93]:
y_pred_test = best_model.predict(X_test)

In [94]:
# Make predictions on the validation set using the newly trained model
y_pred_val = best_model.predict(X_val)

# Evaluate the model's performance on the validation set
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

accuracy_val = accuracy_score(y_val, y_pred_val)
f1_val = f1_score(y_val, y_pred_val, average='macro')  # Use 'macro' or 'weighted' based on your classification problem
precision_val = precision_score(y_val, y_pred_val, average='macro')
recall_val = recall_score(y_val, y_pred_val, average='macro')

# Print the evaluation metrics for the validation set
print(f"Validation Accuracy: {accuracy_val}")
print(f"Validation F1 Score: {f1_val}")
print(f"Validation Precision: {precision_val}")
print(f"Validation Recall: {recall_val}")

Validation Accuracy: 0.5448464912280702
Validation F1 Score: 0.5451654657396157
Validation Precision: 0.5467932655150806
Validation Recall: 0.5444673247502131


In [None]:
# Making final predictions on the test data
y_pred_test = pipeline.predict(X_test)

# Exporting predictions to CSV for submission
predictions_df = pd.DataFrame(y_pred_test, columns=['Predicted_Popularity_Category'])
predictions_df.to_csv('random_forest_classifier_predictions.csv', index=False)
print("Final test set predictions have been exported to 'final_test_predictions.csv'.")