Dataset balancing

In [None]:
import pandas as pd

# Read the tab-separated user table and count how many rows carry each gender label.
df_output_data_new = pd.read_csv('output_data_new.txt', sep='\t')
df_output_data_new['gender'].value_counts()


m    34085
f    13501
Name: gender, dtype: int64

In [None]:
# Partition the user table by gender label so each class can be sized separately.
male_rows = df_output_data_new.loc[df_output_data_new['gender'].eq('m')]
female_rows = df_output_data_new.loc[df_output_data_new['gender'].eq('f')]

In [None]:
# Down-sample the male class to the size of the female class.
# Rows are drawn at random with a fixed seed instead of taking the first N rows:
# the displayed table appears to be ordered by registered_unixtime, so a head slice
# keeps only the earliest-registered male users and confounds gender with account age.
# sort_index() puts the sampled rows back in their original relative order.
# NOTE: stored outputs of the cells below were produced with the old head slice; re-run them.
n_male = min(len(male_rows), len(female_rows))
df_male = male_rows.sample(n=n_male, random_state=42).sort_index()
df_male

Unnamed: 0,user_id,country,age,gender,playcount,registered_unixtime
0,384,UK,35,m,42139,1035849600
1,3653,UK,31,m,18504,1041033600
2,4813,US,43,m,640,1050364800
3,5069,AT,30,m,31867,1051488000
4,6958,US,36,m,34788,1057536000
...,...,...,...,...,...,...
17597,13818257,UK,20,m,23788,1227186966
17599,13823101,FI,18,m,81203,1227204495
17600,13823675,RU,21,m,33808,1227206360
17602,13824897,UK,-1,m,50371,1227210107


In [None]:
# Stack the down-sampled male rows above all female rows and renumber the index from 0.
combined_df = pd.concat(objs=[df_male, female_rows], axis=0, ignore_index=True)
combined_df

Unnamed: 0,user_id,country,age,gender,playcount,registered_unixtime
0,384,UK,35,m,42139,1035849600
1,3653,UK,31,m,18504,1041033600
2,4813,US,43,m,640,1050364800
3,5069,AT,30,m,31867,1051488000
4,6958,US,36,m,34788,1057536000
...,...,...,...,...,...,...
26996,50074447,BR,23,f,151,1338427464
26997,50077367,BR,21,f,161,1338445489
26998,50081211,ES,21,f,58,1338469348
26999,50796677,PL,110,f,1495,1342344762


Data pre-processing

In [None]:
import numpy as np

# Load the additional per-user statistics and rename the id column so it matches `combined_df`.
df = pd.read_csv('LFM-1b_users_additional.txt', delimiter='\t' )
df_add = df.rename (columns={'user-id':'user_id'})

# Join on user_id (pd.merge defaults to an inner join), keeping only users present in both tables.
data  = pd.merge(df_add, combined_df, on='user_id')

# '?' is treated as a missing-value marker.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)

# Drop every row that has a missing value in any column (not only the model features).
data = data.dropna()

1. Logistic Regression using 2 independent features

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictors: listening-event count and distinct-artist count. Target: gender.
X = data.loc[:, ['cnt_listeningevents', 'cnt_distinct_artists']]
y = data.loc[:, 'gender']

# Hold out 25% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# The scaler is fitted on the training split only and then reused for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit the classifier and predict the held-out rows.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Overall accuracy followed by the per-class report.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

# The fitted coefficients (first row of coef_) are reported as "importance".
# The table is ordered by signed value, so negative coefficients end up at the bottom.
feature_importances = model.coef_[0]
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(importance_df)


Accuracy: 0.60451197053407
              precision    recall  f1-score   support

           f       0.63      0.19      0.29      1859
           m       0.60      0.92      0.73      2485

    accuracy                           0.60      4344
   macro avg       0.61      0.55      0.51      4344
weighted avg       0.61      0.60      0.54      4344

Feature Importances:
                Feature  Importance
1  cnt_distinct_artists    0.396262
0   cnt_listeningevents    0.167917


2. Logistic Regression using all 5 independent features - best logistic regression accuracy (0.650)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Five predictors (three count features plus artist novelty and mainstreaminess); target: gender.
X = data.loc[:, [
    'cnt_listeningevents',
    'cnt_distinct_artists',
    'cnt_distinct_tracks',
    'novelty_artist_avg_year',
    'mainstreaminess_avg_year',
]]
y = data.loc[:, 'gender']

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Learn the scaling statistics from the training rows, then scale both splits with them.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train logistic regression on the scaled training rows and score the test rows.
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

# Coefficients of the fitted model, listed from largest to smallest signed value.
feature_importances = model.coef_[0]
importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
importance_df = importance_df.sort_values('Importance', ascending=False)

print("Feature Importances:")
print(importance_df)

Accuracy: 0.6503222836095764
              precision    recall  f1-score   support

           f       0.62      0.46      0.53      1859
           m       0.66      0.79      0.72      2485

    accuracy                           0.65      4344
   macro avg       0.64      0.63      0.63      4344
weighted avg       0.65      0.65      0.64      4344

Feature Importances:
                    Feature  Importance
2       cnt_distinct_tracks    0.972168
3   novelty_artist_avg_year   -0.058334
1      cnt_distinct_artists   -0.076637
4  mainstreaminess_avg_year   -0.144761
0       cnt_listeningevents   -0.180881


3. SVM with all 5 features

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC  # Support Vector Classifier
# classification_report is used below; importing it here keeps the cell from
# depending on an import made in an earlier cell.
from sklearn.metrics import accuracy_score, classification_report


# Select the five features and the target variable (gender)
X = data[['cnt_listeningevents', 'cnt_distinct_artists', 'cnt_distinct_tracks', 'novelty_artist_avg_year', 'mainstreaminess_avg_year']]
y = data['gender']

# Split the data into training (75%) and testing (25%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Standardize features: fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train a Support Vector Machine (SVM) model with default settings
model = SVC(random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Calculate accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))



Accuracy: 0.6553867403314917
              precision    recall  f1-score   support

           f       0.67      0.38      0.49      1859
           m       0.65      0.86      0.74      2485

    accuracy                           0.66      4344
   macro avg       0.66      0.62      0.61      4344
weighted avg       0.66      0.66      0.63      4344



4. K-Means clustering with all 5 features

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans  # KMeans clustering
from sklearn.metrics import silhouette_score
from sklearn.metrics import classification_report

# Select the five features and the target variable (gender)
X = data[['cnt_listeningevents', 'cnt_distinct_artists', 'cnt_distinct_tracks', 'novelty_artist_avg_year', 'mainstreaminess_avg_year']]
y = data['gender']

# Split the data into training (75%) and testing (25%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Standardize features: fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train a KMeans clustering model (labels are not used for fitting)
num_clusters = 2  # Number of clusters (assuming 2 for gender)
model = KMeans(n_clusters=num_clusters, random_state=42)
model.fit(X_train_scaled)

# Predict the cluster labels for the test set
cluster_labels = model.predict(X_test_scaled)

# KMeans cluster ids are arbitrary, so cluster 0 is not inherently 'f'.
# Pick the cluster -> gender assignment that agrees best with the training labels
# (assumes the only labels are 'f' and 'm'); a tie keeps the 0 -> 'f', 1 -> 'm' mapping.
train_guess = np.where(model.labels_ == 0, 'f', 'm')
if np.mean(train_guess == y_train.to_numpy()) >= 0.5:
    gender_for_cluster0, gender_for_cluster1 = 'f', 'm'
else:
    gender_for_cluster0, gender_for_cluster1 = 'm', 'f'

# Convert test-set cluster labels to gender strings using the chosen mapping
cluster_labels_str = np.where(cluster_labels == 0, gender_for_cluster0, gender_for_cluster1)

# Silhouette score measures cluster separation only; it does not use the gender labels
silhouette_avg = silhouette_score(X_test_scaled, cluster_labels)
print("Silhouette Score:", silhouette_avg)

# Compare the mapped cluster labels with the true genders
print(classification_report(y_test, cluster_labels_str))




Silhouette Score: 0.43097143277513156
              precision    recall  f1-score   support

           f       0.23      0.08      0.11      1859
           m       0.54      0.81      0.65      2485

    accuracy                           0.50      4344
   macro avg       0.39      0.44      0.38      4344
weighted avg       0.41      0.50      0.42      4344



5. Random forest classifier (all 5 features) - accuracy 0.650, comparable to logistic regression (the SVM scores highest at 0.655)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Five predictors and the gender target.
X = data.loc[:, [
    'cnt_listeningevents',
    'cnt_distinct_artists',
    'cnt_distinct_tracks',
    'novelty_artist_avg_year',
    'mainstreaminess_avg_year',
]]
y = data.loc[:, 'gender']

# 80/20 train/test split with a fixed seed (the features are not scaled for this model).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Forest of 100 trees, seeded for repeatability.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Importances reported by the fitted forest, listed in the same order as the feature columns.
feature_importance = rf_model.feature_importances_
feature_names = X.columns

print("Feature Importance:")
for idx, name in enumerate(feature_names):
    print(f"{name}: {feature_importance[idx]:.4f}")


Accuracy: 0.6498849252013809
              precision    recall  f1-score   support

           f       0.61      0.49      0.54      1479
           m       0.67      0.77      0.72      1997

    accuracy                           0.65      3476
   macro avg       0.64      0.63      0.63      3476
weighted avg       0.64      0.65      0.64      3476

Feature Importance:
cnt_listeningevents: 0.1980
cnt_distinct_artists: 0.1892
cnt_distinct_tracks: 0.2532
novelty_artist_avg_year: 0.1831
mainstreaminess_avg_year: 0.1764


In [None]:
# Reload the additional user statistics, keep three columns, and count missing values per column.
df = pd.read_csv('LFM-1b_users_additional.txt', sep='\t').loc[
    :, ['cnt_listeningevents', 'cnt_distinct_artists', 'novelty_artist_avg_year']
]
df.isnull().sum()

In [None]:
# Load dataset: additional per-user statistics plus the full (imbalanced) user table
import numpy as np
df = pd.read_csv('LFM-1b_users_additional.txt', delimiter='\t' )
df_output_data_new = pd.read_csv('output_data_new.txt', delimiter='\t' )

# Rename the id column so both tables share the join key, then join on it
# (pd.merge defaults to an inner join).
df_add = df.rename (columns={'user-id':'user_id'})
data  = pd.merge(df_add, df_output_data_new, on='user_id')

# '?' is treated as a missing-value marker.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
data = data.replace('?', np.nan)

# Drop every row that has a missing value in any column (not only the model features).
data = data.dropna()

Classification model predicting gender from listening events and the count of artists

Actual - Imbalanced dataset

##1. Logistic Regression (2 input features)

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Predictors: listening-event count and distinct-artist count. Target: gender.
X = data.loc[:, ['cnt_listeningevents', 'cnt_distinct_artists']]
y = data.loc[:, 'gender']

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Scaling statistics come from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit logistic regression and predict the held-out rows.
model = LogisticRegression(random_state=42).fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Overall accuracy, then per-class precision / recall / F1.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.7451577199778638
              precision    recall  f1-score   support

           f       0.00      0.00      0.00      1842
           m       0.75      1.00      0.85      5386

    accuracy                           0.75      7228
   macro avg       0.37      0.50      0.43      7228
weighted avg       0.56      0.75      0.64      7228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Explanation of the report for the binary classification problem. This report contains various metrics that assess the performance of the model on both classes ("f" and "m").

Interpretation
1. Precision is the ratio of correctly predicted positive observations
to the total predicted positives. In the "m" class, the precision is 0.75, which means that when the model predicts "m," it is correct 75% of the time. In the "f" class, the precision is reported as 0.00 because the model never predicts "f" at all (the `_warn_prf` warnings above are raised for exactly this reason), so the metric is undefined and shown as zero.

2. Recall (Sensitivity): Recall is the ratio of correctly predicted positive observations to all observations in the actual class. In the "m" class, the recall is 1.00, indicating that the model correctly captures all instances of the "m" class. However, in the "f" class, the recall is 0.00, meaning that the model doesn't capture any instances of the "f" class.

3. F1-Score: The F1-score is the harmonic mean of precision and recall. It takes into account both false positives and false negatives. In the "m" class, the F1-score is 0.85, which indicates a reasonable balance between precision and recall. In the "f" class, the F1-score is 0.00, reflecting the poor performance of the model in this class.

4. Support: Support is the number of actual occurrences of the class in the specified dataset. In the "f" class, there are 1842 instances, and in the "m" class, there are 5386 instances.

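5. Accuracy: Accuracy is the ratio of correctly predicted observations to the total observations. The overall accuracy is 0.75, meaning that the model correctly predicts the class for about 75% of the instances. This is exactly the share of "m" instances in the test set (5386 of 7228), i.e. what a model that always predicts "m" achieves.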

6. Macro Avg: Macro average calculates the average performance metrics across classes without considering class imbalance. In this case, the macro average precision is 0.37, recall is 0.50, and F1-score is 0.43.

7. Weighted Avg: Weighted average also calculates the average performance metrics across classes, but it weights each class by its support, so it reflects the class imbalance. In this case, the weighted average precision is 0.56, recall is 0.75, and F1-score is 0.64.

8. It's clear from the evaluation metrics that the model performs well for the "m" class but poorly for the "f" class. This could indicate a class imbalance issue or some other challenges with the data or the model itself. Further analysis and potentially model adjustments are needed to improve performance for both classes.



##2. Logistic Regression ( 3 input features )

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Three count features as predictors; gender as the target.
X = data.loc[:, ['cnt_listeningevents', 'cnt_distinct_artists', 'cnt_distinct_tracks']]
y = data.loc[:, 'gender']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Default logistic regression fitted on the scaled training rows.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Coefficients of the fitted model, listed from largest to smallest signed value.
feature_importances = model.coef_[0]
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(importance_df)

# Score the held-out rows.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)


Feature Importances:
                Feature  Importance
2   cnt_distinct_tracks    0.653384
0   cnt_listeningevents   -0.181698
1  cnt_distinct_artists   -0.225781
Accuracy: 0.7480110688343133
Classification Report:
               precision    recall  f1-score   support

           f       0.44      0.00      0.01      1456
           m       0.75      1.00      0.86      4326

    accuracy                           0.75      5782
   macro avg       0.60      0.50      0.43      5782
weighted avg       0.67      0.75      0.64      5782



##3. Logistic regression (feature engineering - 4 features in all )

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Engineered feature: listening events per distinct artist (added to `data` in place).
data['listening_per_artist'] = data['cnt_listeningevents'].div(data['cnt_distinct_artists'])

# Three count features plus the engineered ratio; gender as the target.
X = data.loc[:, [
    'cnt_listeningevents',
    'cnt_distinct_artists',
    'cnt_distinct_tracks',
    'listening_per_artist',
]]
y = data.loc[:, 'gender']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Default logistic regression; fit, then predict the held-out rows.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Accuracy: 0.7473192666897267
Classification Report:
               precision    recall  f1-score   support

           f       0.27      0.00      0.00      1456
           m       0.75      1.00      0.86      4326

    accuracy                           0.75      5782
   macro avg       0.51      0.50      0.43      5782
weighted avg       0.63      0.75      0.64      5782



##4. Logistic regression ( 4 features )

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Three count features plus weekly listening events; gender as the target.
X = data.loc[:, [
    'cnt_listeningevents',
    'cnt_distinct_artists',
    'cnt_distinct_tracks',
    'cnt_listeningevents_per_week',
]]
y = data.loc[:, 'gender']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Default logistic regression; fit, then predict the held-out rows.
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Accuracy: 0.7474922172258734
Classification Report:
               precision    recall  f1-score   support

           f       0.30      0.00      0.00      1456
           m       0.75      1.00      0.86      4326

    accuracy                           0.75      5782
   macro avg       0.52      0.50      0.43      5782
weighted avg       0.64      0.75      0.64      5782



##5. Random Forest Classifier

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# accuracy_score is used below; importing it here keeps the cell from
# depending on an import made in an earlier cell.
from sklearn.metrics import accuracy_score, classification_report


# Select features (listening events and count of artists) and target variable (gender)
X = data[['cnt_listeningevents', 'cnt_distinct_artists']]
y = data['gender']


# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Random Forest classifier (100 trees, seeded)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate model performance: overall accuracy, then the per-class report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.6876513317191283
              precision    recall  f1-score   support

           f       0.24      0.11      0.16      1456
           m       0.75      0.88      0.81      4326

    accuracy                           0.69      5782
   macro avg       0.50      0.50      0.48      5782
weighted avg       0.62      0.69      0.64      5782




1. Precision: For the "f" class, the precision is 0.24, indicating that out of the instances predicted as "f," only 24% are actually "f." For the "m" class, the precision is 0.75, meaning that 75% of the instances predicted as "m" are actually "m."
2. Recall (Sensitivity): The "f" class has a recall of 0.11, implying that only 11% of the actual "f" instances were correctly identified by the model. The "m" class has a recall of 0.88, indicating that the model correctly captures 88% of the actual "m" instances.
3. F1-Score: The F1-score for the "f" class is 0.16, which is the harmonic mean of precision and recall. It balances both false positives and false negatives. The F1-score for the "m" class is 0.81, which indicates a better balance between precision and recall.
4. Support: The "f" class has 1456 instances, and the "m" class has 4326 instances.
5. Accuracy: The overall accuracy of the model is 0.69, meaning that it correctly predicts the class for approximately 69% of the instances.
6. Macro Avg: The macro average calculates the average performance metrics across classes without considering class imbalance. In this case, the macro average precision is 0.50, recall is 0.50, and F1-score is 0.48.
7. Weighted Avg: The weighted average calculates the average performance metrics across classes, taking into account class imbalance. In this case, the weighted average precision is 0.62, recall is 0.69, and F1-score is 0.64.

The evaluation metrics suggest that the model performs reasonably well for the "m" class, with a higher precision and recall, but not as well for the "f" class. The lower precision and recall for the "f" class might indicate that the model struggles to correctly identify instances of this class. One of the  factors for this could be class imbalance.

When the classes are imbalanced, the weighted average metrics can be higher than the macro average metrics because the weighted average gives more emphasis to the larger class. If the larger class is performing well, it can pull up the weighted average metrics, even if some smaller classes are not performing as well.

In my research, the weighted average metrics are higher than the macro average metrics because the "m" class (the larger class) has better precision, recall, and F1-score compared to the "f" class. The better performance of the "m" class contributes more to the weighted average, leading to the observed difference. This is a common situation in cases where there's a significant class imbalance, and it's important to interpret the metrics in the context of the data distribution and the problem at hand.

- Class Imbalance: In many real-world scenarios, datasets may have an unequal distribution of classes. One class could have significantly more instances than the other. This class imbalance can impact the average metrics.
- Macro Average: The macro average calculates the average of performance metrics (precision, recall, F1-score, etc.) for each class separately and then averages those values. It treats each class equally, regardless of its actual frequency in the dataset. As a result, if one class is very small in terms of instances, it has the same impact on the macro average as a larger class.
- Weighted Average: The weighted average, on the other hand, calculates the average of performance metrics by considering the frequency (or "weight") of each class. It gives more weight to classes with a larger number of instances. This means that metrics for classes with more instances will have a larger influence on the weighted average.


##6. Random forest classifier and feature engineering

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# accuracy_score is used below; importing it here keeps the cell from
# depending on an import made in an earlier cell.
from sklearn.metrics import accuracy_score, classification_report

# Engineered feature: listening events per distinct artist (added to `data` in place)
data['listening_per_artist'] = data['cnt_listeningevents'] / data['cnt_distinct_artists']

# Select features (listening events, count of artists, and their ratio) and target variable (gender)
X = data[['cnt_listeningevents', 'cnt_distinct_artists','listening_per_artist']]
y = data['gender']

# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Random Forest classifier (100 trees, seeded)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate model performance: overall accuracy, then the per-class report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6912832929782082
              precision    recall  f1-score   support

           f       0.25      0.11      0.16      1456
           m       0.75      0.89      0.81      4326

    accuracy                           0.69      5782
   macro avg       0.50      0.50      0.48      5782
weighted avg       0.62      0.69      0.65      5782



##7. Random forest classifier (4 features)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# accuracy_score is used below; importing it here keeps the cell from
# depending on an import made in an earlier cell.
from sklearn.metrics import accuracy_score, classification_report

# Select the four features (three counts plus weekly listening events) and target variable (gender)
X = data[['cnt_listeningevents', 'cnt_distinct_artists','cnt_distinct_tracks','cnt_listeningevents_per_week' ]]
y = data['gender']

# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Random Forest classifier (100 trees, seeded)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate model performance: overall accuracy, then the per-class report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.7286406087858872
              precision    recall  f1-score   support

           f       0.33      0.08      0.13      1456
           m       0.75      0.95      0.84      4326

    accuracy                           0.73      5782
   macro avg       0.54      0.51      0.48      5782
weighted avg       0.65      0.73      0.66      5782



##8. Random forest classifier (all 5 features)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
# accuracy_score is used below; importing it here keeps the cell from
# depending on an import made in an earlier cell.
from sklearn.metrics import accuracy_score, classification_report

# Select the five features (three counts, weekly listening events, mainstreaminess) and target variable (gender)
X = data[['cnt_listeningevents', 'cnt_distinct_artists','cnt_distinct_tracks','cnt_listeningevents_per_week','mainstreaminess_avg_year']]
y = data['gender']

# Split the data into training (80%) and testing (20%) sets with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Random Forest classifier (100 trees, seeded)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate model performance: overall accuracy, then the per-class report
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.736942234520927
              precision    recall  f1-score   support

           f       0.40      0.08      0.14      1456
           m       0.76      0.96      0.84      4326

    accuracy                           0.74      5782
   macro avg       0.58      0.52      0.49      5782
weighted avg       0.67      0.74      0.67      5782

