In [3]:
# Imports
import pandas as pd
import numpy as np
from IPython.display import display

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Read only the flair column of the cleaned posts file
df = pd.read_csv(
    "./preproc_data/clean_text_all_subreddits_d2v.csv",
    usecols=["link_flair_text"],
    low_memory=False,
)

# Report how many rows were loaded
print(df.shape[0])

# Keep the original row position as an explicit "index" column,
# so it can be merged with clustered results
df = df.reset_index(drop=False)

# Preview the first rows
df.head()

652146


Unnamed: 0,index,link_flair_text
0,0,
1,1,
2,2,
3,3,
4,4,


In [9]:
# Score the clusterings built on BoW features against the ground-truth `label`
bow = pd.read_csv("./preproc_data/clustered_BoW.csv")

print(len(bow.index))

# Display
display(bow.head())

# Metrics reported for every clustering, in print order
metric_fns = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
]
cluster_cols = ["kmeans_label", "birch_label", "hdbscan_label"]

# The clusters could be swapped (i.e. kmeans_label 0 could be 1 and vice versa),
# so each clustering is also scored with its 0/1 assignment inverted.
for col in cluster_cols:
    bow["inv_" + col] = (~bow[col].astype(bool)).astype(int)

# Pass 1 scores every row; pass 2 scores only rows that have a link_flair_text.
for subset_desc, flaired_only in [("All data", False), ("NaNs dropped from lft", True)]:
    if flaired_only:
        # Restrict the drop to link_flair_text: a bare dropna() would also discard
        # rows whose only missing value sits in an unrelated column.
        # NOTE(review): assumes the CSV already carries link_flair_text
        # (the merge with df is disabled) - confirm.
        bow = bow.dropna(subset=["link_flair_text"])

    for prefix, suffix in [("", ""), ("inv_", ", inversed")]:
        header = "Classification metrics for each clustering methods ({}{})".format(
            subset_desc, suffix)
        # Blank line between sections, but not before the very first one
        if flaired_only or prefix:
            header = "\n" + header
        print(header)

        for metric_name, metric_fn in metric_fns:
            for col in cluster_cols:
                # Name the column that is actually scored, so inverted results
                # are never reported under the non-inverted column name.
                pred_col = prefix + col
                print("{} score between label and {} is:".format(metric_name, pred_col),
                      metric_fn(bow["label"], bow[pred_col]))

651430


Unnamed: 0,index,subreddit,clean_text,label,kmeans_label,birch_label,hdbscan_label,link_flair_text,inv_kmeans_label,inv_birch_label,inv_hdbscan_label
0,0,ADHD,recently diagnosed need talk others diagnosed ...,0,1,0,1,,0,1,0
1,1,ADHD,really annoyed family drunk friend family host...,0,0,0,1,,1,1,0
2,2,ADHD,medication journey current disappointment wan...,0,1,0,1,,0,1,0
3,3,ADHD,wearable rem sleep detected gaming sleeping hy...,0,0,0,1,,1,1,0
4,4,ADHD,picking friend carpool nye party texted way dr...,0,0,0,1,,1,1,0


Classification metrics for each clustering methods (All data)
Accuracy score between label and kmeans_label is: 0.6282762537801452
Accuracy score between label and birch_label is: 0.8632086333144007
Accuracy score between label and hdbscan_label is: 0.14438235881061665
Precision score between label and kmeans_label is: 0.19119153832815775
Precision score between label and birch_label is: 0.22382238223822382
Precision score between label and hdbscan_label is: 0.1304998676853071
Recall score between label and kmeans_label is: 0.5679168422531254
Recall score between label and birch_label is: 0.017464999765884722
Recall score between label and hdbscan_label is: 0.9755700707028141
F1 score between label and kmeans_label is: 0.28607481484975705
F1 score between label and birch_label is: 0.032401676547875
F1 score between label and hdbscan_label is: 0.23020563298193372

Classification metrics for each clustering methods (All data, inversed)
Accuracy score between label and inv_kmeans_label is

In [10]:
# Score the clusterings built on d2v features against the ground-truth `label`
d2v = pd.read_csv("./preproc_data/clustered_d2v.csv")

print(len(d2v.index))

# Display
display(d2v.head())

# Metrics reported for every clustering, in print order
metric_fns = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
]
cluster_cols = ["kmeans_label", "birch_label", "hdbscan_label"]

# The clusters could be swapped (i.e. kmeans_label 0 could be 1 and vice versa),
# so each clustering is also scored with its 0/1 assignment inverted.
for col in cluster_cols:
    d2v["inv_" + col] = (~d2v[col].astype(bool)).astype(int)

# Pass 1 scores every row; pass 2 scores only rows that have a link_flair_text.
for subset_desc, flaired_only in [("All data", False), ("NaNs dropped from lft", True)]:
    if flaired_only:
        # Restrict the drop to link_flair_text: a bare dropna() would also discard
        # rows whose only missing value sits in an unrelated column.
        # NOTE(review): assumes the CSV already carries link_flair_text
        # (the merge with df is disabled) - confirm.
        d2v = d2v.dropna(subset=["link_flair_text"])

    for prefix, suffix in [("", ""), ("inv_", ", inversed")]:
        header = "Classification metrics for each clustering methods ({}{})".format(
            subset_desc, suffix)
        # Blank line between sections, but not before the very first one
        if flaired_only or prefix:
            header = "\n" + header
        print(header)

        for metric_name, metric_fn in metric_fns:
            for col in cluster_cols:
                # Name the column that is actually scored, so inverted results
                # are never reported under the non-inverted column name.
                pred_col = prefix + col
                print("{} score between label and {} is:".format(metric_name, pred_col),
                      metric_fn(d2v["label"], d2v[pred_col]))

651430


Unnamed: 0,index,subreddit,clean_text,label,kmeans_label,birch_label,hdbscan_label,link_flair_text,inv_kmeans_label,inv_birch_label,inv_hdbscan_label
0,0,ADHD,recently diagnosed need talk others diagnosed ...,0,0,1,1,,1,0,0
1,1,ADHD,really annoyed family drunk friend family host...,0,1,1,1,,0,0,0
2,2,ADHD,medication journey current disappointment wan...,0,0,1,1,,1,0,0
3,3,ADHD,wearable rem sleep detected gaming sleeping hy...,0,0,1,1,,1,0,0
4,4,ADHD,picking friend carpool nye party texted way dr...,0,1,1,1,,0,0,0


Classification metrics for each clustering methods (All data)
Accuracy score between label and kmeans_label is: 0.42580476797200006
Accuracy score between label and birch_label is: 0.13373347865465207
Accuracy score between label and hdbscan_label is: 0.1450808221911794
Precision score between label and kmeans_label is: 0.0999101731661741
Precision score between label and birch_label is: 0.13104190037105065
Precision score between label and hdbscan_label is: 0.13089482320227463
Recall score between label and kmeans_label is: 0.4218406143184904
Recall score between label and birch_label is: 0.9954815751275928
Recall score between label and hdbscan_label is: 0.978625275085452
F1 score between label and kmeans_label is: 0.16155670421992188
F1 score between label and birch_label is: 0.23159712198868188
F1 score between label and hdbscan_label is: 0.230905204081069

Classification metrics for each clustering methods (All data, inversed)
Accuracy score between label and inv_kmeans_label is: 

In [11]:
# Score the clusterings built on combined bow_d2v features against the ground-truth `label`
bow_d2v = pd.read_csv("./preproc_data/clustered_bow_d2v.csv")

print(len(bow_d2v.index))

# Display
display(bow_d2v.head())

# Metrics reported for every clustering, in print order
metric_fns = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1", f1_score),
]
cluster_cols = ["kmeans_label", "birch_label", "hdbscan_label"]

# The clusters could be swapped (i.e. kmeans_label 0 could be 1 and vice versa),
# so each clustering is also scored with its 0/1 assignment inverted.
for col in cluster_cols:
    bow_d2v["inv_" + col] = (~bow_d2v[col].astype(bool)).astype(int)

# Pass 1 scores every row; pass 2 scores only rows that have a link_flair_text.
for subset_desc, flaired_only in [("All data", False), ("NaNs dropped from lft", True)]:
    if flaired_only:
        # Restrict the drop to link_flair_text: a bare dropna() would also discard
        # rows whose only missing value sits in an unrelated column.
        # NOTE(review): assumes the CSV already carries link_flair_text
        # (the merge with df is disabled) - confirm.
        bow_d2v = bow_d2v.dropna(subset=["link_flair_text"])

    for prefix, suffix in [("", ""), ("inv_", ", inversed")]:
        header = "Classification metrics for each clustering methods ({}{})".format(
            subset_desc, suffix)
        # Blank line between sections, but not before the very first one
        if flaired_only or prefix:
            header = "\n" + header
        print(header)

        for metric_name, metric_fn in metric_fns:
            for col in cluster_cols:
                # Name the column that is actually scored, so inverted results
                # are never reported under the non-inverted column name.
                pred_col = prefix + col
                print("{} score between label and {} is:".format(metric_name, pred_col),
                      metric_fn(bow_d2v["label"], bow_d2v[pred_col]))

651430


Unnamed: 0,index,subreddit,clean_text,label,kmeans_label,birch_label,hdbscan_label,link_flair_text,inv_kmeans_label,inv_birch_label,inv_hdbscan_label
0,0,ADHD,recently diagnosed need talk others diagnosed ...,0,0,0,1,,1,1,0
1,1,ADHD,really annoyed family drunk friend family host...,0,1,0,1,,0,1,0
2,2,ADHD,medication journey current disappointment wan...,0,0,0,1,,1,1,0
3,3,ADHD,wearable rem sleep detected gaming sleeping hy...,0,1,0,1,,0,1,0
4,4,ADHD,picking friend carpool nye party texted way dr...,0,1,0,1,,0,1,0


Classification metrics for each clustering methods (All data)
Accuracy score between label and kmeans_label is: 0.378874169135594
Accuracy score between label and birch_label is: 0.7958890441029735
Accuracy score between label and hdbscan_label is: 0.14202753941329077
Precision score between label and kmeans_label is: 0.09155448635921584
Precision score between label and birch_label is: 0.21228664810555622
Precision score between label and hdbscan_label is: 0.1305495863705951
Recall score between label and kmeans_label is: 0.4187619984080161
Recall score between label and birch_label is: 0.2052839818326544
Recall score between label and hdbscan_label is: 0.9792456805731142
F1 score between label and kmeans_label is: 0.15025789217251054
F1 score between label and birch_label is: 0.2087265975553149
F1 score between label and hdbscan_label is: 0.2303850491380699

Classification metrics for each clustering methods (All data, inversed)
Accuracy score between label and inv_kmeans_label is: 0