# Imports

In [None]:
pip install transformers



In [None]:
pip install textstat


Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.14.0-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.14.0 textstat-0.7.3


In [None]:
pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.5.1


In [None]:
import pandas as pd

# Load the data

In [None]:
# Show up to 400 characters per cell so long comment texts stay readable when displayed.
# The full option key is used instead of the abbreviated 'max_colwidth' pattern.
pd.set_option('display.max_colwidth', 400)

# Load the annotated discussion trees; the first CSV column becomes the DataFrame index.
df = pd.read_csv('annotated_trees_101.csv', index_col=0)

In [None]:
# List the column names of the loaded frame.
df.columns

Index(['node_id', 'tree_id', 'timestamp', 'author', 'text', 'parent',
       'Aggressive', 'AgreeBut', 'AgreeToDisagree', 'Alternative', 'Answer',
       'AttackValidity', 'BAD', 'Clarification', 'Complaint', 'Convergence',
       'CounterArgument', 'CriticalQuestion', 'DirectNo', 'DoubleVoicing',
       'Extension', 'Irrelevance', 'Moderation', 'NegTransformation',
       'Nitpicking', 'NoReasonDisagreement', 'Personal', 'Positive',
       'Repetition', 'RephraseAttack', 'RequestClarification', 'Ridicule',
       'Sarcasm', 'Softening', 'Sources', 'ViableTransformation',
       'WQualifiers'],
      dtype='object')

In [None]:
# Count how many rows carry each value of the 'Irrelevance' label.
df['Irrelevance'].value_counts()

0    9984
1     575
Name: Irrelevance, dtype: int64

In [None]:
# Rank the trees by how many of their comments are labelled 'Irrelevance',
# most-affected tree first; `trees` holds the tree ids in that order.
result = (
    df.groupby('tree_id')['Irrelevance']
    .sum()
    .reset_index()
    .sort_values(by='Irrelevance', ascending=False)
)
trees = list(result['tree_id'])

# Pre-processing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK resources that preprocess_text relies on:
# the stop-word lists (stopwords.words) ...
nltk.download('stopwords')
# ... and WordNet (presumably required by WordNetLemmatizer — confirm).
nltk.download('wordnet')

# Stop-word sets keyed by language, filled lazily so the corpus is read only once.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text, remove_stopwords=True, use_stemming=False, use_lemmatization=True):
    """Normalise a raw comment into a cleaned, space-separated string.

    Steps, in order: drop every character that is not an ASCII letter or
    whitespace, lowercase, split on whitespace, then optionally remove
    English stop words, stem (Porter) and lemmatize (WordNet).

    Args:
        text: The raw text to clean (a string).
        remove_stopwords: Drop NLTK English stop words when True.
        use_stemming: Apply ``PorterStemmer`` to each word when True.
        use_lemmatization: Apply ``WordNetLemmatizer`` to each word when True.

    Returns:
        The processed words joined by single spaces (possibly an empty string).
    """
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenization (split the text into words)
    words = text.split()

    # Remove stopwords
    # NOTE(review): apostrophes are stripped above, before this lookup, so
    # contracted forms lose their apostrophe first — confirm this is intended.
    if remove_stopwords:
        # Cached set: the stop-word corpus is not re-read for every comment.
        stop_words = _english_stop_words()
        words = [word for word in words if word not in stop_words]

    # Apply stemming
    if use_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(word) for word in words]

    # Apply lemmatization
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    # Join the processed words back into a single string
    return ' '.join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Main Method

## Functions

In [None]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained "stsb-roberta-large" sentence-embedding model
# (the checkpoint name indicates RoBERTa rather than BERT).
# `model` is used as a module-level global by the encoding code below.
model = SentenceTransformer("stsb-roberta-large")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
import numpy as np
from math import sqrt, pow, exp
import torch

# Define a function to calculate semantic similarity
def calculate_semantic_similarity(text1, text2):
    """Encode two texts with the global ``model`` and compare the embeddings.

    Returns:
        A tuple ``(cosine_similarity, euclidean_distance)``: the ``[0][0]``
        entry of ``util.cos_sim`` as a Python number, and the value returned
        by ``calculate_semantic_eucl`` for the same pair of embeddings.
    """
    first_vec, second_vec = (
        model.encode(piece, convert_to_tensor=True) for piece in (text1, text2)
    )

    similarity_matrix = util.cos_sim(first_vec, second_vec)
    return (
        similarity_matrix[0][0].item(),
        calculate_semantic_eucl(first_vec, second_vec),
    )

def calculate_semantic_cosine(embeddings1, embeddings2):
    """Return the cosine similarity of two embeddings as a Python number.

    Takes the ``[0][0]`` entry of the matrix produced by ``util.cos_sim``
    (assumes each argument is a single embedding — TODO confirm).
    """
    pairwise = util.cos_sim(embeddings1, embeddings2)
    return pairwise[0][0].item()

def calculate_semantic_eucl(embeddings1, embeddings2):
    """Return the Euclidean (L2) distance between two tensors as a Python float.

    Computed as the square root of the sum of squared element-wise differences.
    """
    delta = embeddings1 - embeddings2
    squared_norm = torch.sum(delta ** 2)
    return torch.sqrt(squared_norm).item()


# Calculate mean and standard deviation
def get_stat(lst):
  """Return ``(mean, standard deviation)`` of a sequence of numbers.

  An empty sequence yields ``(nan, nan)`` directly, so numpy does not emit
  its "mean of empty slice" / invalid-division RuntimeWarnings; the returned
  values are the same NaNs numpy would have produced.

  Args:
    lst: Sequence of numeric values (may be empty).

  Returns:
    Tuple ``(mean_value, std_deviation)``; the std is the population
    standard deviation (numpy's default ``ddof=0``).
  """
  values = np.asarray(lst)
  if values.size == 0:
    return float('nan'), float('nan')
  mean_value = np.mean(values)
  std_deviation = np.std(values)
  return mean_value, std_deviation

In [None]:
import textstat

def calculate_readability_score(text):
    """Return the Flesch-Kincaid grade of ``text`` as computed by textstat.

    Best-effort: if textstat raises, the error is printed and 0 is returned
    instead of propagating the exception.
    """
    try:
        # Flesch-Kincaid readability grade
        return textstat.flesch_kincaid_grade(text)
    except Exception as err:
        print(f"Error calculating readability score: {err}")
    return 0



In [None]:
import matplotlib.pyplot as plt

def plot_curr(texts_irrel, red, count, all_score, name, all_txt=None):
  """Plot per-comment scores as a line and highlight the irrelevant comments in red.

  Args:
    texts_irrel: Texts of the comments labelled irrelevant.
    red: Positions (indices into ``count`` / ``all_score``) to highlight.
      Used as given unless ``all_txt`` is supplied; ``None`` means no highlights.
    count: X values, one per plotted comment.
    all_score: Y values (scores), aligned with ``count``.
    name: Label appended to the plot title.
    all_txt: Optional list of comment texts aligned with ``count``. When given,
      the highlighted positions are derived from it: every position whose text
      is in ``texts_irrel`` (duplicates included; texts absent from ``all_txt``
      are ignored instead of raising).

  The previous version discarded ``red`` and rebuilt it from a global
  ``all_txt`` via ``list.index``, which depended on hidden notebook state,
  raised on texts missing from that list and only marked the first of
  duplicated texts.
  """
  if all_txt is not None:
    irrelevant = set(texts_irrel)
    red = [pos for pos, comment in enumerate(all_txt) if comment in irrelevant]
  elif red is None:
    red = []

  plt.figure(figsize=(15, 6))  # Adjust the width and height as needed
  # Plot the line plot
  plt.plot(count, all_score, marker='o', linestyle='-')
  # Overlay the highlighted comments on top of the line (zorder above the default).
  plt.scatter([count[i] for i in red], [all_score[i] for i in red], color='red', zorder=5)

  plt.xlabel('Comment depth')
  plt.ylabel('Cosine similarity score')
  plt.title(f'Change in Cosine similarity score between the root and the comment based on comment depth - {name}')
  plt.grid(True)
  plt.show()


## Root only

In [None]:
# Root-only scoring: every comment of a tree is compared with the tree's root
# comment (cosine similarity and Euclidean distance of their embeddings).
# One summary tuple per tree is appended to res_lst:
#   (tree, total, len_relv, len_irev, c_total, c1, c2, e_total, e1, e2)
# where c*/e* are mean cosine / Euclidean scores over all, relevant (1) and
# irrelevant (2) comments. Groups with no comments get NaN.
res_df = pd.DataFrame()
res_lst = []
for tree in trees:
  try:
    # Restrict to this tree and drop comments shorter than 7 whitespace-separated words.
    df_chose = df[df['tree_id'] == tree]
    df_chose = df_chose[df_chose['text'].apply(lambda x: len(x.split()) >= 7)]

    # process root: the comment whose parent is -1
    root = df_chose[df_chose['parent'] == -1].text
    root = root.reset_index()
    if root.empty:
      # Report a readable reason instead of an opaque "list index out of range".
      raise ValueError('no root comment (parent == -1) survives the length filter')
    root_txt = list(root['text'])[0]
    root_txt_clean = preprocess_text(root_txt)
    root_txt_clean = model.encode(root_txt_clean, convert_to_tensor=True)

    # create groups
    # NOTE(review): skipping the first row assumes the root is listed first in its tree — confirm.
    all_txt = list(df_chose['text'])[1:]

    irrelevent = df_chose[df_chose['Irrelevance'] == 1].text
    irrelevent = irrelevent.reset_index()
    texts_irrel = list(irrelevent['text'])

    rrelevent = df_chose[df_chose['Irrelevance'] == 0].text
    rrelevent = rrelevent.reset_index()
    texts_relev = list(rrelevent['text'])

    # Set lookup gives the same membership answers as the list without
    # rescanning it for every comment.
    # NOTE(review): comments are labelled by their text, so identical texts
    # share one label — confirm this is acceptable.
    irrel_lookup = set(texts_irrel)

    # run the main loop
    irre_score_cos = []
    rel_score_cos = []
    all_score_cos = []

    irre_score_euc = []
    rel_score_euc = []
    all_score_euc = []
    for txt in all_txt:
      txt_prcoess = preprocess_text(txt)
      txt_prcoess = model.encode(txt_prcoess, convert_to_tensor=True)

      score_cos = calculate_semantic_cosine(txt_prcoess, root_txt_clean)
      score_euc = calculate_semantic_eucl(txt_prcoess, root_txt_clean)

      print(score_cos, score_euc)
      if txt in irrel_lookup:
        irre_score_cos.append(score_cos)
        irre_score_euc.append(score_euc)
      else:
        rel_score_cos.append(score_cos)
        rel_score_euc.append(score_euc)

      all_score_cos.append(score_cos)
      all_score_euc.append(score_euc)

    # get stats; an empty group yields NaN directly so numpy does not emit
    # "mean of empty slice" warnings (the stored value is NaN either way)
    nan = float('nan')
    c1 = get_stat(rel_score_cos)[0] if rel_score_cos else nan
    c2 = get_stat(irre_score_cos)[0] if irre_score_cos else nan
    c_total = get_stat(all_score_cos)[0] if all_score_cos else nan

    e1 = get_stat(rel_score_euc)[0] if rel_score_euc else nan
    e2 = get_stat(irre_score_euc)[0] if irre_score_euc else nan
    e_total = get_stat(all_score_euc)[0] if all_score_euc else nan

    # NOTE(review): the two label counts are taken over all filtered rows of the
    # tree, whereas `total` excludes the first row — confirm this is intended.
    len_irev = len(texts_irrel)
    len_relv = len(texts_relev)
    total = len(all_txt)
    res_lst.append((tree, total, len_relv, len_irev, c_total, c1, c2, e_total, e1, e2))
    # Plotting (plot_curr) is currently disabled for this loop.
  except Exception as e:
    print(f'Error in {tree} - {e}')

## Weighted sliding window

In [None]:
# Weighted sliding window: each comment is scored against the root and against
# up to three comments that precede it in the (length-filtered) tree listing.
# The weights are (root, k-1, k-2, k-3), chosen by how many predecessors exist:
#   0 predecessors : root = 1.0
#   1 predecessor  : root = 0.6, k-1 = 0.4
#   2 predecessors : root = 0.5, k-1 = 0.35, k-2 = 0.15
#   3 or more      : root = 0.4, k-1 = 0.3,  k-2 = 0.2,  k-3 = 0.1
# NOTE(review): an earlier comment here listed 1.0 / 0.7-0.3 / 0.6-0.25-0.15 /
# 0.5-0.25-0.15-0.1; the values below are the ones the code actually used —
# confirm which set is intended.
WINDOW_WEIGHTS = {
  0: (1.0,),
  1: (0.6, 0.4),
  2: (0.5, 0.35, 0.15),
  3: (0.4, 0.3, 0.2, 0.1),
}


def _windowed_score(i, embed_text, root_embed, metric):
  """Blend metric(comment i, root) with metric(comment i, each of up to 3 predecessors).

  ``metric`` is called as ``metric(embed_text[i], other)``. The nearest
  predecessor always receives the largest non-root weight. (The two-predecessor
  case previously gave 0.35 to k-2 and 0.15 to k-1, which contradicted both the
  general case and the documented intent.)
  """
  n_prev = min(i, 3)
  weights = WINDOW_WEIGHTS[n_prev]
  score = weights[0] * metric(embed_text[i], root_embed)
  for back in range(1, n_prev + 1):
    score += weights[back] * metric(embed_text[i], embed_text[i - back])
  return score


# One summary tuple per tree is appended to res_lst:
#   (tree, total, len_relv, len_irev, c_total, c1, c2, e_total, e1, e2)
res_lst = []
for tree in trees:
  try:
    # Restrict to this tree and drop comments shorter than 7 whitespace-separated words.
    df_chose = df[df['tree_id'] == tree]
    df_chose = df_chose[df_chose['text'].apply(lambda x: len(x.split()) >= 7)]

    # process root: the comment whose parent is -1
    root = df_chose[df_chose['parent'] == -1].text
    root = root.reset_index()
    if root.empty:
      # Report a readable reason instead of an opaque "list index out of range".
      raise ValueError('no root comment (parent == -1) survives the length filter')
    root_txt = list(root['text'])[0]
    root_txt_clean = preprocess_text(root_txt)
    root_txt_clean = model.encode(root_txt_clean, convert_to_tensor=True)

    # create groups
    # NOTE(review): skipping the first row assumes the root is listed first in its tree — confirm.
    all_txt = list(df_chose['text'])[1:]

    irrelevent = df_chose[df_chose['Irrelevance'] == 1].text
    irrelevent = irrelevent.reset_index()
    texts_irrel = list(irrelevent['text'])

    rrelevent = df_chose[df_chose['Irrelevance'] == 0].text
    rrelevent = rrelevent.reset_index()
    texts_relev = list(rrelevent['text'])

    # Same membership answers as the list, without rescanning it per comment.
    irrel_lookup = set(texts_irrel)

    # Per-tree accumulators. The cosine lists are (re)created here: previously
    # differently named lists were initialised, so the appends below landed in
    # lists left over from the root-only cell and the cosine statistics
    # accumulated across trees (or failed with NameError on a fresh kernel).
    irre_score_cos = []
    rel_score_cos = []
    all_score_cos = []

    irre_score_euc = []
    rel_score_euc = []
    all_score_euc = []

    # Embed every comment once; the window looks back at earlier embeddings.
    embed_text = []
    for txt in all_txt:
      proccesed = preprocess_text(txt)
      embed_text.append(model.encode(proccesed, convert_to_tensor=True))

    for i in range(len(all_txt)):
      txt = all_txt[i]

      total_score = _windowed_score(i, embed_text, root_txt_clean, calculate_semantic_cosine)
      total_score_euc = _windowed_score(i, embed_text, root_txt_clean, calculate_semantic_eucl)

      print(total_score, total_score_euc)
      if txt in irrel_lookup:
        irre_score_cos.append(total_score)
        irre_score_euc.append(total_score_euc)
      else:
        rel_score_cos.append(total_score)
        rel_score_euc.append(total_score_euc)

      all_score_cos.append(total_score)
      all_score_euc.append(total_score_euc)

    # get stats; an empty group yields NaN directly so numpy does not emit
    # "mean of empty slice" warnings (the stored value is NaN either way)
    nan = float('nan')
    c1 = get_stat(rel_score_cos)[0] if rel_score_cos else nan
    c2 = get_stat(irre_score_cos)[0] if irre_score_cos else nan
    c_total = get_stat(all_score_cos)[0] if all_score_cos else nan

    e1 = get_stat(rel_score_euc)[0] if rel_score_euc else nan
    e2 = get_stat(irre_score_euc)[0] if irre_score_euc else nan
    e_total = get_stat(all_score_euc)[0] if all_score_euc else nan

    len_irev = len(texts_irrel)
    len_relv = len(texts_relev)
    total = len(all_txt)
    res_lst.append((tree, total, len_relv, len_irev, c_total, c1, c2, e_total, e1, e2))
    # Plotting (plot_curr) is currently disabled for this loop.
  except Exception as e:
    print(f'Error in {tree} - {e}')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0.20548028424382211 0.027079210322787338
0.5457183808088303 0.038899525117481744
0.5091762512922287 0.0365890460497247
0.5302043654024602 0.03588687063135999
0.6076722145080566 0.0389161575529548
0.7000949382781982 0.043692461698634784
0.4483948290348053 0.03231552844315873
0.4510771155357361 0.03273153883073156
0.5167157411575318 0.034485699382069065
0.38282332569360733 0.030907679561098845
0.38639889359474183 0.03063465154083738
0.5731697887182235 0.037434089229699534
0.5748323559761048 0.03720827138336342
0.6909017324447633 0.04364424979531686
0.7155416905879973 0.04493330941204538
0.7580840170383455 0.05043002772153793
0.6663090825080872 0.04203143480978337
0.7271394968032836 0.04640377899507988
0.7050508618354798 0.04518872146214725
0.21421583890914916 0.02751555790721636
0.10156853273510932 0.026860648505184104
0.36340423226356505 0.03318981252050662
0.3726807594299316 0.031599460454757373
0.2825798079371452 0.02944

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.4657599925994873 0.03166551368461055
0.4807567834854126 0.03379142988714885
0.35831913799047466 0.029859865309304255
0.49696145057678226 0.03536523687227691
0.5855370342731475 0.04100500828276659
0.2485775351524353 0.028390456528040637
0.47972719669342034 0.03445048694589799
0.3446982264518738 0.030195940439697016
0.43898653686046596 0.032212401975092106
0.42011782526969915 0.03149888385415241
0.5409785509109497 0.03533291139561579
0.5692740440368652 0.03662658249244939
0.3964957773685455 0.031190118193084027
0.4191676259040833 0.032153619711435634
0.5213768601417542 0.03598753258675485
0.31051041781902317 0.029392127732978786
0.32926797568798066 0.029479361460953604
0.5342509180307389 0.034592114700925276
0.4614933460950852 0.03382882266249763
0.5694016098976136 0.03674017805768458
0.5000985264778137 0.033664679369287964
0.47487497031688686 0.03313123727846753
0.10159974396228791 0.024960251511969823
0.3640717945992947 0.03063988415763125
0.21190710663795473 0.027158876896665546
0.4

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.655171275138855 0.039977210257336214
0.6137572646141052 0.03837252937564061
0.7233542829751968 0.04625324674435619
0.6505115032196045 0.04105786809190766
0.5146547615528106 0.035315542129672764
0.5686672091484071 0.03664005114320429
0.45574083924293524 0.033972176038838835
0.4947525888681411 0.03392625033607558
0.4678988724946976 0.033475084214288256
0.4995142012834549 0.03445845944036943
0.5150031387805939 0.03438530738636695
0.523994755744934 0.03551917869993944
0.1868006616830826 0.027142795388152157
0.4172804281115532 0.03252680320000971
0.4619087159633637 0.03342212318515051
0.25510555207729335 0.02819215038029265
0.47774279117584234 0.03309315057872527
0.4917965233325958 0.03373284164655487
0.5295128643512725 0.03488391059614223
0.5208797872066497 0.03416957322667156
0.6209010839462281 0.03864469675528254
0.653177547454834 0.04106570280068022
0.5320700109004974 0.03509528056373965
0.6549431622028351 0.040620772416521664
0.7172022104263306 0.04437430179571089
0.6661025166511535 

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.2286520004272461 0.02767928461956347
0.42874734997749325 0.03259415299054091
0.5773392766714096 0.03796665858359791
0.26420261710882187 0.028793284835376423
0.3548002153635025 0.030217173315527313
0.4448888510465622 0.03377773644501746
0.4548885643482208 0.032447939305446344
0.4130367159843445 0.03226045057690519
0.13323275791481137 0.02616774580568469
0.3532841563224793 0.03115801220960461
0.02543605715036392 0.026225991351898327
0.2495861887931824 0.02907823232821605
0.29833987951278684 0.02933561033787161
0.38315634652972225 0.031033937858943806
0.40923967063426975 0.03189436510338217
0.48167215883731845 0.0331721695806714
0.40216138064861295 0.031570445024983504
0.3077193647623062 0.02972827959244254
0.423989188671112 0.03236275019970515
0.44643256664276126 0.032907690384343005
0.42559810876846316 0.031844028219725755
0.5495953619480134 0.035679487601389816
0.29288736572489144 0.028805719464884305
0.45066585540771487 0.033263923664185936
0.4663985699415207 0.03288390186480298
0.6

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.4617573916912079 0.03232158836223571
0.42191227674484255 0.031656402484971716
0.5047236204147338 0.03404343468806469
0.4161928504705429 0.03204546888153188
0.1524528034031391 0.026354419888109777
0.30568772405385974 0.031062976005793826
0.2698506563901901 0.028918960327634677
0.37103383652865884 0.03124653685299401
0.4863266229629517 0.035571815963893964
0.3494310826063156 0.030410252240842466
0.52207610309124 0.03614381500368231
0.5173943936824799 0.0348758944076675
0.5598593592643738 0.035583254530447174
0.43778050541877744 0.03320282210547368
0.510975930094719 0.03505103758126484
0.35416075587272644 0.030525738631696962
0.2671452835202217 0.028497447977530364
0.5228840410709381 0.035599468888209966
0.45564903616905206 0.03287466265996781
0.6290971457958222 0.06395087635872476
0.4602456748485565 0.03291498546942387
0.34745994806289676 0.029665178345716355
0.5539318561553955 0.03572105035131274
0.4087043255567551 0.03128833269149597
0.2296603262424469 0.028062024241613102
0.24764111

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.673920214176178 0.04060109268458897
0.6749315381050109 0.04084960934196223
0.7157620429992675 0.047701881835341
0.4804049074649811 0.0330367793965769
0.6611994862556457 0.0404594696453505
0.6941095888614656 0.04400937785025527
0.4463022142648697 0.03235679138897913
0.5959232777357101 0.039630903934343216
0.5678549408912659 0.0360185637735858
0.4939916610717774 0.03300954857505238
0.5033153653144837 0.03471512240927637
0.6726434946060181 0.042321624807794035
0.7493799984455108 0.04980505668587417
0.6530289053916931 0.04000793853636977
0.530280488729477 0.03402632323721476
0.5888984024524689 0.0367013252789595
0.12980257645249366 0.02608797221881206
0.4419030278921128 0.0326491979356813
0.4480464518070221 0.03229241333838986
0.5489114165306093 0.03724755587937501
0.5610204100608825 0.03593267565836398
0.6339078068733216 0.03971616851830035
0.6421751737594604 0.03913141973161166
0.6104260444641114 0.038451160823176234
0.560665762424469 0.03553056771838449
0.5422314882278443 0.0347521811

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.5105645060539246 0.03473713300319824
0.4141372203826904 0.032034790523189124
0.3881559796631336 0.03209985793327254
0.5586625128984453 0.03960891156403204
0.4325472444295883 0.033136139882953684
0.66008278131485 0.04395549440434396
0.5241915464401246 0.03644485953856241
0.41341074407100675 0.033751539651863366
0.43914524316787723 0.03267449706018743
0.49980442821979526 0.03475591785516185
0.36446038484573373 0.030938155726909278
0.2992661401629448 0.031403277952592276
0.3566785961389542 0.031508433057089745
0.452120128273964 0.03423724699552059
0.48906047344207765 0.03545320175020661
0.6524429142475129 0.041863792740759985
0.7322951853275299 0.04604559924366276
0.45161745548248294 0.03205208970232616
0.710564911365509 0.04421814271801969
0.49513463079929354 0.0345262151679179
0.540223503112793 0.036680699347718615
0.437729236483574 0.03278359666585104
0.3829026132822037 0.03185577604619704
0.37761290371418 0.030635101258323076
0.24459791332483294 0.02836901294524119
0.358239561319351

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.5536818504333496 0.03472030081461796
0.7262313604354859 0.048197931744710015
0.6688096106052399 0.04189606055342
0.6345231294631958 0.0387256298704087
0.5612385511398316 0.03604243115120774
0.4969377398490906 0.03421655573181182
0.5941465020179749 0.039878209802678866
0.47604965865612037 0.03380892470511131
0.5358795821666718 0.03528716214859369
0.3856474936008454 0.03100587018046556
0.2644739285111427 0.027983702022418615
0.2145897436887026 0.02713713226023398
0.4133978754281998 0.03138903336425292
0.08347919061779976 0.02505129038492863
0.395902656018734 0.03310708719013494
0.4190823197364807 0.03253970885733791
0.25162203386425974 0.027616054039239616
0.37949621975421904 0.0309235264491833
0.4586815178394318 0.03444936284400229
0.3452434748411179 0.033069928712304644
0.3940403163433075 0.031150131213303615
0.532860541343689 0.03533443432533779
0.5407077237963677 0.03636063648819929
0.3231243833899498 0.02931471411502437
0.5577561110258102 0.03556414225669185
0.5209006249904633 0.0

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.2687031626701355 0.02920849571634208
0.5080809593200684 0.036933970099319854
0.35844120755791664 0.03251753282336897
0.44070609360933305 0.03393567995179651
0.19269376546144484 0.02775002484312221
0.3258192628622055 0.030341926897279766
0.45459869801998143 0.034149883467245276
0.21148422881960868 0.027609662900448924
0.36409847140312196 0.02999614292562784
0.45791304111480713 0.03324048983180901
0.2911488294601441 0.02914437744678853
0.4747251868247986 0.033800570426183986
0.18946056962013244 0.027989413493134194
0.37640204727649684 0.03141759317401829
0.37253738343715664 0.030976313827019798
0.2294750705361366 0.029021381168550502
0.20885416865348816 0.028707570887914174
0.28769804537296295 0.03016308401268493
0.2753729999065399 0.030487092348464463
0.2774374663829804 0.02998537729402094
0.14102134108543396 0.026459619247181453
0.24476147443056107 0.027995021033780706
0.39302386343479156 0.03143741569732659
0.33764045685529714 0.03023663315463921
0.2686460196971893 0.029403644574089

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.329240083694458 0.02997670077216214
0.1729591965675354 0.027570103890161256
0.365880922973156 0.03160512476960374
0.17687325850129126 0.027247759747394615
0.2183402195572853 0.028618540944180697
0.21460643932223317 0.028523355983069484
0.3173927590250969 0.030106367703847944
0.4185266792774201 0.03228464160658773
0.2827224612236023 0.029221023073671373
0.3269855290651321 0.029973961617527554
0.14922561571002008 0.02653210228644063
0.3105375468730927 0.02994365798224015
0.3770635724067688 0.031366480696639075
0.2203905612230301 0.029093153082281883
0.2449081838130951 0.028454497487582518
0.29325642213225367 0.030001276962000764
0.2848643779754639 0.029981151442479582
0.2793697476387024 0.02931562607758344
0.3198174089193344 0.029644883322841348
0.19677688106894495 0.027804896476809164
0.4138982892036438 0.03239413824039819
0.1377703219652176 0.02679668949209333
0.15462114624679088 0.027934270864904534
0.2634210206568241 0.029081239447925143
0.34100751876831054 0.030874023849066307
0.4

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.6124132871627808 0.037000161751500574
0.30368873476982117 0.02839928218456656
0.19290850609540938 0.02765013467442754
0.4045536071062088 0.03193094648603871
0.11468448890373112 0.02646711380251013
0.07627215646207332 0.02665610117867174
0.12725491626188157 0.026409146135153758
0.1796937085688114 0.02893815174704467
0.2518533729016781 0.03227005592687087
0.26398005783557893 0.029832304223057902
0.19167955219745636 0.028032055316191847
0.28594087585806843 0.030630005623192656
0.415764656662941 0.03182631070294166
0.2266222208738327 0.027605956763412237
0.37364277541637425 0.030785431061978598
0.41846065521240233 0.03186610439163818
0.4732356995344162 0.033603030186286575
0.5836575150489807 0.036757163096155954
0.6253757774829865 0.04098220840002817
0.42235523164272304 0.03205008324433594
0.40835837423801424 0.031190385047530952
0.5664848983287811 0.03912961332166528
0.6001570582389831 0.03887060317663048
0.5121811151504516 0.034205762792208876
0.4468620240688324 0.03147430334754064
0.4

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.5802150964736938 0.03626778203082922
0.5452893018722534 0.035187500644537476
0.45157644152641296 0.032813701587091924
0.19567507952451707 0.02713244288153765
0.48998370170593264 0.034792797843550105
0.3658358842134476 0.030275807482312154
0.47174051702022557 0.033427611708059975
0.40328883826732637 0.03102484851035188
0.4891698360443116 0.03482541028438885
0.3387742102146149 0.029697418667679366
0.5509750753641128 0.03666916716626596
0.3845282524824143 0.031174429386077163
0.6497963786125183 0.04170230552862433
0.5813987612724305 0.03647307484891791
0.37231450080871586 0.031005633534973383
0.39625433087348944 0.03184784269519172
0.2045736595988274 0.027583959592641566
0.4347971767187118 0.0326813338360974
0.44345568418502807 0.031983371816765666
0.5057637095451355 0.03484984492568516
0.37330300509929654 0.030541525940416384
0.4591457188129425 0.032497545978628334
0.28454620242118833 0.028550490516540166
0.4783524930477142 0.03292812142056048
0.3715515911579132 0.03096763479077028
0.4

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.426149845123291 0.03229126388916433
0.4948847949504852 0.03486495246098922
0.3686469167470932 0.030963451303386368
0.446573406457901 0.032443095821765536
0.5572410702705383 0.03721807959624794
0.5353829026222229 0.03534241648913872
0.5634450793266297 0.03714705217897881
0.4556006461381913 0.03208981796587292
0.5683824330568313 0.03611424145636041
0.46561710536479956 0.033336979262761884
0.6076007455587388 0.03844283528019964
0.5283361375331879 0.03579895456933482
0.5103667795658111 0.034560495621924904
0.4734044015407562 0.034004013082963344
0.42589482665061956 0.03233779170182603
0.5195773184299469 0.03518108195897364
0.5654067635536194 0.03807015697728234
0.41655423641204836 0.031203298103962007
0.4563929080963135 0.032865749132558846
0.5029514193534851 0.034172065391406006
0.5142337262630463 0.034545506731867
0.32569078207015995 0.029759581172458992
0.5396963268518448 0.03555219163654684
0.4868605434894562 0.03319963999952061
0.45023098587989807 0.03289355013675174
0.4291053116321

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.4719369113445282 0.03298581772411397
0.5278892397880555 0.03568499065508257
0.28556875959038736 0.028668623558871977
0.16514028757810595 0.027021542692478036
0.23749550879001619 0.02835927917863154
0.24328598901629447 0.02920245191319891
0.31875536441802976 0.030557697919297955
0.033559310436248775 0.025222566847253478
0.3250205472111702 0.030244184693957142
0.10900786500424148 0.026722332501284158
0.3345064722001553 0.030407632368675844
0.4930218935012818 0.03453797836012829
0.6338138610124588 0.0427947456406512
0.6735767304897308 0.041203762199625724
0.7407570600509644 0.04737444132732958
0.6613719046115876 0.04021847951751751
0.7036171257495881 0.047026256182493675
0.6637470841407777 0.04025652384687421
0.7191109538078309 0.04853789343482899
0.6356886386871339 0.03885245165397859
0.5493775486946105 0.03586032260742723
0.605135065317154 0.03824634329262113
0.19400414526462553 0.026908942802018596
0.5136918812990189 0.035943925832663856
0.27736571431159973 0.027881475631826434
0.308

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.5951119065284729 0.03672675384245732
0.6365716814994812 0.039243671370026795
0.42442815005779266 0.032020358434073255
0.3974050790071488 0.030638950805259604
0.4770583271980286 0.03402525953490192
0.5066050589084625 0.03443452910905615
0.4130292996764183 0.031663448029313666
0.24398578852415084 0.02803595218412559
0.4671122640371323 0.03286979103705688
0.47683575749397283 0.033817371814916945
0.6099696993827819 0.040358151548047005
0.46556724905967717 0.03249389576586209
0.5288721531629562 0.035441339909455596
0.4007497191429138 0.0314778022308723
0.21951274648308752 0.028141032061225996
0.28695654571056367 0.029150625351404308
0.38831553459167484 0.031192565644476238
0.43329969346523284 0.03224858706208135
0.4522267937660218 0.033354439882506214
0.5062754303216934 0.0340152870903079


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.6200921535491943 0.03817035258404204
0.66971195936203 0.041004798645482204
0.6487557649612427 0.039886334845461495
0.517014741897583 0.03443874906710919
0.5141117572784424 0.03429744550935372
0.47938133776187897 0.0332508098145078
0.4185767382383347 0.03167296763921275
0.13650801628828046 0.027371889094646024
0.12480487748980523 0.02668861189243332
0.36337817907333375 0.030724094935998394
0.41002176254987716 0.03252543979488455
0.5697489127516747 0.03659885453656718
0.6221856713294983 0.039010783048880796
0.6665889203548432 0.04106239958283114
0.6490744590759278 0.039360733606873886
0.46040191352367404 0.032190810334250905
0.3101478457450867 0.028500720846433295
0.4769534647464752 0.03340492109641019
0.37539814561605456 0.030236267228129762
0.5717860758304596 0.03780212514535913
0.5466070055961609 0.035474293434364076
0.6360764324665069 0.0402666525461645
0.6537694156169892 0.04041007074729472
0.7434109747409822 0.04617380308197682
0.30772777199745177 0.028261087564184995
0.475756314

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.6659651398658752 0.0421337440670465
0.657546091079712 0.041107005880180805
0.6089410543441772 0.03807109064170591
0.5710955679416656 0.036850358962847315
0.36747869849205017 0.03070349486357277
0.5312755197286605 0.03658284653624337
0.41950156688690193 0.032436463148733295
0.47278294116258623 0.03476254505355476
0.6181138515472413 0.04004401939701457
0.46009786725044255 0.033775092328926795
0.4607493758201599 0.03330889292848231
0.540388000011444 0.03641532681753735
0.3919626325368881 0.03155196156855416
0.38568795919418336 0.031319855800990816
0.40766712427139284 0.032032399253286936
0.2842600882053375 0.029999837014144694
0.40557583719491963 0.03193633480142755
0.14865717384964228 0.027093011217683663
0.4424884349107742 0.0347884577475213
0.46237850487232207 0.03337984428907345
0.5429707385599614 0.037231338448408896
0.6140968620777131 0.03868959830846316
0.6351337254047394 0.039448481025314945
0.6140982359647751 0.03838257877534585
0.5208653599023819 0.03489726743555915
0.52544087

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.42724353075027466 0.031245411116302576
0.4019877791404724 0.031874640681502295
0.520338848233223 0.03446258992254313
0.38193916380405424 0.030491569102331965
0.3366854816675186 0.0300538249107579
0.23649300038814547 0.02822242989920283
0.44748938083648687 0.03285973029220586
0.38864576220512387 0.03136560694027844
0.4997449487447739 0.034299422197706014
0.5799512147903443 0.036628228487845665
0.00271151326596737 0.0243476738899596
0.333021867275238 0.02963189022796249
0.4182150393724442 0.03279419605160166
0.41364098638296126 0.031999959623662824
0.53481425344944 0.03477682642237508
0.5173778384923935 0.03424239094410175
0.3779423356056214 0.030702917705640508
0.5295520007610321 0.03494911740898238
0.6368850886821747 0.042107311489598995
0.5732035130262375 0.036653371631991796
0.4225918680429458 0.030991446054635698
0.3394354254007339 0.030253096300133658
0.3652672797441482 0.029784176025374445
0.3260455965995789 0.029111442626818314
0.3986298426985741 0.03197724835716882
0.517431759

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


0.3151988387107849 0.02938562626053378
0.41697688102722164 0.03211990566171103
0.39517384469509126 0.03126259627559133
0.4329450875520706 0.03209750379857666
0.26020466387271884 0.028345157075576364
0.49688440859317784 0.0339273469669911
0.6283968329429627 0.04004521858767729
0.5631843566894531 0.03701176530092385
0.6281682401895523 0.03911241813136852
0.570615178346634 0.03630075968099692
0.6293206512928009 0.039998266317571994
0.40557352900505067 0.030978433633861976
0.5317772686481477 0.03516612515675873
0.5102715820074081 0.03417958730134347
0.46308151483535764 0.032358951425612584
0.5321902692317962 0.03489268011406767
0.4717711418867111 0.033803849906201916
0.4647497355937958 0.03262959183892128
0.3948894679546356 0.031116341082630273
0.37688651680946356 0.030214177435624046
0.3318141847848893 0.030057297545033355
0.09407161381095648 0.026036949811080082
0.36710375994443895 0.032068889534703236
0.5396051853895187 0.03921338416816146
0.5874967120587826 0.038571269893215035
0.41702

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


In [None]:
# Turn the per-tree result tuples into a table, replace missing values with 0,
# add the irrelevant/total ratio, index the table by tree and save it.
res_columns = [
    'Tree Name', 'N_comments', 'N_relevant', 'N_irrelevant',
    'Total Cosine mean', 'Relevant Cosine mean', 'Irrelevant Cosine mean',
    'Total Euclidean mean', 'Relevant Euclidean mean', 'Irrelevant Euclidean mean',
]
res_df = (
    pd.DataFrame(res_lst, columns=res_columns)
    .fillna(0)
    .assign(**{'% Irrelevant': lambda frame: frame['N_irrelevant'] / frame['N_comments']})
    .set_index('Tree Name')
)
res_df.to_csv('results_approach2.csv')

In [None]:
# Reload the per-tree results written by the previous cell.
# That cell saves the frame with 'Tree Name' as its index, so 'Tree Name' is the
# FIRST column of the CSV. Dropping column 0 unconditionally (the old iloc[:, 1:])
# removed it and made set_index('Tree Name') fail. Only stray positional-index
# columns ('Unnamed: 0', ...) that an older export may contain are dropped here.
res_df = pd.read_csv('results_approach2.csv')
res_df = res_df.loc[:, ~res_df.columns.str.startswith('Unnamed')]
res_df = res_df.set_index('Tree Name')

In [None]:
# Keep only the rows whose N_irrelevant exceeds 3 and save that subset.
has_enough_irrelevant = res_df['N_irrelevant'].gt(3)
res_df = res_df.loc[has_enough_irrelevant]
res_df.to_csv('result_new.csv')

## Create embeddings dataset

In [None]:
# Create an embeddings and class dataframe.
# One row per post: (tree id, sentence embedding, position in the tree, class).
# The root post is stored with Comment_n == -1 and Class == 1; comments are
# numbered 0, 1, 2, ... in the order they appear in `df`.
df_lst = []
for tree in trees:
  try:
    df_chose = df[df['tree_id'] == tree]
    # keep only posts with at least 7 whitespace-separated tokens
    df_chose = df_chose[df_chose['text'].apply(lambda x: len(x.split()) >= 7)]

    # process root (the post whose parent is -1); raises IndexError if it was filtered out
    root_txt = df_chose.loc[df_chose['parent'] == -1, 'text'].iloc[0]
    root_embedding = model.encode(preprocess_text(root_txt), convert_to_tensor=True)
    tree_rows = [(tree,root_embedding,-1,1)]

    # Every row after the first is treated as a comment.
    # NOTE(review): this assumes the root is the first row of the tree - confirm.
    comments = df_chose.iloc[1:]
    for count, (txt, irrelevance) in enumerate(zip(comments['text'], comments['Irrelevance'])):
      embedded_text = model.encode(preprocess_text(txt), convert_to_tensor=True)
      # Class comes from the row's own annotation (0 = Irrelevance == 0, 1 = otherwise).
      # Looking the text up in a list of relevant texts mislabelled duplicate texts
      # and was quadratic in the number of comments.
      cls = 0 if irrelevance == 0 else 1
      tree_rows.append((tree,embedded_text,count,cls))

    # Only keep a tree once all of its posts were embedded, so no partial trees are stored.
    df_lst.extend(tree_rows)

  except Exception as e:
    # Report which tree was skipped and why instead of swallowing the error silently.
    print(f'Error in {tree} - {e}')

total_df = pd.DataFrame(df_lst,columns = ['Tree Name','Embeddings','Comment_n','Class'])
total_df = total_df.set_index('Tree Name')
# The embeddings live in total_df (the original line addressed df, which has no such column).
total_df['Embeddings'] = total_df['Embeddings'].apply(lambda x: x.detach().cpu().numpy())

# CSV stores text only: write each embedding as a plain list so the file holds
# "[0.1, 0.2, ...]" literals that can be parsed back, not an array/tensor repr.
total_df.assign(Embeddings=total_df['Embeddings'].apply(lambda x: x.tolist())).to_csv('Embedded.csv')

In [None]:
# Reload the embeddings table from disk; the 'Embeddings' column comes back as text
# and 'Tree Name' as a regular column (both are handled in the next cell).
total_df = pd.read_csv('Embedded.csv')

In [None]:
import ast  # standard library; used only to parse the stored embeddings below

total_df = total_df.set_index('Tree Name')
# Apply the conversion function to every row in the DataFrame.
# Each cell holds an embedding as text such as "[0.1, 0.2, ...]". ast.literal_eval
# accepts only Python literals, so unlike eval it cannot execute code found in the file.
total_df['Embeddings'] = total_df['Embeddings'].apply(lambda x: torch.tensor(ast.literal_eval(x)))


ValueError: could not convert string to float: '...'

## K Sliding Window With root

In [None]:
# Placeholder for the combined sliding-window feature table; the K == 1 cell below
# assigns its result to all_df and the later K cells merge their columns into it.
all_df = pd.DataFrame()

### K == 1

In [None]:
# K == 1 with root: score every comment against the root post and the comment
# directly before it. Each weight set is (root weight, previous-comment weight).
weights = [(0.5,0.5), (0.25,0.75), (0.75,0.25)]
root_w = -1  # Comment_n value used to look up the root row of a tree
full_house = []
col_names = ['Tree_Name','n','Class','W1-K1-R','W2-K1-R','W3-K1-R']


result = total_df.groupby('Tree Name')['Class'].sum().reset_index()
trees = list(result['Tree Name'])


for tree in trees:
  try:
    print(tree)
    df_chose = total_df[total_df.index == tree]
    # [1:] drops the first row of the tree (assumed to be the root) so that index i
    # in these lists is the comment with Comment_n == i.
    order = list(df_chose['Comment_n'])[1:]
    cls =  list(df_chose['Class'])[1:]
    embeddings =  list(df_chose['Embeddings'])[1:]

    root_txt = df_chose.loc[df_chose['Comment_n'] == root_w, 'Embeddings'].iloc[0]

    tree_rows = []
    for i in order:
      cls_curr = cls[i]
      curr = embeddings[i]
      root_score = calculate_semantic_cosine(root_txt,curr)

      if i == 0:
        # the first comment has no predecessor: the root score fills all three columns
        scores = [root_score, root_score, root_score]
      else:
        # the similarity to the previous comment does not depend on the weights
        prev_score_0 = calculate_semantic_cosine(curr,embeddings[i-1])
        # the original expression also added an undefined `w3`, which made every
        # comment after the first fail
        scores = [w1 * root_score + w2 * prev_score_0 for w1,w2 in weights]

      tree_rows.append((tree,i,cls_curr,*scores))

    # add a tree only when all of its comments were scored
    full_house.extend(tree_rows)

  except Exception as e:
    # name the failing tree and the reason instead of printing a bare 'Error'
    print(f'Error in {tree} - {e}')

full_house_df = pd.DataFrame(full_house,columns = col_names)
all_df = full_house_df

4r2a4d


TypeError: new(): invalid data type 'str'

### K == 2

In [None]:
# K == 2 with root: score every comment against the root post and the two comments
# before it. Each weight set is (root, previous comment, comment before that).
weights = [(0.333,0.333,0.333), (0.30,0.45,0.25), (0.6,0.3,0.1)]
full_house = []
col_names = ['Tree_Name','n','Class','W1-K2-R','W2-K2-R','W3-K2-R']

result = total_df.groupby('Tree Name')['Class'].sum().reset_index()
trees = list(result['Tree Name'])

for tree in trees:
  try:
    df_chose = total_df[total_df.index == tree]
    # [1:] drops the first row of the tree (assumed to be the root) so that index i
    # in these lists is the comment with Comment_n == i.
    order = list(df_chose['Comment_n'])[1:]
    cls =  list(df_chose['Class'])[1:]
    embeddings =  list(df_chose['Embeddings'])[1:]

    # process root (stored with Comment_n == -1)
    root_txt = df_chose.loc[df_chose['Comment_n'] == -1, 'Embeddings'].iloc[0]

    tree_rows = []
    for i in order:
      cls_curr = cls[i]
      curr = embeddings[i]
      root_score = calculate_semantic_cosine(root_txt,curr)

      if i == 0:
        # no predecessor: root score only
        scores = [root_score] * 3
      elif i == 1:
        # only one predecessor: fixed warm-up weights, same value in all three columns
        prev_score = calculate_semantic_cosine(curr,embeddings[0])
        scores = [0.6 * root_score + 0.4 * prev_score] * 3
      else:
        # neighbour similarities are computed once and reused for every weight set
        prev_score_0 = calculate_semantic_cosine(curr,embeddings[i-1])
        prev_score_1 = calculate_semantic_cosine(curr,embeddings[i-2])
        scores = [w1 * root_score + w2 * prev_score_0 + w3 * prev_score_1
                  for w1,w2,w3 in weights]

      tree_rows.append((tree,i,cls_curr,*scores))

    # add a tree only when all of its comments were scored
    full_house.extend(tree_rows)

  except Exception as e:
    # name the failing tree and the reason instead of printing a bare 'Error'
    print(f'Error in {tree} - {e}')

full_house_df = pd.DataFrame(full_house,columns = col_names)
# Join on the comment identity rather than on row position, so a tree that failed in
# one cell cannot silently shift every later row onto the wrong comment.
all_df = all_df.merge(full_house_df.drop(columns=['Class']), on=['Tree_Name','n'])


### K == 3

In [None]:

# K == 3 with root: score every comment against the root post and the three comments
# before it. Each weight set is (root, previous, 2 back, 3 back).
# (The original line ended with a comma, which wrapped the list in a 1-tuple and made
# the 4-way unpacking below fail for every tree.)
weights = [(0.25,0.25,0.25,0.25), (0.5,0.25,0.15,0.10), (0.4,0.3,0.2,0.1)]
full_house = []
col_names = ['Tree_Name','n','Class','W1-K3-R','W2-K3-R','W3-K3-R']

result = total_df.groupby('Tree Name')['Class'].sum().reset_index()
trees = list(result['Tree Name'])

for tree in trees:
  try:
    df_chose = total_df[total_df.index == tree]
    # [1:] drops the first row of the tree (assumed to be the root) so that index i
    # in these lists is the comment with Comment_n == i.
    order = list(df_chose['Comment_n'])[1:]
    cls =  list(df_chose['Class'])[1:]
    embeddings =  list(df_chose['Embeddings'])[1:]

    # process root (stored with Comment_n == -1)
    root_txt = df_chose.loc[df_chose['Comment_n'] == -1, 'Embeddings'].iloc[0]

    tree_rows = []
    for i in order:
      cls_curr = cls[i]
      curr = embeddings[i]
      root_score = calculate_semantic_cosine(root_txt,curr)

      if i == 0:
        # no predecessor: root score only
        scores = [root_score] * 3
      elif i == 1:
        prev_score = calculate_semantic_cosine(curr,embeddings[0])
        scores = [0.6 * root_score + 0.4 * prev_score] * 3
      elif i == 2:
        # NOTE(review): in the warm-up rows the OLDER comment (index 0) gets the larger
        # weight, whereas the general case below weights the most recent comment first.
        # Kept as in the original - confirm this is intended.
        prev_score_0 = calculate_semantic_cosine(curr,embeddings[0])
        prev_score_1 = calculate_semantic_cosine(curr,embeddings[1])
        scores = [0.5 * root_score + 0.35 * prev_score_0 + 0.15 * prev_score_1] * 3
      else:
        # neighbour similarities are computed once and reused for every weight set
        prev_score_0 = calculate_semantic_cosine(curr,embeddings[i-1])
        prev_score_1 = calculate_semantic_cosine(curr,embeddings[i-2])
        prev_score_2 = calculate_semantic_cosine(curr,embeddings[i-3])
        scores = [w1 * root_score + w2 * prev_score_0 + w3 * prev_score_1 + w4 * prev_score_2
                  for w1,w2,w3,w4 in weights]

      tree_rows.append((tree,i,cls_curr,*scores))

    # add a tree only when all of its comments were scored
    full_house.extend(tree_rows)

  except Exception as e:
    # name the failing tree and the reason instead of printing a bare 'Error'
    print(f'Error in {tree} - {e}')

full_house_df = pd.DataFrame(full_house,columns = col_names)
# Join on the comment identity rather than on row position, so a tree that failed in
# one cell cannot silently shift every later row onto the wrong comment.
all_df = all_df.merge(full_house_df.drop(columns=['Class']), on=['Tree_Name','n'])


### K == 4

In [None]:
# K == 4 with root: score every comment against the root post and the four comments
# before it. Each weight set is (root, previous, 2 back, 3 back, 4 back).
# (The original line ended with a comma, which wrapped the list in a 1-tuple and made
# the 5-way unpacking below fail for every tree.)
weights = [(0.20,0.20,0.20,0.20,0.20), (0.4,0.20,0.15,0.15,0.10), (0.25,0.4,0.2,0.1,0.05)]
full_house = []
col_names = ['Tree_Name','n','Class','W1-K4-R','W2-K4-R','W3-K4-R']

result = total_df.groupby('Tree Name')['Class'].sum().reset_index()
trees = list(result['Tree Name'])
for tree in trees:
  try:
    df_chose = total_df[total_df.index == tree]
    # [1:] drops the first row of the tree (assumed to be the root) so that index i
    # in these lists is the comment with Comment_n == i.
    order = list(df_chose['Comment_n'])[1:]
    cls =  list(df_chose['Class'])[1:]
    embeddings =  list(df_chose['Embeddings'])[1:]

    # process root (stored with Comment_n == -1)
    root_txt = df_chose.loc[df_chose['Comment_n'] == -1, 'Embeddings'].iloc[0]

    tree_rows = []
    for i in order:
      cls_curr = cls[i]
      curr = embeddings[i]
      root_score = calculate_semantic_cosine(root_txt,curr)

      if i == 0:
        # no predecessor: root score only
        scores = [root_score] * 3
      elif i <= 4:
        # Warm-up rows use fixed weights; p[k] is the similarity to comment k, oldest first.
        # NOTE(review): here the OLDER comments get the larger weights, whereas the general
        # case below weights the most recent comment first. Kept as in the original - confirm.
        p = [calculate_semantic_cosine(curr,embeddings[k]) for k in range(i)]
        if i == 1:
          total_score = 0.6 * root_score + 0.4 * p[0]
        elif i == 2:
          total_score = 0.5 * root_score + 0.35 * p[0] + 0.15 * p[1]
        elif i == 3:
          # The original weights here were 0.5/0.35/0.15/0.10, which sum to 1.10. The root
          # weight is lowered to 0.4 (as in the i == 4 row) so the weights sum to 1.
          # NOTE(review): confirm the intended values.
          total_score = 0.4 * root_score + 0.35 * p[0] + 0.15 * p[1] + 0.10 * p[2]
        else:
          total_score = 0.4 * root_score + 0.30 * p[0] + 0.15 * p[1] + 0.10 * p[2] + 0.05 * p[3]
        scores = [total_score] * 3
      else:
        # neighbour similarities are computed once and reused for every weight set
        prev_score_0 = calculate_semantic_cosine(curr,embeddings[i-1])
        prev_score_1 = calculate_semantic_cosine(curr,embeddings[i-2])
        prev_score_2 = calculate_semantic_cosine(curr,embeddings[i-3])
        prev_score_3 = calculate_semantic_cosine(curr,embeddings[i-4])
        scores = [w1 * root_score + w2 * prev_score_0 + w3 * prev_score_1 + w4 * prev_score_2 + w5 * prev_score_3
                  for w1,w2,w3,w4,w5 in weights]

      tree_rows.append((tree,i,cls_curr,*scores))

    # add a tree only when all of its comments were scored
    full_house.extend(tree_rows)

  except Exception as e:
    # name the failing tree and the reason instead of printing a bare 'Error'
    print(f'Error in {tree} - {e}')

full_house_df = pd.DataFrame(full_house,columns = col_names)
# Join on the comment identity rather than on row position, so a tree that failed in
# one cell cannot silently shift every later row onto the wrong comment.
all_df = all_df.merge(full_house_df.drop(columns=['Class']), on=['Tree_Name','n'])


## K Sliding Window Without root

### K == 1

In [None]:
# K == 1 without root: the score of a comment is its similarity to the comment directly
# before it. With a single neighbour and no root term there is nothing to weight, so the
# three W columns hold the same value (the weight sets of the original cell were unpacked
# but never used in the score).
full_house = []
col_names = ['Tree_Name','n','Class','W1-K1-WR','W2-K1-WR','W3-K1-WR']


result = total_df.groupby('Tree Name')['Class'].sum().reset_index()
trees = list(result['Tree Name'])


for tree in trees:
  try:
    print(tree)
    df_chose = total_df[total_df.index == tree]
    # [1:] drops the first row of the tree (assumed to be the root) so that index i
    # in these lists is the comment with Comment_n == i.
    order = list(df_chose['Comment_n'])[1:]
    cls =  list(df_chose['Class'])[1:]
    embeddings =  list(df_chose['Embeddings'])[1:]

    tree_rows = []
    for i in order:
      cls_curr = cls[i]
      curr = embeddings[i]

      if i == 0:
        # the first comment has no predecessor to compare with
        total_score = 0
      else:
        total_score = calculate_semantic_cosine(curr,embeddings[i-1])

      tree_rows.append((tree,i,cls_curr,total_score,total_score,total_score))

    # add a tree only when all of its comments were scored
    full_house.extend(tree_rows)

  except Exception as e:
    # name the failing tree and the reason instead of printing a bare 'Error'
    print(f'Error in {tree} - {e}')

full_house_df = pd.DataFrame(full_house,columns = col_names)
# Join on the comment identity rather than on row position, so a tree that failed in
# one cell cannot silently shift every later row onto the wrong comment.
all_df = all_df.merge(full_house_df.drop(columns=['Class']), on=['Tree_Name','n'])


### K == 2

In [None]:
# K == 2 without root: score every comment against the two comments before it.
# Each weight set is (previous comment, comment before that).
weights = [(0.5,0.5), (0.333, 0.666), (0.666,0.333)]
full_house = []
col_names = ['Tree_Name','n','Class','W1-K2-WR','W2-K2-WR','W3-K2-WR']

result = total_df.groupby('Tree Name')['Class'].sum().reset_index()
trees = list(result['Tree Name'])

for tree in trees:
  try:
    df_chose = total_df[total_df.index == tree]
    # [1:] drops the first row of the tree (assumed to be the root) so that index i
    # in these lists is the comment with Comment_n == i.
    order = list(df_chose['Comment_n'])[1:]
    cls =  list(df_chose['Class'])[1:]
    embeddings =  list(df_chose['Embeddings'])[1:]

    tree_rows = []
    for i in order:
      cls_curr = cls[i]
      curr = embeddings[i]

      if i == 0:
        # the first comment has no predecessor to compare with
        scores = [0, 0, 0]
      elif i == 1:
        # only one predecessor: its similarity fills all three columns
        prev_score = calculate_semantic_cosine(curr,embeddings[0])
        scores = [prev_score] * 3
      else:
        # neighbour similarities are computed once and reused for every weight set
        prev_score_0 = calculate_semantic_cosine(curr,embeddings[i-1])
        prev_score_1 = calculate_semantic_cosine(curr,embeddings[i-2])
        scores = [w1 * prev_score_0 + w2 * prev_score_1 for w1,w2 in weights]

      tree_rows.append((tree,i,cls_curr,*scores))

    # add a tree only when all of its comments were scored
    full_house.extend(tree_rows)

  except Exception as e:
    # name the failing tree and the reason instead of printing a bare 'Error'
    print(f'Error in {tree} - {e}')

full_house_df = pd.DataFrame(full_house,columns = col_names)
# Join on the comment identity rather than on row position, so a tree that failed in
# one cell cannot silently shift every later row onto the wrong comment.
all_df = all_df.merge(full_house_df.drop(columns=['Class']), on=['Tree_Name','n'])


### K == 3

In [None]:
# K == 3 without root: score every comment against the three comments before it.
# Each weight set is (previous, 2 back, 3 back).
# (The original line ended with a comma, which wrapped the list in a 1-tuple, so the
# loop saw one "weight set" made of the three tuples and the weighted sum failed.)
weights = [(0.333,0.333,0.333), (0.55,0.30,0.15), (0.65,0.20,0.15)]
full_house = []
col_names = ['Tree_Name','n','Class','W1-K3-WR','W2-K3-WR','W3-K3-WR']

result = total_df.groupby('Tree Name')['Class'].sum().reset_index()
trees = list(result['Tree Name'])

for tree in trees:
  try:
    df_chose = total_df[total_df.index == tree]
    # [1:] drops the first row of the tree (assumed to be the root) so that index i
    # in these lists is the comment with Comment_n == i.
    order = list(df_chose['Comment_n'])[1:]
    cls =  list(df_chose['Class'])[1:]
    embeddings =  list(df_chose['Embeddings'])[1:]

    tree_rows = []
    for i in order:
      cls_curr = cls[i]
      curr = embeddings[i]

      if i == 0:
        # the first comment has no predecessor to compare with
        scores = [0, 0, 0]
      elif i == 1:
        prev_score = calculate_semantic_cosine(curr,embeddings[0])
        scores = [prev_score] * 3
      elif i == 2:
        # NOTE(review): in this warm-up row the OLDER comment (index 0) gets the larger
        # weight, whereas the general case below weights the most recent comment first.
        # Kept as in the original - confirm this is intended.
        prev_score_0 = calculate_semantic_cosine(curr,embeddings[0])
        prev_score_1 = calculate_semantic_cosine(curr,embeddings[1])
        scores = [0.65 * prev_score_0 + 0.35 * prev_score_1] * 3
      else:
        # neighbour similarities are computed once and reused for every weight set
        prev_score_0 = calculate_semantic_cosine(curr,embeddings[i-1])
        prev_score_1 = calculate_semantic_cosine(curr,embeddings[i-2])
        prev_score_2 = calculate_semantic_cosine(curr,embeddings[i-3])
        scores = [w1 * prev_score_0 + w2 * prev_score_1 + w3 * prev_score_2
                  for w1,w2,w3 in weights]

      tree_rows.append((tree,i,cls_curr,*scores))

    # add a tree only when all of its comments were scored
    full_house.extend(tree_rows)

  except Exception as e:
    # name the failing tree and the reason instead of printing a bare 'Error'
    print(f'Error in {tree} - {e}')

full_house_df = pd.DataFrame(full_house,columns = col_names)
# Join on the comment identity rather than on row position, so a tree that failed in
# one cell cannot silently shift every later row onto the wrong comment.
all_df = all_df.merge(full_house_df.drop(columns=['Class']), on=['Tree_Name','n'])


### K == 4

In [None]:
# K == 4 without root: score every comment against the four comments before it.
# Each weight set is (previous, 2 back, 3 back, 4 back).
# (The original line ended with a comma, which wrapped the list in a 1-tuple and made
# the 4-way unpacking below fail for every tree.)
weights = [(0.25,0.25,0.25,0.25), (0.4,0.3,0.20,0.10), (0.5,0.25,0.15,0.10)]
full_house = []
col_names = ['Tree_Name','n','Class','W1-K4-WR','W2-K4-WR','W3-K4-WR']

result = total_df.groupby('Tree Name')['Class'].sum().reset_index()
trees = list(result['Tree Name'])
for tree in trees:
  try:
    df_chose = total_df[total_df.index == tree]
    # [1:] drops the first row of the tree (assumed to be the root) so that index i
    # in these lists is the comment with Comment_n == i.
    order = list(df_chose['Comment_n'])[1:]
    cls =  list(df_chose['Class'])[1:]
    embeddings =  list(df_chose['Embeddings'])[1:]

    tree_rows = []
    for i in order:
      cls_curr = cls[i]
      curr = embeddings[i]
      # No root score here: the original computed one from `root_txt`, a variable this
      # cell never defines (it leaked from an earlier cell), and never used the result.

      if i == 0:
        # the first comment has no predecessor to compare with
        scores = [0, 0, 0]
      elif i <= 3:
        # Warm-up rows use fixed weights; p[k] is the similarity to comment k, oldest first.
        # NOTE(review): here the OLDER comments get the larger weights, whereas the general
        # case below weights the most recent comment first. Kept as in the original - confirm.
        p = [calculate_semantic_cosine(curr,embeddings[k]) for k in range(i)]
        if i == 1:
          total_score = p[0]
        elif i == 2:
          total_score = 0.6 * p[0] + 0.4 * p[1]
        else:
          total_score = 0.5 * p[0] + 0.35 * p[1] + 0.15 * p[2]
        scores = [total_score] * 3
      else:
        # neighbour similarities are computed once and reused for every weight set
        prev_score_0 = calculate_semantic_cosine(curr,embeddings[i-1])
        prev_score_1 = calculate_semantic_cosine(curr,embeddings[i-2])
        prev_score_2 = calculate_semantic_cosine(curr,embeddings[i-3])
        prev_score_3 = calculate_semantic_cosine(curr,embeddings[i-4])
        scores = [w1 * prev_score_0 + w2 * prev_score_1 + w3 * prev_score_2 + w4 * prev_score_3
                  for w1,w2,w3,w4 in weights]

      tree_rows.append((tree,i,cls_curr,*scores))

    # add a tree only when all of its comments were scored
    full_house.extend(tree_rows)

  except Exception as e:
    # name the failing tree and the reason instead of printing a bare 'Error'
    print(f'Error in {tree} - {e}')

full_house_df = pd.DataFrame(full_house,columns = col_names)
# Join on the comment identity rather than on row position, so a tree that failed in
# one cell cannot silently shift every later row onto the wrong comment.
all_df = all_df.merge(full_house_df.drop(columns=['Class']), on=['Tree_Name','n'])
all_df.to_csv('Final.csv')

## Length and readability score features

In [None]:
# For every comment collect (tree, position, character length, readability score).
full_house = []
for tree in trees:
  try:
    tree_posts = df[df['tree_id'] == tree]
    # same filter as the embeddings cell: at least 7 whitespace-separated tokens
    tree_posts = tree_posts[tree_posts['text'].apply(lambda x: len(x.split()) >= 7)]
    # skip the first post of the tree and number the remaining ones from 0
    for count, text in enumerate(list(tree_posts['text'])[1:]):
      full_house.append((tree, count, len(text), calculate_readability_score(text)))

  except Exception as e:
    print(f'Error in {tree} - {e}')

In [None]:
# full_house_df = pd.DataFrame(full_house,columns = ['Tree Name','i','Class','W1','W2','W3'])
# full_house_df = pd.DataFrame(full_house,columns = ['Tree Name','i','Length','Read_score'])
# full_house_df.to_csv('Full_house_more.csv')

## Classification with ML

In [None]:
# Load the combined feature table and discard its first three columns.
# NOTE(review): the remaining frame presumably starts with the Class label - confirm.
df = pd.read_csv('all.csv').iloc[:, 3:]

In [None]:
# Column 0 is the target; every other column is a feature.
y, X = df.iloc[:, 0].values, df.iloc[:, 1:].values

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

from xgboost import XGBClassifier


# Split the data into train and test sets (stratified)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

# Use RandomUnderSampler to perform undersampling on the majority class
# rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
# X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Use RandomOverSampler to perform oversampling on the minority class.
# Only the training split is resampled; the test split keeps its original class balance.
ros = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Initialize classifiers
classifiers = {
    'XGBoost': XGBClassifier(),
    # the model uses an RBF kernel, so it is labelled as such (it was labelled 'Linear SVM')
    'RBF SVM': SVC(kernel='rbf'),
    # penalty=None is the supported spelling for "no regularisation";
    # the string 'none' is deprecated in scikit-learn 1.2 and rejected by later releases
    'Logistic Regression': LogisticRegression(penalty=None),
    'KNN': KNeighborsClassifier(n_neighbors=11)
}

# Train and evaluate each classifier
for name, clf in classifiers.items():
    print(f"Training {name}...")
    clf.fit(X_resampled, y_resampled)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Display classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Display confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for {name}:")
    print(conf_matrix)
    print("\n" + "-"*50 + "\n")


Training XGBoost...
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.94      2190
           1       0.05      0.04      0.05       141

    accuracy                           0.89      2331
   macro avg       0.49      0.50      0.49      2331
weighted avg       0.89      0.89      0.89      2331

Confusion Matrix for XGBoost:
[[2076  114]
 [ 135    6]]

--------------------------------------------------

Training Linear SVM...
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.41      0.57      2190
           1       0.07      0.65      0.12       141

    accuracy                           0.42      2331
   macro avg       0.51      0.53      0.34      2331
weighted avg       0.89      0.42      0.54      2331

Confusion Matrix for Linear SVM:
[[ 892 1298]
 [  49   92]]

--------------------------------------------------

Training Logistic Regression...
Cl



Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.68      0.79      2190
           1       0.07      0.38      0.12       141

    accuracy                           0.66      2331
   macro avg       0.51      0.53      0.46      2331
weighted avg       0.89      0.66      0.75      2331

Confusion Matrix for KNN:
[[1492  698]
 [  87   54]]

--------------------------------------------------



## Classification with Thresholds

In [None]:
# Reload the feature table, dropping its first column and its last five columns,
# then display what is left.
df = pd.read_csv('all.csv').iloc[:, 1:-5]
df

Unnamed: 0,Tree Name,i,Class,W1_c,W2_c,W3_c
0,4r2a4d,0,0,0.315199,0.315199,0.315199
1,4r2a4d,1,0,0.416977,0.416977,0.416977
2,4r2a4d,2,0,0.395174,0.395174,0.395174
3,4r2a4d,3,0,0.371214,0.466018,0.432945
4,4r2a4d,4,0,0.245730,0.274229,0.260205
...,...,...,...,...,...,...
9316,7yf2le,39,0,0.590449,0.600688,0.585739
9317,7yf2le,40,0,0.513397,0.568745,0.553209
9318,7yf2le,41,0,0.253096,0.255947,0.250931
9319,7yf2le,42,1,0.121275,0.081300,0.088650


In [None]:
thresholds = [0.35, 0.4, 0.45, 0.5, 0.55, 0.60, 0.65, 0.7]

# Threshold function
def apply_threshold(value, threshold):
    """Return 1 when `value` is at or above `threshold`, otherwise 0."""
    if value >= threshold:
        return 1
    return 0

w1_c, w2_c, w3_c = (list(df[name]) for name in ('W1_c', 'W2_c', 'W3_c'))

# Add one 0/1 prediction column per (threshold, score column) pair,
# named '<column> - <threshold>'.
cols = ['W1_c', 'W2_c', 'W3_c']
for threshold_value in thresholds:
  for col in cols:
      predictions = df[col].apply(lambda value: apply_threshold(value, threshold_value))
      df[f'{col} - {threshold_value}'] = predictions

In [None]:
# Ground truth is the third column; every column from position 6 onwards
# is one list of thresholded predictions.
y_true = df.iloc[:, 2].values

cols_to_calc = df.columns[6:]
y_preds = [list(df[col]) for col in cols_to_calc]


In [None]:
# Print a classification report and a confusion matrix for every prediction column.
for col_name, y_pred in zip(cols_to_calc, y_preds):
    # Display classification report
    print(f'***********************************************   {col_name}   ***********************************************')
    print("Classification Report:")
    print(classification_report(y_true, y_pred))

    # Display confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix")
    print(conf_matrix)
    print("\n" + "-"*50 + "\n")

***********************************************   W1_c - 0.35   ***********************************************
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.25      0.39      8756
           1       0.07      0.82      0.12       565

    accuracy                           0.28      9321
   macro avg       0.51      0.53      0.26      9321
weighted avg       0.90      0.28      0.38      9321

Confusion Matrix
[[2179 6577]
 [ 104  461]]

--------------------------------------------------

***********************************************   W2_c - 0.35   ***********************************************
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.23      0.37      8756
           1       0.07      0.84      0.12       565

    accuracy                           0.27      9321
   macro avg       0.51      0.53      0.25      9321
weighted avg       0.90      0.27 