In [None]:
import pandas as pd
# Mount Google Drive so the workbook under MyDrive is reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Path of the Excel workbook; it is also handed to LoadInput in a later cell.
filePath= '/content/drive/MyDrive/createdebate_released_no_parse.xlsx'
# No sheet_name is given, so pandas reads only the first sheet here; later cells
# rebind `df` to the "post" sheet.
df = pd.read_excel(filePath)

Mounted at /content/drive


In [None]:
from LoadInput import LoadInput
from ConversationForest import ConversationForest
from InteractionNetwork import InteractionNetwork

In [None]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.1
    Uninstalling networkx-3.1:
      Successfully uninstalled networkx-3.1
Successfully installed networkx-2.8.8 node2vec-0.4.6


In [None]:
pip install gensim



In [None]:
import numpy as np
import networkx as nx
from node2vec import Node2Vec

class Node2VecEmbedding:
    """Learn node2vec embeddings for the nodes of a NetworkX-style graph.

    Training is delegated to the ``node2vec`` package, which samples its own
    p/q-biased walks. ``generate_random_walks`` / ``random_walk`` remain
    available as standalone weighted first-order walk helpers.
    """

    def __init__(self, graph: "nx.Graph"):
        self.graph = graph

    def train(self, embedding_size=128, walk_length=10, num_walks=100, p=1, q=1):
        """Train node2vec on ``self.graph`` and return ``{node: vector}``.

        Args:
            embedding_size: dimensionality of each embedding vector.
            walk_length: number of nodes in each sampled walk.
            num_walks: number of walks started from every node.
            p: node2vec return parameter.
            q: node2vec in-out parameter.

        Returns:
            Dict mapping each graph node to its embedding; empty for an empty graph.
        """
        if self.graph.number_of_nodes() == 0:
            return {}
        # Pass the requested walk settings straight through. Deriving them from a
        # pre-generated walk list made Node2Vec run num_walks * node_count rounds.
        model = self.learn_embeddings(
            None, embedding_size, p, q, walk_length=walk_length, num_walks=num_walks
        )
        return self.extract_node_embeddings(model)

    def generate_random_walks(self, walk_length, num_walks):
        """Return ``num_walks`` weighted random walks starting from every node."""
        walks = []
        for _ in range(num_walks):
            for node in self.graph.nodes():
                walks.append(self.random_walk(node, walk_length))
        return walks

    def random_walk(self, start_node, walk_length):
        """Sample one weighted random walk of at most ``walk_length`` nodes.

        The next node is drawn in proportion to the edge ``'weight'`` attribute
        (missing weights count as 1.0). The walk stops early at a node without
        neighbours. Nodes are returned as the original graph objects.
        """
        walk = [start_node]
        for _ in range(walk_length - 1):
            current_node = walk[-1]
            neighbors = list(self.graph.neighbors(current_node))
            if not neighbors:
                break
            weights = [self.graph[current_node][neighbor].get('weight', 1.0) for neighbor in neighbors]
            total_weight = sum(weights)
            # All-zero weights cannot be normalised; fall back to a uniform draw.
            probabilities = [weight / total_weight for weight in weights] if total_weight > 0 else None
            # Draw an index rather than the node itself: np.random.choice would
            # coerce labels to NumPy scalars and rejects tuple-valued nodes.
            chosen = np.random.choice(len(neighbors), p=probabilities)
            walk.append(neighbors[chosen])
        return walk

    def learn_embeddings(self, walks, embedding_size, p, q, walk_length=None, num_walks=None):
        """Fit a node2vec model on ``self.graph`` and return the fitted model.

        Args:
            walks: optional list of pre-generated walks. They are NOT fed to the
                model; they are only used to infer ``walk_length`` / ``num_walks``
                when those arguments are omitted.
            embedding_size: dimensionality of the embeddings.
            p: node2vec return parameter.
            q: node2vec in-out parameter.
            walk_length: nodes per walk; defaults to the longest walk in ``walks``.
            num_walks: walks per node; defaults to ``len(walks) // node_count``.

        Raises:
            ValueError: if a walk setting is omitted and ``walks`` is empty.
        """
        if (walk_length is None or num_walks is None) and not walks:
            raise ValueError("walk_length and num_walks are required when no walks are given")
        if walk_length is None:
            # Individual walks can be cut short at dead ends, so use the longest.
            walk_length = max(len(walk) for walk in walks)
        if num_walks is None:
            # `walks` holds one walk per node per round; Node2Vec wants the rounds.
            num_walks = max(1, len(walks) // max(1, self.graph.number_of_nodes()))

        # Create a Node2Vec instance
        node2vec = Node2Vec(
            self.graph,
            dimensions=embedding_size,
            walk_length=walk_length,
            num_walks=num_walks,
            p=p,
            q=q,
            workers=1,
        )

        # Learn embeddings from walks sampled by Node2Vec itself
        return node2vec.fit(window=5, min_count=0, batch_words=4)

    def extract_node_embeddings(self, model):
        """Return ``{node: vector}``; the model vocabulary is looked up by ``str(node)``."""
        return {node: model.wv[str(node)] for node in self.graph.nodes()}
# Load the workbook and build the conversation trees.
# NOTE(review): LoadInput, ConversationForest and InteractionNetwork are project
# modules not shown in this notebook; confirm their behaviour against their sources.
loadInput = LoadInput(filePath)
loadInput.loadDataFromAllSheets()
conversationForest = ConversationForest(loadInput)
conversationForest.buildConversationTrees()
trees = conversationForest.getAllConversationTrees()
import numpy as np

# Maps each tree key to the {node: vector} dict returned by Node2VecEmbedding.train.
all_embeddings = {}
# Loop through each conversation tree
for key, tree in trees.items():
    # For each tree, create an InteractionNetwork object
    iNetwork = InteractionNetwork(tree, [])
    # Only this heading is printed; the graph display below is commented out.
    print("Graphs before generating embeddings:")
    # iNetwork.showInteractionNetwork()  # Optionally show the graph before generating embeddings

    # Create Node2VecEmbedding object and generate embeddings for the tree
    node2vec_embedding = Node2VecEmbedding(iNetwork.graph)
    node2vec_embeddings = node2vec_embedding.train(embedding_size=128, walk_length=10, num_walks=100, p=1, q=1)

    # Add the embeddings to the 'all_embeddings' dictionary with tree key
    all_embeddings[key] = node2vec_embeddings
# Prints every vector of every tree, which produces a very large output.
print(all_embeddings)



Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/333 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 33300/33300 [23:01<00:00, 24.11it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/26 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 2600/2600 [00:02<00:00, 891.07it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/8 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 800/800 [00:00<00:00, 3372.63it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/35 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 3500/3500 [00:07<00:00, 484.79it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/31 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 3100/3100 [00:04<00:00, 760.32it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/3 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 300/300 [00:00<00:00, 6024.80it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/14 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1400/1400 [00:00<00:00, 1858.78it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/9 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 900/900 [00:00<00:00, 2820.34it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/5 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 500/500 [00:00<00:00, 4350.90it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/5 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 500/500 [00:00<00:00, 4392.82it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/15 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1500/1500 [00:01<00:00, 859.74it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/15 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1500/1500 [00:01<00:00, 872.59it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/15 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1500/1500 [00:01<00:00, 933.93it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/11 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1100/1100 [00:00<00:00, 1329.51it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/17 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1700/1700 [00:01<00:00, 1558.34it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/18 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1800/1800 [00:02<00:00, 717.95it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/9 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 900/900 [00:00<00:00, 2851.49it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/42 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 4200/4200 [00:09<00:00, 454.89it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/4 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 400/400 [00:00<00:00, 3109.85it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/40 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 4000/4000 [00:06<00:00, 611.90it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/9 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 900/900 [00:00<00:00, 2856.17it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/12 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1200/1200 [00:00<00:00, 2226.41it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/6 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 600/600 [00:00<00:00, 3970.62it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/10 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1000/1000 [00:00<00:00, 2540.07it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/5 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 500/500 [00:00<00:00, 2728.86it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/9 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 900/900 [00:00<00:00, 1629.60it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/11 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1100/1100 [00:00<00:00, 2319.84it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/29 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 2900/2900 [00:03<00:00, 747.78it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/17 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1700/1700 [00:01<00:00, 1582.95it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/18 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1800/1800 [00:02<00:00, 757.21it/s] 


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/9 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 900/900 [00:00<00:00, 2721.83it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/44 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 4400/4400 [00:10<00:00, 421.03it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/15 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1500/1500 [00:00<00:00, 1750.16it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/7 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 700/700 [00:00<00:00, 3048.20it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/9 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 900/900 [00:00<00:00, 2962.60it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/26 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 2600/2600 [00:03<00:00, 810.06it/s] 


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/9 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 900/900 [00:00<00:00, 1418.23it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/14 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1400/1400 [00:00<00:00, 1882.57it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/6 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 600/600 [00:00<00:00, 3668.88it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/11 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1100/1100 [00:00<00:00, 2325.38it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/16 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1600/1600 [00:01<00:00, 840.52it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/6 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 600/600 [00:00<00:00, 2272.16it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/18 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1800/1800 [00:01<00:00, 1254.56it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/9 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 900/900 [00:00<00:00, 2755.32it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/11 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1100/1100 [00:00<00:00, 1485.92it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/14 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1400/1400 [00:00<00:00, 1833.93it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/11 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1100/1100 [00:00<00:00, 2014.75it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/4 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 400/400 [00:00<00:00, 3074.42it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/11 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1100/1100 [00:00<00:00, 2046.57it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/24 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 2400/2400 [00:02<00:00, 803.07it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/11 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1100/1100 [00:00<00:00, 1321.41it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/15 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1500/1500 [00:00<00:00, 1517.22it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/7 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 700/700 [00:00<00:00, 3383.06it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/15 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 1500/1500 [00:01<00:00, 873.17it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/20 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 2000/2000 [00:03<00:00, 643.74it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/8 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 800/800 [00:00<00:00, 2953.81it/s]


Graphs before generating embeddings:


Computing transition probabilities:   0%|          | 0/5 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 500/500 [00:00<00:00, 2853.74it/s]


{'878_26906': {'27083': array([ 0.17680259,  0.09596797,  0.2353515 ,  0.3745354 , -0.38406911,
       -0.38603336, -0.03619308,  0.26679844,  0.35974604,  0.1085992 ,
        0.14782032,  0.04880539,  0.10577419, -0.19871967, -0.3441342 ,
       -0.18492454, -0.1320089 , -0.27490857, -0.17662904,  0.12944315,
       -0.01953092, -0.03201642, -0.24299361,  0.12659447,  0.03695673,
        0.24008457,  0.5881773 , -0.35751098, -0.27883995, -0.31414035,
       -0.2670243 , -0.24337807,  0.11317733,  0.18709385, -0.09766907,
        0.14651024,  0.19266142,  0.01767073, -0.28633785, -0.08063021,
        0.43985638, -0.3489748 , -0.12876265, -0.01536059,  0.03180503,
       -0.41093487, -0.38079742,  0.12917826,  0.17210323,  0.31790635,
       -0.18378367,  0.04920666,  0.10690738, -0.2777986 , -0.02348725,
        0.40547374, -0.00534963,  0.22934523, -0.3396953 ,  0.25012243,
       -0.08527719, -0.4587417 , -0.6420696 ,  0.16053116, -0.11201955,
        0.07226776, -0.1160249 , -0.2172

In [None]:
# Re-wrap every stored graph embedding as a NumPy array, keeping the
# {tree key: {node: vector}} layout. Values are copied unchanged; no rescaling
# is applied here.
import numpy as np

np.set_printoptions(suppress=True)
all_embeddings2 = {
    tree_key: {node: np.array(vector) for node, vector in node_vectors.items()}
    for tree_key, node_vectors in all_embeddings.items()
}
print(all_embeddings2)

{'878_26906': {'27083': array([ 0.17680259,  0.09596797,  0.2353515 ,  0.3745354 , -0.38406911,
       -0.38603336, -0.03619308,  0.26679844,  0.35974604,  0.1085992 ,
        0.14782032,  0.04880539,  0.10577419, -0.19871967, -0.3441342 ,
       -0.18492454, -0.1320089 , -0.27490857, -0.17662904,  0.12944315,
       -0.01953092, -0.03201642, -0.24299361,  0.12659447,  0.03695673,
        0.24008457,  0.5881773 , -0.35751098, -0.27883995, -0.31414035,
       -0.2670243 , -0.24337807,  0.11317733,  0.18709385, -0.09766907,
        0.14651024,  0.19266142,  0.01767073, -0.28633785, -0.08063021,
        0.43985638, -0.3489748 , -0.12876265, -0.01536059,  0.03180503,
       -0.41093487, -0.38079742,  0.12917826,  0.17210323,  0.31790635,
       -0.18378367,  0.04920666,  0.10690738, -0.2777986 , -0.02348725,
        0.40547374, -0.00534963,  0.22934523, -0.3396953 ,  0.25012243,
       -0.08527719, -0.4587417 , -0.6420696 ,  0.16053116, -0.11201955,
        0.07226776, -0.1160249 , -0.2172

In [None]:
#creating csv for graph embeddings
import csv

# Define the file name for the CSV file
csv_file_name = "all_embeddings2.csv"

# Write the all_embeddings2 dictionary to the CSV file, one row per node.
with open(csv_file_name, mode="w", newline="") as file:
    writer = csv.writer(file)

    # Write the header row (column names). The "Disscusion_id" spelling is kept
    # so the header matches the other CSV files written by this notebook.
    writer.writerow(["Disscusion_id", "Author", "G_Embeddings"])

    # Write the data rows
    for discussion, inner_dict in all_embeddings2.items():
        for author, embeddings in inner_dict.items():
            # Store a plain list: str(ndarray) is space-separated and wrapped over
            # several lines, so ast.literal_eval could not read it back.
            vector = embeddings.tolist() if hasattr(embeddings, "tolist") else embeddings
            writer.writerow([discussion, author, vector])

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.2 MB/s[0m eta [36m0:00:0

In [None]:
#  text extraction

import csv
import pandas as pd
import numpy as np
# "post" sheet: this cell reads its discussion_id, author_id and text_id columns.
df = pd.read_excel("/content/drive/MyDrive/createdebate_released_no_parse.xlsx",sheet_name="post")
# "text" sheet: this cell reads its text_id and text columns.
df2= pd.read_excel("/content/drive/MyDrive/createdebate_released_no_parse.xlsx",sheet_name="text")
def checkrepetetion(x,y):
  """Return True when exactly one row of the global `df` has discussion_id == x and author_id == y."""
  same_pair = (df["discussion_id"] == x) & (df["author_id"] == y)
  return int(same_pair.sum()) == 1

# Group the text ids of every post by discussion and author, in row order:
#   finaldict[discussion_id][author_id] -> [text_id, ...]
# One pass with setdefault builds the same grouping as the earlier
# count-then-branch logic without rescanning the frame for every row.
finaldict={}
for discussion_id, author_id, text_id in zip(df["discussion_id"], df["author_id"], df["text_id"]):
  finaldict.setdefault(discussion_id, {}).setdefault(author_id, []).append(text_id)
print(finaldict)

# `f` is an alias, not a copy: the loop below replaces the id lists inside
# finaldict itself with the concatenated text.
f=finaldict

# Index the text sheet once: str(text_id) -> all texts stored under that id,
# concatenated in sheet order. Comparing string forms on both sides lets ids
# match whether the sheet stores them as text or as numbers.
text_by_id = {}
for text_id, text in zip(df2["text_id"], df2["text"]):
  if pd.isna(text):
    continue  # empty cell: contributes no text
  lookup_key = str(text_id)
  text_by_id[lookup_key] = text_by_id.get(lookup_key, "") + str(text)

# Replace each author's id list by a one-element list holding all of that
# author's posts joined together (no separator, as before).
for i in finaldict:
  for j in finaldict[i]:
    f[i][j]=["".join(text_by_id.get(str(k), "") for k in finaldict[i][j])]
print(f)


{878: {27083: [513135, 513212, 513146, 513214, 513218, 513220], 5901: [513160, 513166, 513185, 513189], 11010: [513211], 8705: [513276, 513161, 513278, 512970, 513158, 513163], 24659: [513136, 513147, 513213, 513216, 513142, 513206], 3346: [513137, 513140], 11470: [513171], 27784: [513215, 513186, 513148], 17754: [513139, 513141], 25206: [513187, 513190], 26502: [513236], 15952: [513132], 15215: [512958], 19243: [513167], 12647: [512963, 513337, 513242, 513277, 512966], 9123: [512968], 18370: [513279], 6954: [513239, 513241], 7160: [513240], 20626: [513286], 15857: [513229], 11090: [513247], 6003: [513340], 28180: [513143, 513342], 18867: [513341], 10959: [513343, 513344], 16689: [513150, 513219, 513149], 8596: [513155, 513224], 15074: [513345], 11330: [513346, 513349], 2049: [512969], 17746: [512964, 512887, 513162], 24037: [513347], 16095: [513243], 15409: [513348, 513225], 19132: [513237], 28608: [513144, 512975, 512977], 2140: [513209, 513491], 24263: [513207], 1931: [513238], 2761

In [None]:
#generate embeddings for text

import torch
from transformers import BertModel, BertTokenizer

# Load the BERT model and tokenizer (the download progress bars appear in this
# cell's output).
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# NOTE: the MLP cell at the end of the notebook rebinds the global name `model`,
# so generate_embeddings must not be called after that cell has run.
model = BertModel.from_pretrained(model_name)

# Function to generate embeddings
def generate_embeddings(sentences):
    """Return one [CLS] embedding per input text, using the global BERT `model`.

    Args:
        sentences: sequence of strings. Each is padded or truncated to 512
            tokens, so text beyond that limit does not affect the result.

    Returns:
        NumPy array with one row per sentence; an array with zero rows for an
        empty input.
    """
    if len(sentences) == 0:
        # Nothing to encode: return an empty (0, hidden_size) array instead of
        # sending an empty batch through the model.
        return torch.empty((0, model.config.hidden_size)).numpy()

    # Tokenize each sentence once and reuse the result for both tensors.
    encoded = [
        tokenizer.encode_plus(sent, add_special_tokens=True, padding='max_length', truncation=True, max_length=512)
        for sent in sentences
    ]
    input_ids = torch.tensor([item['input_ids'] for item in encoded])
    attention_mask = torch.tensor([item['attention_mask'] for item in encoded])

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    # Extract embeddings from the [CLS] token (index 0)
    return outputs[0][:, 0, :].numpy()

# Function to convert the data to embeddings
def convert_data_to_embeddings(data):
    """Map ``{outer: {inner: sentences}}`` to ``{outer: {inner: embeddings as nested lists}}``.

    Each ``sentences`` value is passed to ``generate_embeddings`` and the
    returned array is converted with ``.tolist()``.
    """
    return {
        outer_key: {
            inner_key: generate_embeddings(texts).tolist()
            for inner_key, texts in per_author.items()
        }
        for outer_key, per_author in data.items()
    }

# Generate embeddings for the data
embeddings_result = convert_data_to_embeddings(f)
# Report sizes only: printing every embedding vector exceeded the notebook's
# IOPub data rate limit and the output was cut off.
author_count = sum(len(authors) for authors in embeddings_result.values())
print(f"Generated text embeddings for {author_count} authors in {len(embeddings_result)} discussions.")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
# Inspect one discussion. embeddings_result is keyed by the numeric discussion
# ids read from the "post" sheet, so the key must be an int; the string '9269'
# would raise KeyError.
print(embeddings_result[9269])

In [None]:
#convert text embeddings into csv file

import csv
# Target file for the text embeddings
csv_file_name = "text_embeddings.csv"

# Dump embeddings_result as one CSV row per (discussion, author) pair
with open(csv_file_name, mode="w", newline="") as file:
    writer = csv.writer(file)
    # Header row first, then all data rows in dictionary order
    writer.writerow(["Disscusion_id", "Author", "Embeddings"])
    writer.writerows(
        [discussion, author, embeddings]
        for discussion, inner_dict in embeddings_result.items()
        for author, embeddings in inner_dict.items()
    )

In [None]:
  #aggregation of graph and text embeddings

import numpy as np
# Coefficients for aggregation: result = a * graph embedding + b * text embedding
a = 0.8
b = 0.2


def _fit_length(vector, size):
    """Zero-pad or truncate a 1-D array so that it has exactly `size` entries."""
    if vector.shape[0] < size:
        return np.pad(vector, (0, size - vector.shape[0]), mode='constant')
    return vector[:size]


# Aggregating embeddings: aggregated_embeddings[discussion_id][author_id] -> list of floats
aggregated_embeddings = {}

for graph_key, graph_author_embeddings in all_embeddings2.items():
    graph_discussion_id = int(graph_key.split('_')[0])  # Extract discussion ID from the graph key
    # Real key lookup: a substring search in str(embeddings_result) also matched
    # ids that merely occur inside other numbers.
    if graph_discussion_id not in embeddings_result:
        print(f"Warning: Discussion ID {graph_discussion_id} not found in the text embeddings.")
        continue

    text_embeddings = embeddings_result[graph_discussion_id]
    # Length of this discussion's text vectors, used to size rows for authors
    # that have no text embedding (None if no text vector is available).
    text_dim = next((len(vectors[0]) for vectors in text_embeddings.values() if vectors), None)
    # Several graph keys can share one discussion id, so extend the existing
    # entry instead of resetting it; an author seen again is overwritten.
    discussion_embeddings = aggregated_embeddings.setdefault(graph_discussion_id, {})

    for graph_author_id, graph_emb_array in graph_author_embeddings.items():
        if "_" in graph_author_id:
            # Node labels containing "_" are skipped, as before.
            continue
        graph_author_id = int(graph_author_id)  # Convert author ID back to integer for matching with text_embeddings
        if graph_author_id in text_embeddings:
            text_emb_array = np.array(text_embeddings[graph_author_id][0])
            # Pad or truncate the graph vector to the text vector's length before mixing.
            aggregated_embedding = a * _fit_length(graph_emb_array, text_emb_array.shape[0]) + b * text_emb_array
        else:
            # Missing text embedding: treat it as all zeros, but keep the row the
            # same length as the other rows of this discussion.
            sized_graph_emb = graph_emb_array if text_dim is None else _fit_length(graph_emb_array, text_dim)
            aggregated_embedding = a * sized_graph_emb
            print(f"Warning: Author ID {graph_author_id} not found in the text embeddings for discussion ID {graph_discussion_id}.")
        discussion_embeddings[graph_author_id] = aggregated_embedding.tolist()


NameError: ignored

In [None]:
#convert aggregated dictionary to csv file

import csv
# Target file for the aggregated embeddings
csv_file_name = "aggregated_embeddings.csv"

# Dump aggregated_embeddings as one CSV row per (discussion, author) pair
with open(csv_file_name, mode="w", newline="") as file:
    writer = csv.writer(file)

    # Header row first, then all data rows in dictionary order
    writer.writerow(["Disscusion_id", "Author", "Embeddings"])
    writer.writerows(
        [discussion, author, embeddings]
        for discussion, inner_dict in aggregated_embeddings.items()
        for author, embeddings in inner_dict.items()
    )

In [None]:
#extract labels

import csv
import pandas as pd
import numpy as np
# Reload the "post" sheet; this cell reads its discussion_id, author_id and
# discussion_stance_id columns.
df = pd.read_excel("/content/drive/MyDrive/createdebate_released_no_parse.xlsx",sheet_name="post")
def checkrepetetion(x,y):
  """Return True when exactly one row of the global `df` has discussion_id == x and author_id == y."""
  same_pair = (df["discussion_id"] == x) & (df["author_id"] == y)
  return int(same_pair.sum()) == 1
# Group the stance ids of every post by discussion and author, in row order:
#   finaldict[discussion_id][author_id] -> [discussion_stance_id, ...]
# One pass with setdefault builds the same grouping as the earlier
# count-then-branch logic without rescanning the frame for every row.
finaldict={}
for discussion_id, author_id, stance_id in zip(df["discussion_id"], df["author_id"], df["discussion_stance_id"]):
  finaldict.setdefault(discussion_id, {}).setdefault(author_id, []).append(stance_id)
print(finaldict)

{878: {27083: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 5901: [0.0, 0.0, 1.0, 1.0], 11010: [1.0], 8705: [1.0, 1.0, 0.0, 1.0, 1.0, 1.0], 24659: [1.0, 0.0, 1.0, 0.0, 0.0, 1.0], 3346: [1.0, 1.0], 11470: [1.0], 27784: [1.0, 1.0, 1.0], 17754: [1.0, 1.0], 25206: [1.0, 0.0], 26502: [1.0], 15952: [0.0], 15215: [0.0], 19243: [0.0], 12647: [0.0, 1.0, 1.0, 1.0, 0.0], 9123: [0.0], 18370: [1.0], 6954: [1.0, 1.0], 7160: [0.0], 20626: [1.0], 15857: [1.0], 11090: [1.0], 6003: [1.0], 28180: [0.0, 2.0], 18867: [1.0], 10959: [1.0, 1.0], 16689: [1.0, 1.0, 0.0], 8596: [0.0, 1.0], 15074: [1.0], 11330: [1.0, 1.0], 2049: [0.0], 17746: [0.0, 0.0, 0.0], 24037: [1.0], 16095: [1.0], 15409: [1.0, 1.0], 19132: [1.0], 28608: [0.0, 0.0, 0.0], 2140: [1.0, 1.0], 24263: [0.0], 1931: [6.0], 27616: [0.0], 4666: [0.0], 22669: [1.0], 9619: [1.0], 4219: [1.0], 20083: [1.0], 9204: [0.0], 25419: [1.0], 20796: [1.0], 3327: [1.0], 19749: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 26420: [1.0], 21084: [0.0], 27894: [0.0], 17527: [0.0, 1.0], 7273

In [None]:
#find maximum count of labels

def findmax(list1):
    """Return the most frequent element of `list1`; ties go to the larger value.

    Raises IndexError when `list1` is empty.
    """
    winner = list1[0]
    for candidate in list1:
        winner_freq = list1.count(winner)
        candidate_freq = list1.count(candidate)
        if candidate_freq > winner_freq:
            winner = candidate
        elif candidate_freq == winner_freq:
            # Equal frequency: prefer the larger of the two values.
            winner = max(winner, candidate)
    return winner
# Collapse each author's list of stance ids into one label with findmax.
# A new dict is built instead of aliasing finaldict: the alias overwrote the
# lists in place, so re-running this cell called findmax on a number and failed.
labels_dict = {
  d_id: {a_id: findmax(label) for a_id, label in a_dict.items()}
  for d_id, a_dict in finaldict.items()
}

print(labels_dict)




{878: {27083: 0.0, 5901: 1.0, 11010: 1.0, 8705: 1.0, 24659: 1.0, 3346: 1.0, 11470: 1.0, 27784: 1.0, 17754: 1.0, 25206: 1.0, 26502: 1.0, 15952: 0.0, 15215: 0.0, 19243: 0.0, 12647: 1.0, 9123: 0.0, 18370: 1.0, 6954: 1.0, 7160: 0.0, 20626: 1.0, 15857: 1.0, 11090: 1.0, 6003: 1.0, 28180: 2.0, 18867: 1.0, 10959: 1.0, 16689: 1.0, 8596: 1.0, 15074: 1.0, 11330: 1.0, 2049: 0.0, 17746: 0.0, 24037: 1.0, 16095: 1.0, 15409: 1.0, 19132: 1.0, 28608: 0.0, 2140: 1.0, 24263: 0.0, 1931: 6.0, 27616: 0.0, 4666: 0.0, 22669: 1.0, 9619: 1.0, 4219: 1.0, 20083: 1.0, 9204: 0.0, 25419: 1.0, 20796: 1.0, 3327: 1.0, 19749: 1.0, 26420: 1.0, 21084: 0.0, 27894: 0.0, 17527: 1.0, 7273: 1.0, 18692: 0.0, 3085: 1.0, 5571: 1.0, 6213: 1.0, 17840: 1.0, 10906: 1.0, 19809: 1.0, 14313: 0.0, 20922: 0.0, 19509: 1.0, 18924: 1.0, 19804: 1.0, 26316: 1.0, 26071: 1.0, 16668: 1.0, 23602: 1.0, 9224: 1.0, 25163: 1.0, 23887: 1.0, 7305: 1.0, 14147: 8.0, 14931: 1.0, 22427: 1.0, 11164: 1.0, 1808: 1.0, 21752: 1.0, 12649: 1.0, 26212: 1.0, 27887: 0

In [None]:
#generate csv file for labels

import csv
# Target file for the labels
csv_file_name = "labels.csv"

# Dump labels_dict as one CSV row per (discussion, author) pair
with open(csv_file_name, mode="w", newline="") as file:
    writer = csv.writer(file)

    # Header row first, then all data rows in dictionary order
    writer.writerow(["Disscusion_id", "Author", "label"])
    writer.writerows(
        [discussion, author, label]
        for discussion, inner_dict in labels_dict.items()
        for author, label in inner_dict.items()
    )

In [None]:
#merge labels and embeddings

import csv
import pandas as pd

# Read back the two per-author tables written by earlier cells
aggregated_df = pd.read_csv("/content/aggregated_embeddings.csv")
labels_df = pd.read_csv("/content/labels.csv")

# Join them on the (discussion, author) key columns; merge's default join keeps
# only pairs present in both tables
merged_df = aggregated_df.merge(labels_df, on=["Disscusion_id", "Author"])

# Save the joined table without the index column
merged_csv_file_name = "merged_data.csv"
merged_df.to_csv(merged_csv_file_name, index=False)

print("Merged data saved to", merged_csv_file_name)


Merged data saved to merged_data.csv


In [None]:
#MLP Classifier

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import ast

# Seed shared by the train/test split and TensorFlow so reruns are comparable.
SEED = 123
tf.random.set_seed(SEED)

# Load the merged CSV file
merged_df = pd.read_csv("/content/merged_data.csv")
# Convert string embeddings to actual lists of floats
merged_df['Embeddings'] = merged_df['Embeddings'].apply(ast.literal_eval)

# Separate features (embeddings) and labels
X = np.array(merged_df['Embeddings'].tolist())  # Features (embeddings)
y = merged_df['label']  # Labels (stance ids)

# Map every distinct stance id to a contiguous class index 0..num_classes-1,
# as required by sparse_categorical_crossentropy. Sorting makes the mapping
# independent of row order.
label_mapping = {label: idx for idx, label in enumerate(sorted(y.unique()))}
y = y.map(label_mapping)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Standardize the features (statistics come from the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the MLP model for multi-class classification
# NOTE: this rebinds the global name `model` used by the BERT cell above.
num_classes = len(label_mapping)
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(X_train_scaled.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')  # Output layer for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, validation_split=0.1)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test_scaled, y_test)
print("Test accuracy:", accuracy)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)  # Get the predicted classes

# Print the classification report with the original stance ids as row names
# instead of the internal class indices.
class_names = list(label_mapping.keys())
present_classes = np.union1d(y_test, y_pred_classes)  # indices seen in truth or prediction
report = classification_report(
    y_test,
    y_pred_classes,
    labels=present_classes,
    target_names=[str(class_names[idx]) for idx in present_classes],
    zero_division=0,  # classes that are never predicted score 0 without a warning
)
print("Classification Report:")
print(report)
print("**********************************************")
print("summary")
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test accuracy: 0.6203703880310059
Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.51      0.49        76
           1       0.71      0.69      0.70       137
           3       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1

    accuracy                           0.62       216
   macro avg       0.24      0.24      0.24       216
weighted avg       0.62      0.62      0.62       216

**********************************************
summary
Model: "sequential_63"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_189 (D

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
