# Text embeddings

### Function to split the text when it is very long and would produce too many tokens

In [None]:
import openai
import pandas as pd
import numpy as np
from openai.embeddings_utils import get_embedding

# NOTE(review): "---" looks like a placeholder; supply a real key before running
# and avoid committing it to version control.
openai.api_key = "---"

# Embedding model parameters.
# NOTE(review): embedding_encoding and max_tokens are not referenced by the
# get_embedding function defined in this file -- confirm whether they are still needed.
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this is the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

#
def get_embedding(text, model=embedding_model):
    """Return an embedding for ``text``, averaging chunk embeddings for long inputs.

    The token count is estimated as 1.4 tokens per space-separated word. Texts
    whose estimate is at most 4000 tokens are embedded with a single API call.
    Longer texts are cut into consecutive, nearly equal chunks of at most
    ``int(4000 / 1.4)`` words; every non-blank chunk is embedded separately and
    the element-wise mean of the chunk embeddings is returned.

    NOTE: this definition shadows the ``get_embedding`` imported from
    ``openai.embeddings_utils`` at the top of the file.

    Args:
        text: The string to embed. Newlines are replaced by spaces before the
            text is sent to the API.
        model: Name of the embedding model passed to ``openai.Embedding.create``.

    Returns:
        The embedding as a list of numbers: the API's own list for short
        texts, or a list of Python floats (mean of the chunk embeddings) for
        long texts.
    """
    tokens_per_word = 1.4  # rough words -> tokens heuristic
    token_budget = 4000    # estimated tokens allowed per request

    def embed_one(piece):
        response = openai.Embedding.create(input=[piece], model=model)
        return response['data'][0]['embedding']

    words = text.split(" ")
    if tokens_per_word * len(words) <= token_budget:
        return embed_one(text.replace("\n", " "))

    # Ceil-divide so that every chunk stays within the word budget and the
    # chunks are (almost) equally long; this avoids both a zero chunk count
    # and a tiny trailing chunk that would skew the unweighted mean.
    max_words = int(token_budget / tokens_per_word)
    num_chunks = -(-len(words) // max_words)
    chunk_size = -(-len(words) // num_chunks)

    chunk_vectors = []
    for start in range(0, len(words), chunk_size):
        chunk = " ".join(words[start:start + chunk_size]).replace("\n", " ")
        # Skip chunks made only of whitespace; they must not count in the mean.
        if chunk.strip():
            chunk_vectors.append(embed_one(chunk))

    if not chunk_vectors:
        # Nothing but whitespace was found: fall back to a single request.
        return embed_one(text.replace("\n", " "))

    mean_vector = np.mean(np.asarray(chunk_vectors, dtype=np.float32), axis=0)
    # tolist() yields plain Python floats, so the value serialises to text the
    # same way as the short-text branch does.
    return mean_vector.tolist()

In [None]:
import pandas as pd
# Load the cleaned posts; the text to embed is read from the `cleaned_post` column.
df = pd.read_csv('items_cleaned.csv', encoding='utf-8')


def _embed_post(post):
    """Embed one cell of `cleaned_post`, coercing it to a string first."""
    return get_embedding(str(post), model=embedding_model)


# Embed every row (one get_embedding call per post) and store the result.
df["embedding"] = df["cleaned_post"].apply(_embed_post)
df.to_csv("sav_text_embeddings.csv", encoding='utf-8')

# Graph embeddings

### Build the network and create triples

In [None]:
import pandas as pd
import networkx as nx
df = pd.read_csv('file_with_item_posts_and_ids.csv')

# Undirected graph with three kinds of nodes: editors (UID), subjects (TID)
# and posts (PID).
# NOTE(review): node identity is the raw ID value, so a UID, TID and PID that
# share the same value would collapse into one node -- confirm the ID spaces
# are disjoint.
G = nx.Graph()
G.add_nodes_from(df['UID'].unique(), type='editor')   # add editors
G.add_nodes_from(df['TID'].unique(), type='subject')  # add subjects
print('number of nodes before edges all: ', G.number_of_nodes())

# One post node per row, linked to its subject and to its author.
for uid, tid, pid in zip(df['UID'], df['TID'], df['PID']):
    G.add_node(pid, type='post')
    G.add_edge(tid, pid, relation='includes')
    G.add_edge(uid, pid, relation='said')

print('number of nodes: ', G.number_of_nodes())
print('number of edges : ', G.number_of_edges())

# Save the edges as tab-separated (head, relation, tail) triples.
# The edges are read directly from the graph instead of re-parsing a
# space-joined text line, so IDs that contain spaces are written correctly.
# NOTE(review): the PyKEEN cell later in this file reads "triples.txt", not
# "save_triples.txt" -- confirm which file name is intended.
with open('save_triples.txt', 'w', encoding='utf-8') as f:
    for head, tail, relation in G.edges(data='relation'):
        row = f"{head}\t{relation}\t{tail}\n"
        print(row)
        f.write(row)

In [None]:
import pykeen
# Presumably reports the installed PyKEEN environment/versions -- see the PyKEEN docs.
pykeen.env()

# ---------------- Load the triples and build the triples factory ----------------


import pandas as pd

# Tab-separated file without a header row; every value is cast to str.
# NOTE(review): the graph cell earlier in this file writes "save_triples.txt",
# while this cell reads "triples.txt" -- confirm which file name is intended.
df=pd.read_csv("triples.txt", encoding='utf-8', delimiter='\t', header=None).astype("str")
df.columns=['subject', 'predicate', 'object']


# Build a single triples factory from all labelled triples.
from pykeen.triples import TriplesFactory
# The dataframe columns are 'subject', 'predicate', 'object' (assigned above).
triples_factory = TriplesFactory.from_labeled_triples(
    triples=df[['subject', 'predicate', 'object']].values , create_inverse_triples=True
)

# NOTE(review): no split is performed here -- training, validation and testing
# all point at the same factory, so the evaluation below is computed on the
# training triples. Confirm this is intended.
training = triples_factory
validation = triples_factory
testing = triples_factory

# Reverse lookups: integer id -> entity label / relation label.
d=training
id_to_entity={v: k for k, v in d.entity_to_id.items()}
id_to_relation={v: k for k, v in d.relation_to_id.items()}

# Bare expression: a notebook only displays it when it is the last statement of the cell.
triples_factory.triples


# ---------------- Train the knowledge-graph embedding model ----------------
# Runs the PyKEEN pipeline with the factories defined above.
from pykeen.pipeline import pipeline
import torch

result = pipeline(
    # Model architecture to train.
    model='CompGCN',
    loss="softplus",
    training=training,
    testing=testing,
    validation=validation,
    model_kwargs=dict(embedding_dim=100),  # embedding dimension
    optimizer_kwargs=dict(lr=0.1),  # learning rate
    training_kwargs=dict(num_epochs=100, use_tqdm_batch=False),  # number of epochs; no per-batch progress bar
)

# The trained model is stored in the pipeline result.
model = result.model

# Save the whole model object, then load it back from disk.
# NOTE(review): recent PyTorch releases may default torch.load to
# weights_only=True, which can refuse to unpickle a full model object --
# confirm against the installed version.
torch.save(model,'model.pkl')
my_pykeen_model = torch.load('model.pkl')


### ---------------- Print evaluation statistics ----------------

from pykeen.evaluation import RankBasedEvaluator

# Rank-based evaluation of the reloaded model on the testing triples, with the
# training and validation triples passed as additional filter triples.
evaluator = RankBasedEvaluator()
metrics = evaluator.evaluate(
    my_pykeen_model,
    testing.mapped_triples,
    additional_filter_triples=[
        training.mapped_triples,
        validation.mapped_triples,
    ],
)

# Report Hits@k for each cutoff, then the mean reciprocal rank.
for cutoff in (1, 3, 5, 10):
    hits_value = metrics.get_metric("hits@" + str(cutoff))
    print(f"Hits@{cutoff}: {hits_value}")
mrr_value = metrics.get_metric('mean_reciprocal_rank')
print(f"Mean Reciprocal Rank: {mrr_value}")



### -------------- save embeddings------------

a=my_pykeen_model.entity_representations
# NOTE(review): this reads the weight of the embedding wrapped inside the
# CompGCN representation through private attributes; whether these are the
# vectors wanted (as opposed to the representation's forward output) should be
# confirmed against the PyKEEN documentation.
b=a[0].combined.entity_representations._embeddings.weight # this is for CompGCN
# b=a[0]._embeddings.weight # this is for TransR
p=triples_factory.entity_to_id
r=triples_factory.relation_to_id

# Move the whole weight matrix to NumPy once instead of once per entity.
embedding_matrix = b.detach().cpu().numpy()

# Map each entity label to its embedding row. The loop variable is no longer
# called `id`, so the builtin is not shadowed for the rest of the session.
embedding_dict = {
    entity: embedding_matrix[entity_id] for entity, entity_id in p.items()
}

# One row per entity: the label is the index, followed by the embedding values.
df_embeddings_ids = pd.DataFrame.from_dict(embedding_dict, orient='index')

# Written without a header row.
df_embeddings_ids.to_csv('save_graph_embeddings.csv', encoding='utf-8', header=False)


# Neural Network training

In [None]:
import numpy as np
import ast
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.utils import class_weight

# Load the training data: a single CSV file that already contains every
# feature used for prediction. Exactly one of the read_csv lines below should
# be active; the commented ones select alternative datasets.

# Post-level datasets
# data = pd.read_csv('training_thread.csv',encoding='utf-8')
data = pd.read_csv('training_individual_posts.csv',encoding='utf-8')
# data = pd.read_csv('training_first_posts.csv',encoding='utf-8')

# Editor-level dataset
# NOTE(review): the path below starts with "/" (filesystem root) -- confirm.
# data = pd.read_csv('/training_editors.csv',encoding='utf-8')


In [None]:
from sklearn.utils import resample
from collections import Counter
from keras.callbacks import EarlyStopping 
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,confusion_matrix,classification_report




# Hidden-layer widths for every architecture code understood by run_keras.
_HIDDEN_LAYER_UNITS = {
    1.8: (8,),
    1.32: (32,),
    1.64: (64,),
    2: (8, 32),
    # NOTE(review): 364 is kept from the original code; it may have been
    # meant to be 64 -- confirm.
    2.1: (32, 364),
    3: (8, 32, 64),
}


def run_keras(concatenated_arr, status, n):
    """Train and evaluate a small dense binary classifier, printing the results.

    The data is split 60% / 20% / 20% into train, validation and test sets
    (fixed random_state). A feed-forward network selected by ``n`` is trained
    with balanced class weights and early stopping on the validation loss,
    then evaluated on the test set. All results are printed; nothing is
    returned.

    Args:
        concatenated_arr: 2-D feature array, one row per sample.
        status: Target labels aligned with the rows of ``concatenated_arr``
            (the output layer is a single sigmoid unit, so binary 0/1 labels
            are presumably expected).
        n: Architecture code; one of 1.8, 1.32, 1.64, 2, 2.1 or 3.

    Raises:
        ValueError: If ``n`` is not a supported architecture code.
    """
    # Fail fast on an unknown code instead of hitting an unbound variable
    # after the data has already been split.
    hidden_units = _HIDDEN_LAYER_UNITS.get(n)
    if hidden_units is None:
        raise ValueError(
            f"Unsupported architecture code n={n!r}; "
            f"expected one of {sorted(_HIDDEN_LAYER_UNITS)}"
        )

    print('============= n= ', str(n), ' ================')
    print('=============================================')
    print('\n')

    # Keep 60% for training and hold out 40% ...
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        concatenated_arr,
        status,
        test_size=0.4,
        random_state=42
    )
    # ... then cut the held-out part in half: 20% validation, 20% test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout,
        y_holdout,
        test_size=0.5,
        random_state=42
    )

    # Stop when the validation loss has not improved for 20 epochs.
    es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

    # Build the network: input -> ReLU hidden layers -> one sigmoid unit.
    inputs = Input(shape=(X_test.shape[1],))
    x = inputs
    for units in hidden_units:
        x = Dense(units, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x)

    # Balanced class weights computed from the training labels.
    classes = np.unique(y_train)
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=classes,
                                                      y=y_train)

    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train,
              y_train,
              epochs=100,
              batch_size=128,
              validation_data=(X_val, y_val),
              class_weight=dict(zip(classes, class_weights)),
              callbacks=[es]
              )

    print(Counter(y_test))
    predictions = model.predict(X_test)

    # Round the sigmoid outputs to hard 0/1 class labels.
    y_pred_class = np.round(predictions)

    report = classification_report(y_test, y_pred_class)
    precision = precision_score(y_test, y_pred_class, average='weighted')
    recall = recall_score(y_test, y_pred_class, average='weighted')
    f1 = f1_score(y_test, y_pred_class, average='weighted')
    acc = accuracy_score(y_test, y_pred_class)
    cm = confusion_matrix(y_test, y_pred_class)

    # The "tree" wording in the labels is kept as-is so that the log format
    # stays unchanged.
    print("Precision tree:", precision)
    print("Recall tree:", recall)
    print("F1 Score tree:", f1)
    print("Acc score tree:", acc)
    print("Confusion matrix:")
    print(cm)

    print("Classification Report:\n", report)

    # Loss and accuracy as computed by Keras on the test set.
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

    # Blank lines separating consecutive runs in the log.
    for _ in range(4):
        print('\n')

### transform data to arrays

In [None]:
# Remove rows with a negative account age. The explicit copy makes the column
# assignments below operate on an independent frame instead of on a slice of
# the original one (avoids pandas' SettingWithCopy warning).
data = data[data['account_age'] >= 0].copy()

# Convert the embeddings from their string form to float32 arrays.
data['graph'] = data['graph'].apply(lambda x: np.array(ast.literal_eval(x), dtype='float32'))
data['text'] = data['text'].apply(lambda x: np.array(ast.literal_eval(x), dtype='float32'))

# Convert the columns to arrays.
t_embeddings = np.vstack(data['text'].values)
g_embeddings = np.vstack(data['graph'].values)
labels = np.array(data['answer2post'])
age = np.array(data['account_age'])
edits = np.array(data['num_edits'])
posts = np.array(data['num_posts'])
status = np.array(data['status'])
rights = np.array(data['rights'])


# Normalise the continuous values (the scaler is re-fitted for each feature).
# NOTE(review): the scaler is fitted on the whole dataset before run_keras
# splits it into train/validation/test -- confirm this is acceptable.
scaler = StandardScaler()
age_norm = scaler.fit_transform(age.reshape(-1, 1)).flatten()
edits_norm = scaler.fit_transform(edits.reshape(-1, 1)).flatten()
posts_norm = scaler.fit_transform(posts.reshape(-1, 1)).flatten()

# Encode the categorical values as integer codes (re-fitted for each feature).
label_encoder = LabelEncoder()
status_enc = label_encoder.fit_transform(status)
# label_enc=label_encoder.fit_transform(labels)
rights_enc = label_encoder.fit_transform(rights)

### Concatenate data and run the different cases

In [None]:
# Choose ONE feature set by leaving exactly one `concatenated_arr = ...`
# assignment (or group) uncommented. `file` is the tag used in the log file name.
file='G_Tsum'

# --- features only ---
# concatenated_arr=np.vstack(age_norm)
# concatenated_arr=np.vstack(edits_norm)
# concatenated_arr=np.vstack(posts_norm)
# concatenated_arr=np.vstack(rights_enc)
# concatenated_arr=np.vstack(status_enc)
# concatenated_arr = np.vstack((age_norm, edits_norm,posts_norm,status_enc,rights_enc )).T
# concatenated_arr = np.vstack((age_norm, edits_norm,posts_norm,rights_enc )).T

# --- graph only ---
# concatenated_arr = g_embeddings

# --- text only ---
# concatenated_arr = t_embeddings

# --- features and graph ---
# # concatenated_arr = np.vstack((age_norm, edits_norm,posts_norm,status_enc,rights_enc)).T
# concatenated_arr = np.vstack((age_norm, edits_norm,posts_norm,rights_enc )).T
# concatenated_arr=np.concatenate((concatenated_arr,g_embeddings),axis=1)

# --- features and text ---
# concatenated_arr = np.vstack((age_norm, edits_norm,posts_norm,status_enc,rights_enc)).T
# concatenated_arr = np.vstack((age_norm, edits_norm,posts_norm,rights_enc )).T
# concatenated_arr=np.concatenate((concatenated_arr,t_embeddings),axis=1)

# --- graph and text (active) ---
concatenated_arr = np.concatenate((g_embeddings,t_embeddings),axis=1)

# --- all: features, graph and text ---
# # concatenated_arr = np.vstack((age_norm, edits_norm,posts_norm,status_enc,rights_enc)).T
# concatenated_arr = np.vstack((age_norm, edits_norm,posts_norm,rights_enc )).T
# concatenated_arr=np.concatenate((concatenated_arr,g_embeddings,t_embeddings),axis=1)

# --- age and text ---
# # concatenated_arr = np.vstack((age_norm, edits_norm,posts_norm,status_enc,rights_enc)).T
# concatenated_arr = np.vstack(age_norm)
# concatenated_arr=np.concatenate((concatenated_arr,t_embeddings),axis=1)

import sys

# Architecture codes understood by run_keras.
dense_list=[1.8, 1.32, 1.64, 2, 2.1, 3]

# Send everything printed during the runs to a log file. The `with` block
# closes (and therefore flushes) the file, and the `finally` clause restores
# the real stdout even if one of the runs raises.
# UTF-8 is set explicitly so non-ASCII output does not depend on the platform
# default encoding.
stdoutOrigin=sys.stdout
with open("log_output_posts_"+str(file)+".txt", "w", encoding="utf-8") as log_file:
    sys.stdout = log_file
    try:
        for n in dense_list:
            run_keras(concatenated_arr,labels, n)
    finally:
        sys.stdout = stdoutOrigin

