In [3]:
pip install convokit



In [4]:
pip install -U sentence-transformers



In [5]:
pip install xgboost



In [6]:
pip install -U scikit-learn



In [7]:
from convokit import Corpus, download
from sentence_transformers import SentenceTransformer
from ast import literal_eval
from xgboost import XGBClassifier
import numpy as np
import sklearn
from sklearn.metrics import accuracy_score, classification_report

  from tqdm.autonotebook import tqdm, trange


In [8]:
corpus = Corpus(filename=download("winning-args-corpus"))

Downloading winning-args-corpus to /root/.convokit/downloads/winning-args-corpus
Downloading winning-args-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/winning-args-corpus/winning-args-corpus.zip (73.7MB)... Done


In [9]:
ids = corpus.get_utterance_ids()

print(len(ids))

293297


In [10]:
SPEAKER_BLACKLIST = ['DeltaBot','AutoModerator']
training_trios = []

for id in ids:
  ut = corpus.get_utterance(id)
  if ut.reply_to == ut.conversation_id and (ut.meta['success'] == 1 or ut.meta['success'] == 0) and (ut.speaker.id not in SPEAKER_BLACKLIST):
    op = corpus.get_utterance(ut.conversation_id).text
    x = ut.text
    y = ut.meta['success']
    training_trios += [(op, x, y)]

print(len(training_trios))

train_len = len(training_trios)

8106


In [11]:
# sentence_transformer_model = 'Alibaba-NLP/gte-large-en-v1.5'
# embedder = SentenceTransformer(sentence_transformer_model, device = "cpu", trust_remote_code=True, truncate_dim = 1024)

In [12]:
# data = []
# counter = 0


# for trio in training_trios:
#   op, x, y =  trio
#   op_vec = embedder.encode(op)
#   x_vec = embedder.encode(x)
#   data += [(op_vec, x_vec, y)]
#   counter += 1

#   if counter % 250 == 0:
#     print(f"Completed {counter}/{train_len}")

# print(f"Total training pairs is {len(data)} - expected is 8106")

In [13]:
# with open("data.txt", "w") as f: # in case colab decides to end my runtime :(
#   for trio in data:
#     op, x, y = trio

#     op_s = ", ".join(str(es) for es in op)
#     x_s = ", ".join(str(ex) for ex in x)

#     f.write(f"{op_s}::{x_s}::{y}" + "\n")

In [14]:
with open("data.txt", "r") as f:
  lines = f.readlines()

print(len(lines))

input = []
output = []
for i, line in enumerate(lines):
  parts = line.split("::")
  op, x, y = parts

  op_i = literal_eval("[" + op + "]")
  x_i = literal_eval("[" + x + "]")
  y_i = int(y)

  input += [np.array(op_i + x_i)]
  output += [y_i]


8106


In [18]:
X = np.vstack(input)
Y = np.vstack(output)

train_len = len(X)
split = int(train_len * 0.7)
val = int(train_len * 0.9)

# Splitting the dataset
train_X = X[:split]
train_Y = Y[:split]

val_X = X[split:val]
val_Y = Y[split:val]

test_X = X[val:]
test_Y = Y[val:]

# Shuffling the data
randomize = np.arange(len(train_X))
np.random.shuffle(randomize)
train_X = train_X[randomize]
train_Y = train_Y[randomize]

randomize = np.arange(len(val_X))
np.random.shuffle(randomize)
val_X = val_X[randomize]
val_Y = val_Y[randomize]

randomize = np.arange(len(test_X))
np.random.shuffle(randomize)
test_X = test_X[randomize]
test_Y = test_Y[randomize]


In [21]:
param_grid = {
    'max_depth': [3, 6, 9, 12, 15, 20],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.5, 1],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 150, 200]
}

# Grid search for hyperparameter tuning
best_params = None
best_accuracy = 0

for max_depth in param_grid['max_depth']:
    for min_child_weight in param_grid['min_child_weight']:
        for gamma in param_grid['gamma']:
            for subsample in param_grid['subsample']:
                for colsample_bytree in param_grid['colsample_bytree']:
                    for learning_rate in param_grid['learning_rate']:
                        for n_estimators in param_grid['n_estimators']:
                            model = XGBClassifier(
                                max_depth=max_depth,
                                min_child_weight=min_child_weight,
                                gamma=gamma,
                                subsample=subsample,
                                colsample_bytree=colsample_bytree,
                                learning_rate=learning_rate,
                                n_estimators=n_estimators,
                                early_stopping_rounds=20
                            )

                            model.fit(train_X, train_Y, eval_set=[(val_X, val_Y)], verbose=False)

                            y_predict = model.predict(test_X)
                            accuracy = accuracy_score(test_Y, y_predict)

                            print("Accuracy", accuracy)

                            if accuracy > best_accuracy:
                                best_accuracy = accuracy
                                best_params = {
                                    'max_depth': max_depth,
                                    'min_child_weight': min_child_weight,
                                    'gamma': gamma,
                                    'subsample': subsample,
                                    'colsample_bytree': colsample_bytree,
                                    'learning_rate': learning_rate,
                                    'n_estimators': n_estimators
                                }

print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_accuracy)

# Train final model with best hyperparameters
final_model = XGBClassifier(**best_params)
final_model.fit(train_X, train_Y, eval_set=[(val_X, val_Y)], early_stopping_rounds=20, verbose=False)

y_train_predict = final_model.predict(train_X)
y_test_predict = final_model.predict(test_X)

print("TRAIN -------------------------")
print(accuracy_score(train_Y, y_train_predict))
print(classification_report(train_Y, y_train_predict))

print("\nTEST -------------------------")
print(accuracy_score(test_Y, y_test_predict))
print(classification_report(test_Y, y_test_predict))

Accuracy 0.5265104808877928
Accuracy 0.5363748458692972
Accuracy 0.5400739827373613
Accuracy 0.5536374845869297
Accuracy 0.5536374845869297
Accuracy 0.5536374845869297
Accuracy 0.5536374845869297
Accuracy 0.5536374845869297
Accuracy 0.5400739827373613
Accuracy 0.5400739827373613
Accuracy 0.5400739827373613
Accuracy 0.5400739827373613
Accuracy 0.5265104808877928
Accuracy 0.5265104808877928
Accuracy 0.5265104808877928
Accuracy 0.5265104808877928
Accuracy 0.530209617755857
Accuracy 0.5388409371146733
Accuracy 0.5524044389642416
Accuracy 0.5499383477188656
Accuracy 0.5363748458692972
Accuracy 0.5363748458692972
Accuracy 0.5363748458692972
Accuracy 0.5363748458692972
Accuracy 0.5240443896424167
Accuracy 0.5240443896424167
Accuracy 0.5240443896424167
Accuracy 0.5240443896424167
Accuracy 0.5043156596794082
Accuracy 0.5043156596794082
Accuracy 0.5043156596794082
Accuracy 0.5043156596794082
Accuracy 0.528976572133169
Accuracy 0.5400739827373613
Accuracy 0.5376078914919852
Accuracy 0.55980271270

KeyboardInterrupt: 