In [4]:
# Uncomment the next line on first use to install sentence-transformers into
# the kernel's environment, then run this cell once.
#%pip install -U sentence-transformers
# `IPython.display` is the public import location for the rich-display classes.
from IPython.display import HTML
# Emit a script that asks the notebook front end to restart the kernel so a
# freshly installed package becomes importable.
# NOTE(review): this relies on a `Jupyter` JavaScript global being present in
# the page — presumably classic Notebook only; confirm before using elsewhere.
HTML("<script>Jupyter.notebook.kernel.restart()</script>")


In [6]:
import pandas as pd
import os

def load_dataset(split):
    """
    Load one dataset split and return it as a Pandas DataFrame.

    Reads ``<split>/<split>.data.txt`` (tab-separated, no header row) and
    ``<split>/<split>.gold.txt`` (one label per line), both relative to the
    current working directory.

    Parameters
    ----------
    split : str
        Name of the split, e.g. 'train', 'dev' or 'test'.

    Returns
    -------
    pandas.DataFrame
        Columns ``target_word``, ``PoS``, ``example_1``, ``example_2``,
        ``index1``, ``index2`` (integers parsed from the ``i-j`` field)
        and ``label``.

    Raises
    ------
    ValueError
        If the label file does not contain exactly one label per data row.
    """
    import csv  # local import: only needed for the QUOTE_NONE constant

    data_file = os.path.join(split, f"{split}.data.txt")
    label_file = os.path.join(split, f"{split}.gold.txt")

    # The example sentences are free text and may contain double quotes. With
    # the default quoting rules a stray quote makes the parser swallow the
    # following tabs/newlines and merge fields or rows; QUOTE_NONE treats
    # quote characters as ordinary text.
    df = pd.read_csv(
        data_file,
        sep="\t",
        header=None,
        names=["target_word", "PoS", "indices", "example_1", "example_2"],
        quoting=csv.QUOTE_NONE,
    )

    # Split the "i-j" field into two integer columns, then drop the raw field.
    df[['index1', 'index2']] = df['indices'].str.split('-', expand=True).astype(int)
    df = df.drop(columns=['indices'])

    labels = pd.read_csv(label_file, header=None, names=["label"])

    # Labels are attached by position, so a row-count mismatch would silently
    # produce missing or shifted labels; fail loudly instead.
    if len(labels) != len(df):
        raise ValueError(
            f"{label_file} has {len(labels)} labels but {data_file} has {len(df)} rows"
        )
    df['label'] = labels['label'].to_numpy()

    return df

# Parse all three splits first, then write each one out as a flat CSV
# (without the index column) for the later cells to read back.
train_df, dev_df, test_df = (load_dataset(name) for name in ('train', 'dev', 'test'))

for frame, csv_path in ((train_df, 'train.csv'), (dev_df, 'dev.csv'), (test_df, 'test.csv')):
    frame.to_csv(csv_path, index=False)


In [9]:
# Simple sentence-concatenation Logistic Regression model using Word2Vec trained on the training data

import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the three splits back from the CSV files.
train_df, dev_df, test_df = (pd.read_csv(path) for path in ("train.csv", "dev.csv", "test.csv"))

# For each split: join the two example sentences with a space and encode the
# "T"/"F" labels as 1/0.
for split_df in (train_df, dev_df, test_df):
    split_df["combined_text"] = split_df["example_1"] + " " + split_df["example_2"]
    split_df["label"] = split_df["label"].map({"T": 1, "F": 0})

# Whitespace-tokenised version of every combined sentence, per split.
train_sentences, dev_sentences, test_sentences = (
    [text.split() for text in split_df["combined_text"]]
    for split_df in (train_df, dev_df, test_df)
)

# The Word2Vec model is fitted on the training sentences only.
word2vec_model = Word2Vec(
    sentences=train_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def get_sentence_vector(sentence, model, vector_size=100):
    """
    Average the word vectors of a whitespace-tokenised sentence.

    Parameters
    ----------
    sentence : str
        Text to embed; it is split on whitespace. A non-string value (such as
        the NaN pandas yields for a missing CSV cell) is treated as a sentence
        with no words.
    model : object
        Either a model that exposes its vectors as ``model.wv`` or the vector
        lookup itself. The lookup must support ``word in lookup`` and
        ``lookup[word]``.
    vector_size : int
        Length of the zero vector returned when no word of the sentence is
        found in the lookup.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known words, or
        ``np.zeros(vector_size)`` when there are none.
    """
    # Accept both a full model (vectors under `.wv`) and a bare vector lookup,
    # so the same helper works for trained and pre-trained embeddings.
    lookup = getattr(model, "wv", model)
    words = sentence.split() if isinstance(sentence, str) else []
    word_vectors = [lookup[word] for word in words if word in lookup]
    if len(word_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

# Embed every combined sentence of each split as an averaged word vector.
X_train, X_dev, X_test = (
    np.array([get_sentence_vector(text, word2vec_model) for text in split_df["combined_text"]])
    for split_df in (train_df, dev_df, test_df)
)

y_train, y_dev, y_test = train_df["label"], dev_df["label"], test_df["label"]

# Fit the classifier on the training split only.
clf = LogisticRegression()
clf.fit(X_train, y_train)

y_dev_pred, y_test_pred = clf.predict(X_dev), clf.predict(X_test)

# Report accuracy and the per-class breakdown for dev, then test.
for heading, y_true, y_hat in (
    ("Dev Set Performance:", y_dev, y_dev_pred),
    ("Test Set Performance:", y_test, y_test_pred),
):
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_hat):.4f}")
    print("Classification Report:\n", classification_report(y_true, y_hat))


Dev Set Performance:
Accuracy: 0.5486
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.77      0.63       319
           1       0.59      0.32      0.42       319

    accuracy                           0.55       638
   macro avg       0.56      0.55      0.52       638
weighted avg       0.56      0.55      0.52       638

Test Set Performance:
Accuracy: 0.5364
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.74      0.62       700
           1       0.56      0.33      0.41       700

    accuracy                           0.54      1400
   macro avg       0.54      0.54      0.52      1400
weighted avg       0.54      0.54      0.52      1400



In [10]:
# Simple Logistic Regression Model with pre-trained word2vec
import pandas as pd
import numpy as np
import gensim.downloader as api
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Read the three splits back from the CSV files.
train_df, dev_df, test_df = (pd.read_csv(path) for path in ("train.csv", "dev.csv", "test.csv"))

# For each split: join the two example sentences with a space and encode the
# "T"/"F" labels as 1/0.
label_codes = {"T": 1, "F": 0}
for split_df in (train_df, dev_df, test_df):
    split_df["combined_text"] = split_df["example_1"] + " " + split_df["example_2"]
    split_df["label"] = split_df["label"].map(label_codes)

# Fetch the pre-trained vectors through the gensim downloader.
print("Loading pre-trained Word2Vec model...")
word2vec_model = api.load("word2vec-google-news-300")

def get_sentence_vector(sentence, model, vector_size=300):
    """
    Average the word vectors of a whitespace-tokenised sentence.

    Parameters
    ----------
    sentence : str
        Text to embed; it is split on whitespace. A non-string value (such as
        the NaN pandas yields for a missing CSV cell) is treated as a sentence
        with no words.
    model : object
        Either the vector lookup itself or a model that exposes its vectors as
        ``model.wv``. The lookup must support ``word in lookup`` and
        ``lookup[word]``.
    vector_size : int
        Length of the zero vector returned when no word of the sentence is
        found in the lookup.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known words, or
        ``np.zeros(vector_size)`` when there are none.
    """
    # Accept both a bare vector lookup and a full model (vectors under `.wv`),
    # so the same helper works for pre-trained and locally trained embeddings.
    lookup = getattr(model, "wv", model)
    words = sentence.split() if isinstance(sentence, str) else []
    word_vectors = [lookup[word] for word in words if word in lookup]
    if len(word_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

# Embed every combined sentence of each split as an averaged word vector.
print("Transforming sentences into vectors...")
X_train, X_dev, X_test = (
    np.array([get_sentence_vector(text, word2vec_model) for text in split_df["combined_text"]])
    for split_df in (train_df, dev_df, test_df)
)

y_train, y_dev, y_test = train_df["label"], dev_df["label"], test_df["label"]

# Fit the classifier on the training split only.
print("Training Logistic Regression classifier...")
clf = LogisticRegression()
clf.fit(X_train, y_train)

y_dev_pred, y_test_pred = clf.predict(X_dev), clf.predict(X_test)

# Report accuracy and the per-class breakdown for dev, then test.
for heading, y_true, y_hat in (
    ("\nDev Set Performance:", y_dev, y_dev_pred),
    ("\nTest Set Performance:", y_test, y_test_pred),
):
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_hat):.4f}")
    print("Classification Report:\n", classification_report(y_true, y_hat))


Loading pre-trained Word2Vec model...
Transforming sentences into vectors...
Training Logistic Regression classifier...

Dev Set Performance:
Accuracy: 0.5470
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.51      0.53       319
           1       0.54      0.58      0.56       319

    accuracy                           0.55       638
   macro avg       0.55      0.55      0.55       638
weighted avg       0.55      0.55      0.55       638


Test Set Performance:
Accuracy: 0.5314
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.49      0.51       700
           1       0.53      0.57      0.55       700

    accuracy                           0.53      1400
   macro avg       0.53      0.53      0.53      1400
weighted avg       0.53      0.53      0.53      1400



In [2]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Read the three splits back from the CSV files and encode the
# "T"/"F" labels as 1/0.
train_df, dev_df, test_df = (pd.read_csv(path) for path in ("train.csv", "dev.csv", "test.csv"))

for split_df in (train_df, dev_df, test_df):
    split_df["label"] = split_df["label"].map({"T": 1, "F": 0})

# Step 2: Load the pre-trained sentence embedding model.
print("Loading Sentence Transformer model...")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Step 3: Function to compute sentence embeddings and cosine similarity
def compute_similarity(df, encoder=None):
    """
    Compute the cosine similarity between the two example sentences of each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the text columns ``example_1`` and ``example_2``.
    encoder : object, optional
        Object with an ``encode(list_of_str, convert_to_numpy=True)`` method
        returning one embedding row per input string. Defaults to the
        notebook-level ``model``.

    Returns
    -------
    numpy.ndarray
        1-D array with one similarity per row of ``df``; a pair in which
        either embedding is all zeros gets similarity 0.
    """
    if encoder is None:
        encoder = model
    if len(df) == 0:
        return np.zeros(0)

    def _unit_rows(matrix):
        # Scale every row to unit length; all-zero rows are left as zeros.
        matrix = np.asarray(matrix)
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        return matrix / norms

    sent1_embeddings = encoder.encode(df["example_1"].tolist(), convert_to_numpy=True)
    sent2_embeddings = encoder.encode(df["example_2"].tolist(), convert_to_numpy=True)

    # Only the matching pairs are needed, so take row-wise dot products of the
    # normalised embeddings instead of building the full N x N similarity
    # matrix and reading its diagonal.
    similarities = np.einsum("ij,ij->i", _unit_rows(sent1_embeddings), _unit_rows(sent2_embeddings))
    return similarities

# Step 4: One feature per example pair — the cosine similarity of its two
# sentences — shaped as a single-column matrix for the classifier.
print("Computing sentence similarities...")
X_train, X_dev, X_test = (
    compute_similarity(split_df).reshape(-1, 1)
    for split_df in (train_df, dev_df, test_df)
)

y_train, y_dev, y_test = train_df["label"], dev_df["label"], test_df["label"]

# Step 5: Fit a Logistic Regression classifier on the training split.
print("Training Logistic Regression classifier...")
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Step 6: Predict on dev and test, then report accuracy and the per-class
# breakdown for each.
y_dev_pred, y_test_pred = clf.predict(X_dev), clf.predict(X_test)

for heading, y_true, y_hat in (
    ("\nDev Set Performance:", y_dev, y_dev_pred),
    ("\nTest Set Performance:", y_test, y_test_pred),
):
    print(heading)
    print(f"Accuracy: {accuracy_score(y_true, y_hat):.4f}")
    print("Classification Report:\n", classification_report(y_true, y_hat))


  Referenced from: <31D2ED80-D446-353A-885A-F2032D05B554> /Users/jcjustin/opt/anaconda3/lib/python3.9/site-packages/torchvision/image.so
  Expected in:     <709C1DF5-D253-3C66-87E2-C99FD3A259DF> /Users/jcjustin/opt/anaconda3/lib/python3.9/site-packages/torch/lib/libtorch_cpu.dylib
  warn(f"Failed to load image Python extension: {e}")


ImportError: dlopen(/Users/jcjustin/opt/anaconda3/lib/python3.9/site-packages/pyarrow/lib.cpython-39-darwin.so, 0x0002): symbol not found in flat namespace '__ZN5arrow12ArrayBuilder12AppendScalarERKNS_6ScalarEx'