In [4]:
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import time

# from train import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
wv = KeyedVectors.load('embs_train.kv')
wv

<gensim.models.keyedvectors.KeyedVectors at 0x7faeb3faab80>

### Part 1

In [6]:
wv.most_similar('awful', topn=10)

[('horrible', 0.7597668170928955),
 ('terrible', 0.7478912472724915),
 ('dreadful', 0.7218177914619446),
 ('horrid', 0.6720177531242371),
 ('atrocious', 0.6626645922660828),
 ('ugly', 0.6236302256584167),
 ('lousy', 0.6135217547416687),
 ('unbelievable', 0.6068726181983948),
 ('appalling', 0.6061565279960632),
 ('hideous', 0.5811460614204407)]

In [7]:
wv.most_similar('student', topn=10)

[('students', 0.7294867038726807),
 ('teacher', 0.6301366090774536),
 ('school', 0.6055627465248108),
 ('undergraduate', 0.6020305752754211),
 ('university', 0.600540041923523),
 ('campus', 0.5629045367240906),
 ('academic', 0.5484224557876587),
 ('professors', 0.530981183052063),
 ('college', 0.525564968585968),
 ('grad', 0.5203014016151428)]

In [8]:
wv.most_similar('family', topn=10)

[('relatives', 0.6662653088569641),
 ('families', 0.6252894997596741),
 ('siblings', 0.6140849590301514),
 ('friends', 0.6128394603729248),
 ('mother', 0.6065612435340881),
 ('father', 0.5717043876647949),
 ('wife', 0.5601866245269775),
 ('son', 0.5384211540222168),
 ('clan', 0.5372899770736694),
 ('grandmother', 0.5366827845573425)]

In [9]:
wv.most_similar('success', topn=10)

[('successes', 0.724018394947052),
 ('successful', 0.6167578101158142),
 ('accomplishment', 0.49159350991249084),
 ('achievements', 0.4895521104335785),
 ('achievement', 0.4850277304649353),
 ('triumphs', 0.4617437720298767),
 ('greatness', 0.45542895793914795),
 ('progress', 0.44958776235580444),
 ('popularity', 0.4400866627693176),
 ('triumph', 0.43484923243522644)]

In [10]:
wv.most_similar('enhance', topn=10)

[('enhancing', 0.7954033613204956),
 ('improve', 0.7549501657485962),
 ('enhances', 0.7072708010673523),
 ('enhanced', 0.6955755352973938),
 ('bolster', 0.6009522080421448),
 ('elevate', 0.5607604384422302),
 ('enable', 0.5488600134849548),
 ('boost', 0.5353639721870422),
 ('reduce', 0.5311101675033569),
 ('fortify', 0.5228351354598999)]

In [11]:
 wv.most_similar(positive=['sister', 'man'], negative=['woman'], topn=5)

[('brother', 0.7966989874839783),
 ('uncle', 0.6753759980201721),
 ('nephew', 0.6596081852912903),
 ('son', 0.6472460031509399),
 ('father', 0.6398823261260986)]

In [12]:
 wv.most_similar(positive=['harder', 'fast'], negative=['hard'], topn=5)

[('faster', 0.7064898610115051),
 ('rapidly', 0.5021133422851562),
 ('easier', 0.48843103647232056),
 ('slow', 0.45752349495887756),
 ('quickly', 0.4370785653591156)]

In [13]:
 wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)

[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('kings', 0.5236844420433044),
 ('queens', 0.5181134343147278)]

In [14]:
 wv.most_similar(positive=['winter', 'hot'], negative=['cold'], topn=5)

[('summer', 0.5669187903404236),
 ('spring', 0.5186282396316528),
 ('hottest', 0.5085405707359314),
 ('summertime', 0.4723301827907562),
 ('season', 0.4058019518852234)]

In [15]:
 wv.most_similar(positive=['doctor', 'law'], negative=['medicine'], topn=5)

[('laws', 0.4727955460548401),
 ('lawyer', 0.4094833433628082),
 ('judge', 0.38079601526260376),
 ('legally', 0.3534584939479828),
 ('dentist', 0.34965550899505615)]

### Part 2

In [16]:
# path = 'train.txt'
def load_data(path):
    with open(path, 'r') as file:
        lines = file.readlines()

    # Extract labels and texts
    data = {'Label': [1 if line.startswith('+') else 0 for line in lines],
            'Text': [line[2:].strip() for line in lines]}  # Adjust slicing if necessary

    new_df = pd.DataFrame(data)
    return new_df

In [17]:
path = 'train.txt'
train_df = load_data(path)

In [18]:
path = 'dev.txt'
dev_df = load_data(path)

In [19]:
path = 'test.txt'
test_df = load_data(path)

In [20]:
train_y = train_df['Label']
train_X = train_df['Text']

dev_y = dev_df['Label']
dev_X = dev_df['Text']

test_X = test_df['Text']

In [21]:
print(train_X[0] == train_df.iloc[1, 1])

False


In [22]:
# Function to convert a sentence to a vector
def sentence_to_vector(sentence, wv):
    words = sentence.lower().split()
    valid_embeddings = [wv[word] for word in words if word in wv]
    if not valid_embeddings:
        return np.zeros(wv.vector_size)
    return np.mean(valid_embeddings, axis=0)

In [23]:
def sentence_vec(data):
    sentence_vectors = []
    for sentence in data:
        vector = sentence_to_vector(sentence, wv)
        sentence_vectors.append(vector)

    # Convert the list of sentence vectors to a NumPy array
    sentence_vectors = np.array(sentence_vectors)
    return sentence_vectors

In [24]:
x_train_vector = sentence_vec(train_X)
x_dev_vector =  sentence_vec(dev_X)

In [25]:
first_sentence = x_train_vector[0]

In [26]:
def find_similarity(sentence, sentence_vectors, feature, main_df, number):
    similarities = []
    label = None
    
    # Loop through each vector in sentence_vectors
    for sentence_vector in sentence_vectors:
        # Calculate cosine similarity with the first sentence
        similarity = cosine_similarity([sentence], [sentence_vector])
        # Append the similarity score to the list (accessing the first element as similarity returns a 2D array)
        similarities.append(similarity[0][0])
        
    similarities[number] = -1

    closest_sentence_idx = np.argmax(similarities)

    # Closest sentence
    closest_sentence = feature[closest_sentence_idx]
    
    target_sentence = closest_sentence
    for index, row in main_df.iterrows():
        sentence = row['Text']  # Replace 'Sentence' with your actual column name for sentences
        label = row['Label']        # Replace 'Label' with your actual column name for labels

        if sentence == target_sentence:
            if label == 0:
                label = "-"
            else:
                label = '+'
            break

    
    print(f"Main senetence: {feature[number]}.")
    print(f"Closest sentence label: {label}")
    print(f"Closest sentence: {closest_sentence}.")
    
    return closest_sentence

### 2.1.1

In [27]:
first_similarities = find_similarity(first_sentence, x_train_vector, train_X, train_df, 0)

Main senetence: it 's a tour de force , written and directed so quietly that it 's implosion rather than explosion you fear.
Closest sentence label: -
Closest sentence: a semi autobiographical film that 's so sloppily written and cast that you can not believe anyone more central to the creation of bugsy than the caterer had anything to do with it.


### 2.1.2

In [28]:
second_sentence = x_train_vector[1]

In [29]:
second_similarities = find_similarity(second_sentence, x_train_vector, train_X, train_df, 1)

Main senetence: places a slightly believable love triangle in a difficult to swallow setting , and then disappointingly moves the story into the realm of an improbable thriller.
Closest sentence label: -
Closest sentence: the plan to make enough into an inspiring tale of survival wrapped in the heart pounding suspense of a stylish psychological thriller ' has flopped as surely as a souffl gone wrong.


In [30]:
def knn_classifier(X_train, y_train, X_dev, y_dev):
    error_rates = {}

    for k in range(1, 100, 2):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)

        dev_predictions = knn.predict(X_dev)

        accuracy = accuracy_score(y_dev, dev_predictions)

        error_rate = 1 - accuracy

        error_rates[k] = [error_rate, knn]
   
    min_key = min(error_rates, key=lambda k: error_rates[k][0])
    print(f"Best k: {min_key}, smallest dev error: {error_rates[min_key][0]}")
    
    return error_rates[min_key][1]

### 2.1.3

In [31]:
best_knn = knn_classifier(x_train_vector, train_y, x_dev_vector, dev_y)

Best k: 73, smallest dev error: 0.278


In [32]:
# x_test_vector = sentence_vec(test_X)

In [33]:
# predicted = best_knn.predict(x_test_vector)

In [34]:
# predictions_df = pd.DataFrame(predicted, columns=['Text'])
# predictions_df.to_csv('predictions.csv', index=False)

### 2.1.4

In [35]:
train_y_hot = train_df['Label']
train_X_hot = train_df['Text']

dev_y_hot = dev_df['Label']
dev_X_hot = dev_df['Text']

test_X_hot = test_df['Text']

In [36]:
# Tokenization and conversion to numerical feature vectors using Bag-of-Words
vectorizer = CountVectorizer(binary=True)

train_X_vectorized = vectorizer.fit_transform(train_X_hot)
train_X_vectorized_dense = train_X_vectorized.toarray()

dev_X_vectorized = vectorizer.transform(dev_df['Text'])
# Convert the sparse matrix to a dense array
dev_X_vectorized_dense = dev_X_vectorized.toarray()

In [37]:
vectorizer.get_feature_names_out()

array(['007', '10', '100', ..., 'zoolander', 'zucker', 'zwick'],
      dtype=object)

In [38]:
best_knn = knn_classifier(train_X_vectorized_dense, train_y_hot, dev_X_vectorized_dense, dev_y_hot)

Best k: 15, smallest dev error: 0.383


### 2.2

In [52]:
# import sys
# import time
# from svector import svector

def read_from(textfile):
    for line in open(textfile):
        label, words = line.strip().split("\t")
        yield (1 if label=="+" else -1, words.split())
    
def test(devfile, model, word_vectors):
    tot, err = 0, 0
    for i, (label, words) in enumerate(read_from(devfile), 1): # note 1...|D|
        sentence_embedding = make_vector(words, word_vectors)
        prediction = np.dot(model, sentence_embedding)
        err += label * prediction <= 0
    return err/i  # i is |D| now

In [53]:
def make_vector(words, word_vectors):
    embeddings = [word_vectors[word] for word in words if word in word_vectors]
    if embeddings:
        return np.mean(embeddings, axis = 0)
    else:
        return np.zeros(word_vectors.vector_size)

In [54]:
def train_basic(trainfile, devfile, word_vectors, epochs=10):
    t = time.time()
    best_err = 1.
    model = np.zeros(word_vectors.vector_size)
    best_model = model.copy()
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1): # label is +1 or -1
            sent = make_vector(words, word_vectors)
            if label * np.dot(model, sent) <= 0:
                updates += 1
                model += label * sent
        dev_err = test(devfile, model, word_vectors)
        if dev_err < best_err:
            best_model = model.copy()
            best_err = dev_err
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))
    
    return best_model

In [55]:
train_basic('train.txt', 'dev.txt', wv, epochs=10)

epoch 1, update 31.1%, dev 37.4%
epoch 2, update 29.5%, dev 35.4%
epoch 3, update 29.8%, dev 33.2%
epoch 4, update 29.1%, dev 40.0%
epoch 5, update 29.7%, dev 35.2%
epoch 6, update 29.4%, dev 40.4%
epoch 7, update 29.4%, dev 38.4%
epoch 8, update 29.4%, dev 42.5%
epoch 9, update 29.1%, dev 39.0%
epoch 10, update 29.1%, dev 39.2%
best dev err 33.2%, |w|=300, time: 8.7 secs


array([-2.00598490e+00,  2.34847342e-01,  1.34461514e+00,  8.38283854e-01,
        7.76929314e-01, -2.16661256e+00, -1.76399479e+00,  4.66723936e-01,
       -1.35334088e-01, -6.64555523e-01, -6.87830750e-01, -2.71037435e+00,
       -1.34277877e+00, -4.96574975e-01, -4.87512803e-01, -2.46967947e-01,
       -6.17474215e-01, -4.65677193e-01, -7.64027532e-01,  2.95568793e-01,
        4.84064350e-01, -4.70414176e-01,  2.75841371e+00,  2.33994782e-01,
       -1.22336734e-01, -5.43849311e-01, -6.48343692e-01,  6.97254429e-01,
       -1.94000242e+00, -4.38251254e-01, -1.01712185e+00,  2.31187137e-01,
       -6.24905476e-01, -7.04480509e-01,  3.87558706e-01,  6.90070035e-01,
       -6.10672049e-01, -5.86942534e-01,  4.40280042e-03,  1.35342819e+00,
        1.22522119e+00,  2.11149208e-01, -9.80365908e-01,  1.48128555e+00,
        2.34985066e+00,  9.13465053e-01,  4.98046289e-01, -2.84336815e-01,
        8.32202857e-01,  1.04895266e+00, -8.58216584e-01, -1.82093867e+00,
       -2.32548645e-01, -

In [44]:
def smart_average(trainfile, devfile, word_vectors, epochs=10):
    t = time.time()
    best_err = 1.
    model = np.zeros(word_vectors.vector_size)
    avg_model = np.zeros(word_vectors.vector_size)
    c = 1
    for it in range(1, epochs+1):
        updates = 0
        for i, (label, words) in enumerate(read_from(trainfile), 1): # label is +1 or -1
            sent = make_vector(words, word_vectors)
            if label * np.dot(model, sent) <= 0:
                updates += 1
                update = label * sent
                model += update
                avg_model += c * update
            c += 1
        new_model = model - avg_model/c
        dev_err = test(devfile, new_model, word_vectors)
        if dev_err < best_err:
            best_model = new_model.copy()
            best_er = dev_err
            
        print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
    print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_er * 100, len(best_model), time.time() - t))
    
    return best_model

In [45]:
# smart average perceptron
model = smart_average('train.txt', 'dev.txt', wv, epochs=10)

epoch 1, update 31.1%, dev 24.9%
epoch 2, update 29.5%, dev 23.9%
epoch 3, update 29.8%, dev 24.3%
epoch 4, update 29.1%, dev 24.1%
epoch 5, update 29.7%, dev 24.2%
epoch 6, update 29.4%, dev 23.9%
epoch 7, update 29.4%, dev 23.6%
epoch 8, update 29.4%, dev 23.8%
epoch 9, update 29.1%, dev 24.1%
epoch 10, update 29.1%, dev 24.4%
best dev err 24.4%, |w|=300, time: 11.4 secs


In [46]:
# def train(trainfile, devfile, epochs=5):
#     t = time.time()
#     best_err = 1.
#     model = svector()
#     for it in range(1, epochs+1):
#         updates = 0
#         for i, (label, words) in enumerate(read_from(trainfile), 1): # label is +1 or -1
#             sent = make_vector(words)
#             if label * (model.dot(sent)) <= 0:
#                 updates += 1
#                 model += label * sent
#         dev_err = test(devfile, model)
#         best_err = min(best_err, dev_err)
#         print("epoch %d, update %.1f%%, dev %.1f%%" % (it, updates / i * 100, dev_err * 100))
#     print("best dev err %.1f%%, |w|=%d, time: %.1f secs" % (best_err * 100, len(model), time.time() - t))

In [47]:
with open('test.txt.predicted', 'w') as f:
    for i, (label, words) in enumerate(read_from("test.txt"), 1):
        f.write("%s\t%s\n" % ("+" if np.dot(model, make_vector(words, wv)) > 0 else "-", " ".join(words)))

### Part 3

In [48]:
train_y_final = train_df['Label']
train_X_final = train_df['Text']

dev_y_final = dev_df['Label']
dev_X_final = dev_df['Text']

test_X_final = test_df['Text']

In [49]:
# Convert sentences to numerical data
tfidf_vectorizer = TfidfVectorizer(min_df=1)
train_X_final_vectors = tfidf_vectorizer.fit_transform(train_X_final)

dev_X_final_vectors = tfidf_vectorizer.transform(dev_X_final)

In [50]:
start_time = time.time()

logistic_model = LogisticRegression()
logistic_model.fit(train_X_final_vectors, train_y_final)

logistic_predictions = logistic_model.predict(dev_X_final_vectors)

logistic_accuracy = accuracy_score(dev_y_final, logistic_predictions)

logistic_error = 1 - logistic_accuracy

end_time = time.time()
logistic_runtime = end_time - start_time
print(f'Logistic Regression dev error: {logistic_error}')
print(f'Logistic Regression runtime: {logistic_runtime} seconds')

start_time = time.time()

tree_model = DecisionTreeClassifier()
tree_model.fit(train_X_final_vectors, train_y_final)

tree_predictions = tree_model.predict(dev_X_final_vectors)

tree_accuracy = accuracy_score(dev_y_final, tree_predictions)

tree_error = 1 - tree_accuracy

end_time = time.time()
tree_runtime = end_time - start_time
print(f'Decision Tree dev error: {tree_error}')
print(f'Decision Tree runtime: {tree_runtime} seconds')

Logistic Regression dev error: 0.274
Logistic Regression runtime: 0.34636473655700684 seconds
Decision Tree dev error: 0.42700000000000005
Decision Tree runtime: 3.1895549297332764 seconds
