In [1]:
import csv
import json
import numpy as np
import pandas as pd
import gensim.downloader as api
from gensim.models import KeyedVectors
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split  

In [2]:
# load Word2Vec model from gensim
word2vec_model = api.load("word2vec-google-news-300")

In [3]:
# Load fastText model
fasttext_model = api.load('fasttext-wiki-news-subwords-300')

In [4]:
# Load GloVe model
glove_model = api.load("glove-wiki-gigaword-300")

In [14]:
# Read JSON file and write data to CSV file
with open('train_rand_split.jsonl', 'r', encoding='utf-8') as json_file, open('data.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    
    # Write header
    writer.writerow(['answerKey', 'id', 'question_concept', 'stem', 'choice_label', 'choice_text'])
    
    # Process each line in JSON lines file
    for line in json_file:
        data = json.loads(line)
        
        # Extract 
        answer_key = data['answerKey']
        question_id = data['id']
        question_concept = data['question']['question_concept']
        stem = data['question']['stem']
        
        # Extract choices
        choices = data['question']['choices']
        for choice in choices:
            choice_label = choice['label']
            choice_text = choice['text']
            
            # Write data to CSV file
            writer.writerow([answer_key, question_id, question_concept, stem, choice_label, choice_text])

print("CSV file created is done.")

CSV file created is done.


In [15]:
data = pd.read_csv('data.csv')

In [16]:
# Split the data ==> features (X) and labels (y)
X_text = data[['stem', 'choice_text']]  # stem and choice_text ==> as features
y = data['answerKey']

In [17]:
# generate word embeddings for text --> using a given word embedding model
def generate_word_embeddings(text_column, word_embedding_model):
    word_embeddings = []
    for text in text_column:
        words = text.split()
        embeddings = [word_embedding_model.get_vector(word) for word in words if word in word_embedding_model.key_to_index]
        if embeddings:
            avg_embedding = np.mean(embeddings, axis=0)
            word_embeddings.append(avg_embedding)
        else:
            # If no embeddings found for any word, use zero vector
            word_embeddings.append(np.zeros(word_embedding_model.vector_size))
    return np.array(word_embeddings)


In [18]:
# Generate word embeddings for each text column using different word embedding models: 
X_word2vec_stem = generate_word_embeddings(X_text['stem'], word2vec_model)
X_word2vec_choice = generate_word_embeddings(X_text['choice_text'], word2vec_model)

X_fasttext_stem = generate_word_embeddings(X_text['stem'], fasttext_model)
X_fasttext_choice = generate_word_embeddings(X_text['choice_text'], fasttext_model)

X_glove_stem = generate_word_embeddings(X_text['stem'], glove_model)
X_glove_choice = generate_word_embeddings(X_text['choice_text'], glove_model)

In [19]:
# Split the data ==> training + testing sets
X_train, X_test, y_train, y_test = train_test_split(X_word2vec_stem, y, test_size=0.2, random_state=42)

In [20]:
# Train logistic regression models for each word embedding model
model_word2vec = LogisticRegression()
model_word2vec.fit(X_train, y_train)

# Assuming X_fasttext_stem and X_glove_stem are the appropriate feature matrices for fastText and GloVe respectively
# If not, replace them with the correct feature matrices
X_fasttext_stem_train, X_fasttext_stem_test = train_test_split(X_fasttext_stem, test_size=0.2, random_state=42)
X_glove_stem_train, X_glove_stem_test = train_test_split(X_glove_stem, test_size=0.2, random_state=42)

model_fasttext = LogisticRegression()
model_fasttext.fit(X_fasttext_stem_train, y_train)

model_glove = LogisticRegression()    
model_glove.fit(X_glove_stem_train, y_train)  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [21]:
# Evaluate accuracy for each model
y_pred_word2vec = model_word2vec.predict(X_test)
accuracy_word2vec = accuracy_score(y_test, y_pred_word2vec)
print("Accuracy for Word2Vec model:", accuracy_word2vec)

y_pred_fasttext = model_fasttext.predict(X_fasttext_stem_test)
accuracy_fasttext = accuracy_score(y_test, y_pred_fasttext)
print("Accuracy for fastText model:", accuracy_fasttext)

y_pred_glove = model_glove.predict(X_glove_stem_test)
accuracy_glove = accuracy_score(y_test, y_pred_glove)
print("Accuracy for GloVe model:", accuracy_glove)

Accuracy for Word2Vec model: 0.28600759675597986
Accuracy for fastText model: 0.2584950210450672
Accuracy for GloVe model: 0.2651678472436095


In [22]:
# Train a baseline model using DummyClassifier
baseline_model = DummyClassifier(strategy="most_frequent")
baseline_model.fit(X_train, y_train)

# Evaluate accuracy for the baseline model
baseline_accuracy = baseline_model.score(X_test, y_test)
print("Baseline accuracy:", baseline_accuracy)

Baseline accuracy: 0.20141669233138282


In [23]:
# Calculate the differences in accuracies for each model
word2vec_diff = accuracy_word2vec - baseline_accuracy
fasttext_diff = accuracy_fasttext - baseline_accuracy
glove_diff = accuracy_glove - baseline_accuracy

In [24]:
# Check if each model outperforms the baseline
if word2vec_diff > 0:
    print("Word2Vec model outperforms the baseline by:", word2vec_diff)
else:
    print("Word2Vec model does not outperform the baseline.")

if fasttext_diff > 0:
    print("fastText model outperforms the baseline by:", fasttext_diff)
else:
    print("fastText model does not outperform the baseline.")

if glove_diff > 0:
    print("GloVe model outperforms the baseline by:", glove_diff)
else:
    print("GloVe model does not outperform the baseline.")


Word2Vec model outperforms the baseline by: 0.08459090442459705
fastText model outperforms the baseline by: 0.0570783287136844
GloVe model outperforms the baseline by: 0.06375115491222666
