In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [None]:
# Fetch the NLTK resources the preprocessing step relies on.
nltk.download('punkt')      # tokenizer models (unzipped under tokenizers/) for word_tokenize
nltk.download('stopwords')  # stop-word lists read via stopwords.words('english')
nltk.download('wordnet')    # WordNet corpus used by WordNetLemmatizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
def preprocess(text):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    The text is tokenized with ``word_tokenize``; tokens that are not purely
    alphabetic (punctuation, numbers, mixed tokens) are dropped, the rest are
    lowercased, English stop words are removed, and each surviving token is
    lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The input string to clean.

    Returns:
        A list of lowercase, lemmatized tokens in their original order.
    """
    raw_tokens = word_tokenize(text)
    english_stops = set(stopwords.words('english'))
    wordnet = WordNetLemmatizer()

    cleaned = []
    for raw in raw_tokens:
        # Keep alphabetic tokens only; the check runs on the raw token,
        # before lowercasing.
        if not raw.isalpha():
            continue
        lowered = raw.lower()
        if lowered in english_stops:
            continue
        cleaned.append(wordnet.lemmatize(lowered))
    return cleaned

In [None]:
# Toy corpus of short data-structure questions. Every later step
# (preprocessing, TF-IDF, Word2Vec training, classification) runs on this list.
questions = [
    "What is a binary tree?",
    "How does a stack differ from a queue?",
    "Explain the concept of a hash table.",
    "Define a heap and explain its usage.",
]

In [None]:
# One integer class label per entry in `questions`, in the same order; used as
# the target `y` for the classifier.
# NOTE(review): the meaning of the label values is not defined anywhere in this
# notebook — confirm with the author.
labels = [1, 4, 2, 1]

In [None]:
# Clean every question and glue its tokens back into one space-separated
# string, which is the document format passed to the TF-IDF vectorizer.
processed_questions = [" ".join(tokens) for tokens in map(preprocess, questions)]
print(processed_questions)

['binary tree', 'stack differ queue', 'explain concept hash table', 'define heap explain usage']


Modified TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Learn the vocabulary and IDF weights from the cleaned questions, then encode
# those same questions as TF-IDF rows (one row per question).
vectorizer = TfidfVectorizer()
vectorizer.fit(processed_questions)
tfidf_matrix = vectorizer.transform(processed_questions)
print(tfidf_matrix.toarray())

[[0.70710678 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.70710678 0.        ]
 [0.         0.         0.         0.57735027 0.         0.
  0.         0.57735027 0.57735027 0.         0.         0.        ]
 [0.         0.52547275 0.         0.         0.41428875 0.52547275
  0.         0.         0.         0.52547275 0.         0.        ]
 [0.         0.         0.52547275 0.         0.41428875 0.
  0.52547275 0.         0.         0.         0.         0.52547275]]


Word2Vec Embeddings

In [None]:
from gensim.models import Word2Vec

In [None]:
# Train a small Word2Vec model on the tokenized questions.
# Word2Vec expects a list of token lists, so the un-joined output of
# preprocess() is used here rather than `processed_questions`.
sentences = [preprocess(question) for question in questions]
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,  # embedding dimensionality
    window=5,
    min_count=1,      # the corpus is tiny, so keep every word
    # Reproducibility: a fixed seed plus a single worker thread removes the
    # run-to-run jitter that multi-threaded training introduces. (Across
    # interpreter launches, PYTHONHASHSEED must also be fixed.)
    workers=1,
    seed=42,
)


In [None]:
# Sanity check: look up and print the learned vector for one vocabulary word.
word = 'binary'
embedding = word2vec_model.wv[word]
print(embedding)

[ 9.7702928e-03  8.1651136e-03  1.2809718e-03  5.0975787e-03
  1.4081288e-03 -6.4551616e-03 -1.4280510e-03  6.4491653e-03
 -4.6173059e-03 -3.9930656e-03  4.9244044e-03  2.7130984e-03
 -1.8479753e-03 -2.8769434e-03  6.0107317e-03 -5.7167388e-03
 -3.2367026e-03 -6.4878250e-03 -4.2346325e-03 -8.5809948e-03
 -4.4697891e-03 -8.5112294e-03  1.4037776e-03 -8.6181965e-03
 -9.9166557e-03 -8.2016252e-03 -6.7726658e-03  6.6805850e-03
  3.7845564e-03  3.5616636e-04 -2.9579818e-03 -7.4283206e-03
  5.3341867e-04  4.9989222e-04  1.9561886e-04  8.5259555e-04
  7.8633073e-04 -6.8160298e-05 -8.0070542e-03 -5.8702733e-03
 -8.3829118e-03 -1.3120425e-03  1.8206370e-03  7.4171280e-03
 -1.9634271e-03 -2.3252917e-03  9.4871549e-03  7.9704521e-05
 -2.4045217e-03  8.6048469e-03  2.6870037e-03 -5.3439722e-03
  6.5881060e-03  4.5101536e-03 -7.0544672e-03 -3.2317400e-04
  8.3448651e-04  5.7473574e-03 -1.7176545e-03 -2.8065301e-03
  1.7484308e-03  8.4717153e-04  1.1928272e-03 -2.6342822e-03
 -5.9857843e-03  7.32298

Combining Features

In [None]:
import numpy as np

In [None]:
def get_combined_features(question):
    """Build one feature vector for ``question``.

    The vector is the question's TF-IDF row (from the already-fitted global
    ``vectorizer``) followed by the mean of the Word2Vec embeddings (from the
    global ``word2vec_model``) of its in-vocabulary tokens.

    Args:
        question: Raw question text; it is cleaned with ``preprocess`` first.

    Returns:
        A 1-D NumPy array of length ``n_tfidf_terms + embedding_size``. If no
        token of the question is in the Word2Vec vocabulary, the embedding
        part is all zeros.
    """
    tokens = preprocess(question)
    tfidf_vector = vectorizer.transform([" ".join(tokens)]).toarray().flatten()

    keyed_vectors = word2vec_model.wv
    known_vectors = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if known_vectors:
        word2vec_vector = np.mean(known_vectors, axis=0)
    else:
        # Take the size from the model instead of hard-coding 100, so the
        # fallback always matches the real embedding dimensionality.
        word2vec_vector = np.zeros(keyed_vectors.vector_size)

    return np.concatenate((tfidf_vector, word2vec_vector))

In [None]:
# Build one combined (TF-IDF + mean Word2Vec) feature vector per question.
features = [get_combined_features(question) for question in questions]
# Show only the overall shape (n_questions, n_features); printing every raw
# vector floods the cell output without telling the reader anything.
print(np.shape(features))

[array([ 7.07106781e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  7.07106781e-01,  0.00000000e+00,
        8.42954498e-03,  3.29859182e-03,  4.61423537e-03, -2.19554012e-03,
       -3.31068505e-03, -6.54776627e-03, -2.71575269e-03,  5.71919046e-03,
       -4.21543233e-03, -6.15648553e-03,  6.66809082e-03, -5.16951783e-04,
        3.38036031e-03, -3.88634740e-03,  4.96466272e-03, -3.97360884e-04,
       -4.22046753e-04, -4.65331413e-03, -6.92753936e-04, -8.41861591e-03,
       -3.61766433e-03, -5.55119384e-03,  4.32639197e-03, -6.04079990e-03,
       -8.25817883e-03, -1.93059910e-03, -3.62357544e-03,  1.54151430e-03,
        5.33351395e-03,  2.11423938e-03, -3.42909154e-03, -3.32821603e-03,
        4.83846059e-03,  4.12727427e-03,  3.27874534e-03,  2.75994907e-03,
        1.58541033e-03, -9.54893185e-04, -7.18917372e-03, -3.08604189e-03,
       -4.97415010e-03, 

Classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Hold out 20% of the questions for evaluation, with a fixed seed so the split
# is repeatable. With only four questions this leaves a single held-out sample,
# so the scores reported further down say very little about model quality.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit a random forest on the training split. The seed is fixed (same value as
# the train/test split) so the trained model is reproducible across runs.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)


In [None]:
# Predict the held-out questions and print per-class precision/recall/F1.
y_pred = clf.predict(X_test)
# zero_division=0: classes with no predicted or no true samples score 0.0
# without emitting a stream of undefined-metric warnings.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       0.0
           4       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
