In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Smart Study Assistant Project/Model for Gate questions classification/questions-data-new (1).csv')

In [3]:
df.sample(5)

Unnamed: 0,topic,question
1786,Computer Organization and Architecture,Consider a small two-way set-associative cache...
815,Mathematics,"In a room containing 28 people, there are 18 p..."
729,Mathematics,"Company X shipped 5 computer chips, 1 of which..."
581,Operating System,A memory page containing a heavily used variab...
656,Mathematics,Let A be a set with n elements. Let C be a col...


In [4]:
df['topic'].value_counts()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
Computer Networks,300
Operating System,300
Mathematics,300
General Aptitude,300
Programming and Data Structure,300
Computer Organization and Architecture,300
Digital Logic,300
Theory of Computation,300


In [5]:
df.isnull().sum()

Unnamed: 0,0
topic,0
question,0


In [6]:
df.duplicated().sum()

90

In [7]:
df = df.drop_duplicates()
df.duplicated().sum()

0

In [8]:
df['topic'].value_counts()

Unnamed: 0_level_0,count
topic,Unnamed: 1_level_1
Mathematics,300
Theory of Computation,296
Programming and Data Structure,293
Computer Networks,292
General Aptitude,292
Digital Logic,292
Operating System,287
Computer Organization and Architecture,258


In [9]:
df.shape

(2310, 2)

In [10]:
# Lowercasing
df['question'] = df['question'].str.lower()

In [11]:
# Remove punctuations
import string
def remove_punctuations(text):
  for punc in string.punctuation:
    text = text.replace(punc, '')
  return text
df['question'] = df['question'].apply(remove_punctuations)


In [12]:
df.head(5)

Unnamed: 0,topic,question
0,Computer Networks,in the following pairs of osi protocol layersu...
1,Computer Networks,an ip machine q has a path to another ip machi...
2,Computer Networks,to send same bit sequence nrz encoding require
3,Computer Networks,if there are n devices nodes in a network what...
4,Computer Networks,in networking terminology utp means


In [13]:
# Removing stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
def remove_stopwords(text):
  text = [word for word in text.split() if word not in stop_words]
  return " ".join(text)
df['question'] = df['question'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
df.head(5)

Unnamed: 0,topic,question
0,Computer Networks,following pairs osi protocol layersublayer fun...
1,Computer Networks,ip machine q path another ip machine h via thr...
2,Computer Networks,send bit sequence nrz encoding require
3,Computer Networks,n devices nodes network number cable links req...
4,Computer Networks,networking terminology utp means


In [15]:
# Tokenize
import nltk
nltk.download('punkt_tab')
df['question'] = df['question'].apply(nltk.word_tokenize)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:
df.sample(5)

Unnamed: 0,topic,question
1839,Digital Logic,"[following, bit, pattern, represents, floating..."
2229,Theory of Computation,"[consider, following, languages, l1, wan, w, ε..."
1326,Programming and Data Structure,"[array, contains, items, 10, 4, 7, 23, 67, 12,..."
1867,Digital Logic,"[many, 2input, multiplexers, required, constru..."
472,Operating System,"[student, wishes, create, symbolic, links, com..."


In [17]:
# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

df['question'] = df['question'].apply(lemmatize_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
df.sample(10)

Unnamed: 0,topic,question
511,Operating System,"[computer, six, tape, drive, n, process, compe..."
2350,Theory of Computation,"[given, turing, machine, input, w, describe, w..."
890,Mathematics,"[define, conditional, probability, independenc..."
826,Mathematics,"[graph, g, obtained, adding, vertex, k34​, mak..."
357,Operating System,"[timeslice, used, roundrobin, scheduling, poli..."
706,Mathematics,"[nine, word, sentence, quick, brown, fox, jump..."
1255,Programming and Data Structure,"[let, depth, first, search, tree, undirected, ..."
705,Mathematics,"[probability, given, positive, integer, lying,..."
2341,Theory of Computation,"[design, contextfree, grammar, language, l, aⁿ..."
1357,Programming and Data Structure,"[best, data, structure, check, whether, arithm..."


In [19]:
!pip install gensim



In [20]:
# Word2Vec
from gensim.models import Word2Vec
model = Word2Vec(df['question'], vector_size=100, window=10, min_count=1)

In [21]:
# corpus build
corpus = []
for question in df['question']:
  corpus.append(question)

In [22]:
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)



(231151, 255510)

In [23]:
len(model.wv.index_to_key)

6286

In [24]:
# returns the average word2vec embeddings
def document_vector(doc_tokens):
    # remove out-of-vocabulary words
    words = [word for word in doc_tokens if word in model.wv.index_to_key]
    if len(words) == 0:
        return np.zeros(model.vector_size)  # fallback vector if words.size() is 0
    return np.mean(model.wv[words], axis=0)

In [25]:
from tqdm import tqdm

In [26]:
X = []
for doc in tqdm(df['question'].values):
    X.append(document_vector(doc))

100%|██████████| 2310/2310 [00:03<00:00, 613.61it/s]


In [27]:
X = np.array(X)
X[0]

array([ 0.28805748,  0.88140655, -0.07048798,  0.1333673 ,  0.40599287,
       -1.7147079 , -0.3516044 ,  2.551299  ,  0.23915842, -0.7246466 ,
       -0.27111647, -0.34167242, -0.33806002,  0.39349166, -0.23033533,
       -0.925583  ,  0.38638446, -1.0972763 ,  0.31195843, -1.7844468 ,
        0.81019485,  0.30500013, -0.24962315, -0.26543775, -0.23824064,
        0.09286961, -0.92464113,  0.45189124, -0.7159429 , -0.5711334 ,
        0.68594587,  0.79070157, -1.0935833 , -0.6469966 , -0.68122613,
        0.47852993, -0.5835271 , -0.97771275,  0.37333968, -1.4173791 ,
        0.02169257, -0.6513959 , -0.03943862, -0.12646027,  0.5289118 ,
        0.26026925, -0.617797  ,  0.1108859 ,  0.6970906 ,  0.24836428,
        0.5581102 , -0.18350424,  1.0993463 , -0.38389957, -0.801225  ,
        0.6276052 ,  0.55112165,  0.41515887, -0.7992735 ,  0.39573032,
        0.2083322 , -0.13951665, -0.05631424,  0.6145443 , -1.4606955 ,
        0.7246616 ,  0.16462441, -0.25144738, -1.640295  ,  0.60

In [28]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

y = encoder.fit_transform(df['topic'])

In [29]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)


In [30]:
# Applying Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred = rf.predict(X_test)
accuracy_score(y_test,y_pred)

0.6060606060606061

In [31]:
# Applying Logistic Regression:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(multi_class='multinomial')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5606060606060606

In [32]:
# Applying SVM:
from sklearn.svm import SVC
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.564935064935065