In [59]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
# Let text columns show up to 100 characters before pandas truncates them with "..."
pd.set_option('display.max_colwidth', 100)

# Data Preprocessing

In [61]:
from nltk.stem.porter import PorterStemmer

# Shared Porter stemmer instance.
# NOTE(review): preprocess() below never calls it — confirm whether stemming
# is applied elsewhere in the notebook or was meant to be part of preprocess().
stemmer = PorterStemmer()

def preprocess(document):
    """Lowercase a document and remove English stopwords.

    The input is converted with ``str()``, lowercased, split into tokens with
    ``word_tokenize`` and filtered against ``stopwords.words('english')``.
    The remaining tokens are joined back together with single spaces.

    Parameters
    ----------
    document : object
        Text to clean. Non-string values are converted with ``str()`` first.

    Returns
    -------
    str
        The kept tokens separated by single spaces (empty string when every
        token is a stopword).
    """
    text = str(document).lower()
    tokens = word_tokenize(text)

    # Fetch the stopword list once per call and keep it in a set. Looking it
    # up inside the comprehension condition would reload the whole list for
    # every token and then scan it linearly.
    english_stopwords = set(stopwords.words('english'))

    kept_tokens = [token for token in tokens if token not in english_stopwords]
    return " ".join(kept_tokens)

## Creating a TF-IDF model for our dataset

In [62]:
# Load the question dataset from dataset.csv and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index saved into the CSV — consider read_csv(..., index_col=0) if no
# later cell depends on that column.
dataset = pd.read_csv('dataset.csv')
dataset.head()

Unnamed: 0,questions,year,topic
0,The ratio of mass percent of C and H of anorganic compound (CXHYOZ) is 6 : 1 Ifone molecule of t...,2018.0,alkane
1,Which type of ‘defect’ has the presence ofcations in the interstitial sites,2018.0,solid state
2,According to molecular orbital theorywhich of the following will not be a viable molecule,2018.0,chemical_bonding
3,Which of the following lines correctly showthe temperature dependence ofequilibrium constant K ...,2018.0,chemical_equilibrium
4,The combustion of benzene (l) gives CO2(g)and H2O(l) Given that heat of combustionof benzene at ...,2018.0,thermodynamics


In [63]:
# Drop rows with missing values, store `year` as an integer, and extract the
# question text from the dataframe.
# The dtype is given by name ('int64') because numpy is not imported in this
# notebook; a single dropna().astype(...) chain also avoids assigning a column
# onto the intermediate dropna() result.
dataset = dataset.dropna().astype({'year': 'int64'})

questions = dataset['questions'].tolist()
dataset.head()

Unnamed: 0,questions,year,topic
0,The ratio of mass percent of C and H of anorganic compound (CXHYOZ) is 6 : 1 Ifone molecule of t...,2018,alkane
1,Which type of ‘defect’ has the presence ofcations in the interstitial sites,2018,solid state
2,According to molecular orbital theorywhich of the following will not be a viable molecule,2018,chemical_bonding
3,Which of the following lines correctly showthe temperature dependence ofequilibrium constant K ...,2018,chemical_equilibrium
4,The combustion of benzene (l) gives CO2(g)and H2O(l) Given that heat of combustionof benzene at ...,2018,thermodynamics


In [64]:
# Show the raw question strings extracted from the dataframe
print(questions)

['The ratio of mass percent of C and H of anorganic compound (CXHYOZ) is 6 : 1 Ifone molecule of the above compound(CXHYOZ) contains half as much oxygenas required to burn one molecule ofcompound CXHY completely to CO2 andH2O The empirical formula of compoundCXHYOZ is', 'Which type of ‘defect’ has the presence ofcations in the interstitial sites ', 'According to molecular orbital theorywhich of the following will not be a viable molecule ', 'Which of the following lines correctly showthe temperature dependence ofequilibrium constant  K  for an exothermicreaction ', 'The combustion of benzene (l) gives CO2(g)and H2O(l) Given that heat of combustionof benzene at constant volume is−32639 kJ mol−1 at 258 C; heat ofcombustion (in kJ mol−1) of benzene atconstant pressure will be :(R=8314 JK−1 mol−1)', 'For 1 molal aqueous solution of thefollowing compounds which one willshow the highest freezing point ', 'An aqueous solution contains 010 M H2Sand 020 M HCl If the equilibriumconstants for the

In [65]:
# Run every question through preprocess() and show the cleaned strings
questions = list(map(preprocess, questions))
print(questions)

['ratio mass percent c h anorganic compound ( cxhyoz ) 6 : 1 ifone molecule compound ( cxhyoz ) contains half much oxygenas required burn one molecule ofcompound cxhy completely co2 andh2o empirical formula compoundcxhyoz', 'type ‘ defect ’ presence ofcations interstitial sites', 'according molecular orbital theorywhich following viable molecule', 'following lines correctly showthe temperature dependence ofequilibrium constant k exothermicreaction', 'combustion benzene ( l ) gives co2 ( g ) h2o ( l ) given heat combustionof benzene constant volume is−32639 kj mol−1 258 c ; heat ofcombustion ( kj mol−1 ) benzene atconstant pressure : ( r=8314 jk−1 mol−1 )', '1 molal aqueous solution thefollowing compounds one willshow highest freezing point', 'aqueous solution contains 010 h2sand 020 hcl equilibriumconstants formation hs− h2s 10×10−7 s2− fromhs− ions 12×10−13 theconcentration s2− ions aqueoussolution', 'aqueous solution contains unknownconcentration ba2+ 50 ml a1 solution na2so4 added b

In [66]:
# TF-IDF model: fit the vectorizer on the preprocessed questions and transform
# them into a document-term matrix in one step
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit_transform(questions)

In [67]:
# Dense table of the TF-IDF matrix with one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back to the old accessor only on releases
# that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

tfidf = pd.DataFrame(tfidf_model.toarray(), columns=feature_names)
print(tfidf)

         010       020        10       108        12        13       258  \
0   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.156491   
5   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
6   0.182775  0.182775  0.488367  0.000000  0.182775  0.182775  0.000000   
7   0.000000  0.000000  0.331027  0.000000  0.000000  0.000000  0.000000   
8   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
9   0.000000  0.000000  0.000000  0.254991  0.000000  0.000000  0.000000   
10  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
11  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
12  0.000000

In [68]:
# List every vocabulary term (one per line) that became a TF-IDF column
for feature_name in tfidf.columns:
    print(feature_name)

010
020
10
108
12
13
258
2766
2o
32639
33
363
3br3
3ca3
3i
4br2
50
500
5188
60
8314
a1
according
acetaldehyde
acid
acidic
acids
added
adsorbent
alkali
alkenes
alkynes
amperes
and05
andh2o
andk2
andstatements
anorganic
approximate
aqueous
aqueoussolution
asample
atconstant
atomic
awhite
b2h6
ba
ba2
basicity
baso4is
baso4just
beelectrolysed
begins
benzene
blood
bond
br
br2
burn
butreduces
byacidification
c6h6
ca
catalyticamount
ch3co
chloroformatein
chromatographyas
cis
cl3
cn
co
co2
combination
combustion
combustionof
completely
complex
compound
compoundcxhyoz
compounds
compoundscontain
concentration
consider
constant
contains
converting
correct
correctly
covalent
cr
current
cxhy
cxhyoz
decomposition
defect
dependence
drinking
electrons
empirical
enamel
equilibriumconstants
estimation
excess
exothermicreaction
fe
final
fluoride
followed
following
form
formation
formed
formula
freezing
fromhs
gaseous
gelatinous
given
gives
givesan
glucose
h2o
h2s
h2sand
h2so4
half
harder
hcl
heat
heated
