In [1]:
import pandas as pd
import nltk
import bs4 as bs
from nltk.tokenize import sent_tokenize # tokenizes sentences
import re
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

## In this notebook I am doing a clustering of Science questions using KMeans algorithm with K=5. Then we can vizualize the result with plotly to see what are the most important features (words here) for each cluster and name each cluster.

In [3]:
data = pd.read_csv('science_questions.csv')

In [4]:
data.shape

(2707, 6)

In [5]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return 'n'

In [6]:
n, c = data.shape

In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
bag_question = []
for i in range(n):
    question = bs.BeautifulSoup(data['question'][i],features='lxml').text
    question = question.replace('.','. ')
    question = re.sub(r'\{.*\}', '', question)
    question = re.sub('[^a-zA-Z ]' ,'',question)
    question = question.lower()
    question_words = question.split()
    question_wo_stopwords = [w for w in question_words if not w in stopwords.words("english")]
    vect = CountVectorizer(max_features=5)
    if not question_wo_stopwords == []:
        vect.fit(question_wo_stopwords)
        bag = vect.transform(question_wo_stopwords)
        features = vect.get_feature_names()
        bag_question.append(features)
    if question_wo_stopwords == []:
        data = data.drop(i)

In [8]:
data = data.reset_index(drop=True)

In [9]:
n, c = data.shape

In [10]:
data['bag_question'] = bag_question

In [11]:
data.head()

Unnamed: 0,questionID,AnswerKey,examName,schoolGrade,question,subject,bag_question
0,AKDE&ED_2008_4_4,C,Alaska Department of Education & Early Develop...,4,Which organism needs to make its own food? (A)...,Biology,"[food, make, needs, organism]"
1,AKDE&ED_2008_4_31,D,Alaska Department of Education & Early Develop...,4,Students planted one hundred flower seeds. The...,Biology,"[could, flower, growth, hundred, observations]"
2,AKDE&ED_2012_4_9,C,Alaska Department of Education & Early Develop...,4,Study the data table below. {Alaska-2012-4-10}...,Biology,"[bands, rubber, set, students, test]"
3,AKDE&ED_2012_4_24,B,Alaska Department of Education & Early Develop...,4,The chart shows observations of the Moon. {Ala...,Biology,"[chart, moon, observations, shows]"
4,CSZ_2009_5_18,A,California Standards Test,5,A balloon has a negative charge. A glass rod h...,Biology,"[attracted, balloon, charge, glass, rod]"


## Lemmatization

In [12]:
question_clean_wnl = []
wnl = WordNetLemmatizer()
for i in range(0,n):
    if( (i+1)%500 == 0 ):
        # print progress
        print("Done with %d reviews" %(i+1)) 
    
    wnl_stems = []
    question = data['bag_question'][i]
    token_tag = pos_tag(question)
    for pair in token_tag:
        res = wnl.lemmatize(pair[0],pos=get_wordnet_pos(pair[1]))
        wnl_stems.append(res)

    question_clean_wnl.append(' '.join(wnl_stems))

Done with 500 reviews
Done with 1000 reviews
Done with 1500 reviews
Done with 2000 reviews
Done with 2500 reviews


In [13]:
data['cleaned_question'] = question_clean_wnl

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
question = data['cleaned_question']
vect = CountVectorizer()
vect.fit(question)
bag = vect.transform(question)
vect_df = pd.DataFrame(bag.toarray(), columns=vect.get_feature_names())
vect_df.head()

Unnamed: 0,aa,abandon,ability,able,abnormal,abnormality,absorb,absorbed,absorbs,absorption,...,wrinkle,yard,year,yearly,yearold,yes,young,zinc,zone,zygote
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
data.head()

Unnamed: 0,questionID,AnswerKey,examName,schoolGrade,question,subject,bag_question,cleaned_question
0,AKDE&ED_2008_4_4,C,Alaska Department of Education & Early Develop...,4,Which organism needs to make its own food? (A)...,Biology,"[food, make, needs, organism]",food make need organism
1,AKDE&ED_2008_4_31,D,Alaska Department of Education & Early Develop...,4,Students planted one hundred flower seeds. The...,Biology,"[could, flower, growth, hundred, observations]",could flower growth hundred observation
2,AKDE&ED_2012_4_9,C,Alaska Department of Education & Early Develop...,4,Study the data table below. {Alaska-2012-4-10}...,Biology,"[bands, rubber, set, students, test]",band rubber set student test
3,AKDE&ED_2012_4_24,B,Alaska Department of Education & Early Develop...,4,The chart shows observations of the Moon. {Ala...,Biology,"[chart, moon, observations, shows]",chart moon observation show
4,CSZ_2009_5_18,A,California Standards Test,5,A balloon has a negative charge. A glass rod h...,Biology,"[attracted, balloon, charge, glass, rod]",attract balloon charge glass rod


## KMean algorithm 

In [16]:
from sklearn.cluster import KMeans
q, w = vect_df.shape
vect_question = bag.toarray()
k = 5 # number of clusters
kmeans = KMeans(n_clusters=k, init='random')
kmeans.fit(vect_question)
#
# Get a list of the questions in each cluster
#
clusters = []
for j in range(0,k):
    clusters.append([])
for i in range(0,n):
    clusters[kmeans.labels_[i]].append(i)
#
# Print out clusters
#
for j in range(0,k):
    print(j+1,'\n', clusters[j], '\n\n')

1 
 [0, 1, 2, 4, 6, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 60, 61, 62, 66, 69, 70, 73, 74, 75, 76, 77, 78, 79, 80, 81, 83, 84, 86, 88, 89, 90, 91, 92, 94, 95, 96, 98, 101, 102, 103, 105, 107, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 146, 147, 148, 149, 150, 151, 152, 153, 154, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 208, 209, 211, 212, 213, 215, 216, 217, 218, 220, 222, 224, 226, 229, 230, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 247, 248, 249, 250, 251, 252, 254, 255, 256, 259, 260, 261, 262, 263, 264

In [17]:
for i in range (len(clusters)):
    print(len(clusters[i]))

2248
191
42
92
126


In [18]:
corresponding_cluster = []
for i in range(n):
    corresponding_cluster.append(kmeans.labels_[i])
data['cluster'] = corresponding_cluster

In [19]:
data.head()

Unnamed: 0,questionID,AnswerKey,examName,schoolGrade,question,subject,bag_question,cleaned_question,cluster
0,AKDE&ED_2008_4_4,C,Alaska Department of Education & Early Develop...,4,Which organism needs to make its own food? (A)...,Biology,"[food, make, needs, organism]",food make need organism,0
1,AKDE&ED_2008_4_31,D,Alaska Department of Education & Early Develop...,4,Students planted one hundred flower seeds. The...,Biology,"[could, flower, growth, hundred, observations]",could flower growth hundred observation,0
2,AKDE&ED_2012_4_9,C,Alaska Department of Education & Early Develop...,4,Study the data table below. {Alaska-2012-4-10}...,Biology,"[bands, rubber, set, students, test]",band rubber set student test,0
3,AKDE&ED_2012_4_24,B,Alaska Department of Education & Early Develop...,4,The chart shows observations of the Moon. {Ala...,Biology,"[chart, moon, observations, shows]",chart moon observation show,3
4,CSZ_2009_5_18,A,California Standards Test,5,A balloon has a negative charge. A glass rod h...,Biology,"[attracted, balloon, charge, glass, rod]",attract balloon charge glass rod,0


In [20]:
data['cleaned_question']

0                               food make need organism
1               could flower growth hundred observation
2                          band rubber set student test
3                           chart moon observation show
4                      attract balloon charge glass rod
5                   air atmosphere beach bottom diagram
6             bird characteristic different eat feather
7                 collect description follow rock roger
8                       ago animal evidence form fossil
9                           actual image size toy truck
10            animal belongs classifies different group
11                      container lid place table water
12                        fan image mirror table window
13                  battery bulb circuit light terminal
14                 affected drought least likely little
15                          iron magnetic rock use wand
16                      human make resource student use
17             activity air information quality 

In [21]:
data.groupby('cluster').count()

Unnamed: 0_level_0,questionID,AnswerKey,examName,schoolGrade,question,subject,bag_question,cleaned_question
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2248,2248,2248,2248,2248,2248,2248,2248
1,191,191,191,191,191,191,191,191
2,42,42,42,42,42,42,42,42
3,92,92,92,92,92,92,92,92
4,126,126,126,126,126,126,126,126


In [22]:
for i in range(30):
    print(data['cluster'][i], '\n')
    print(data['question'][i], '\n\n')    

0 

Which organism needs to make its own food? (A) {Alaska-2008-4-5} (B) {Alaska-2008-4-6} (C) {Alaska-2008-4-7} (D) {Alaska-2008-4-8} 


0 

Students planted one hundred flower seeds. They observed the growth of their plants once a week. Which tool could be used to record their observations? (A) {Alaska-2008-4-26} (B) {Alaska-2008-4-27} (C) {Alaska-2008-4-28} (D) {Alaska-2008-4-29} 


0 

Study the data table below. {Alaska-2012-4-10} Two students want to know how temperature affects rubber bands. They decide to test rubber bands inside and outside their school during the winter. The students plan to stretch each rubber band until it breaks. Which set of materials should be used for this test? (A) set 1 (B) set 2 (C) set 3 (D) set 4 


3 

The chart shows observations of the Moon. {Alaska-2012-4-22} Which drawing shows how the Moon would most likely appear on day 13? (A) {Alaska-2012-4-23} (B) {Alaska-2012-4-24} (C) {Alaska-2012-4-25} (D) {Alaska-2012-4-26} 


0 

A balloon has a nega

## Gathering words for each cluster

In [23]:
cluster = [[],[],[],[],[]]

for i in range(n):
    a = data['cluster'][i]
    for word in data['bag_question'][i]:
        cluster[a].append(word) 

In [24]:
cluster_clean_wnl = [[],[],[],[],[]]
wnl = WordNetLemmatizer()
for i in range(0,5):
    wnl_stems = []
    words = cluster[i]
    token_tag = pos_tag(words)
    for pair in token_tag:
        res = wnl.lemmatize(pair[0],pos=get_wordnet_pos(pair[1]))
        wnl_stems.append(res)
    cluster_clean_wnl[i].append(' '.join(wnl_stems))

In [25]:
clust = pd.DataFrame(cluster_clean_wnl)
clust = clust.rename({0 : 'cl'}, axis=1)

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

cl = clust['cl']
vect = CountVectorizer()
vect.fit(cl)
bag = vect.transform(cl)
vect_df = pd.DataFrame(bag.toarray(), columns=vect.get_feature_names())
vect_df.head()

Unnamed: 0,aa,abandon,ability,able,abnormal,abnormality,absorb,absorbed,absorbs,absorption,...,wrinkle,yard,year,yearly,yearold,yes,young,zinc,zone,zygote
0,0,1,12,17,6,1,26,4,10,7,...,1,1,23,1,2,1,5,2,1,1
1,1,0,0,0,0,0,1,0,0,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,2,0,0,0,...,0,0,2,0,0,0,0,0,0,0
4,0,0,0,2,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
vect_df = vect_df.transpose()

In [28]:
vect_df = vect_df.rename(columns={0: "cluster0", 1: "cluster1", 2: "cluster2", 3: "cluster3", 4: "cluster4"})

In [29]:
vect_df

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
aa,0,1,0,1,0
abandon,1,0,0,0,0
ability,12,0,0,0,0
able,17,0,0,1,2
abnormal,6,0,0,0,0
abnormality,1,0,0,0,0
absorb,26,1,0,2,2
absorbed,4,0,0,0,0
absorbs,10,0,0,0,0
absorption,7,2,0,0,0


In [30]:
vect_df.nlargest(5, 'cluster0')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
water,173,14,1,4,2
student,145,1,0,2,2
air,136,6,2,2,2
plant,133,5,2,5,11
energy,120,8,6,1,3


In [31]:
vect_df.nlargest(5, 'cluster1')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
diagram,0,195,6,1,2
answer,35,28,25,3,0
show,0,25,0,95,0
earth,111,18,0,2,1
water,173,14,1,4,2


In [32]:
vect_df.nlargest(5, 'cluster2')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
food,70,4,29,2,9
chain,4,1,26,0,2
answer,35,28,25,3,0
base,23,11,25,1,3
diagram,0,195,6,1,2


In [33]:
vect_df.nlargest(5, 'cluster3')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
show,0,25,0,95,0
picture,17,0,0,12,1
animal,0,0,0,6,129
table,50,1,1,6,4
object,38,2,0,5,0


In [34]:
vect_df.nlargest(5, 'cluster4')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
animal,0,0,0,6,129
cell,85,6,0,1,11
plant,133,5,2,5,11
food,70,4,29,2,9
area,38,6,0,2,5


In [35]:
nb0 = [171,136,133,128,120]
word0 = ['water', 'plant', 'student', 'air', 'animal']
nb1 = [136,23,21,20,16]
word1 = ['energy', 'food', 'chemical', 'electrical', 'form']
nb2 = [120,26,12,8,6]
word2 = ['show', 'diagram', 'picture', 'water', 'animal']
nb3 = [159,18,17,12,12]
word3 = ['diagram', 'earth', 'answer', 'cycle', 'represent']
nb4 = [66, 59, 14, 13, 13]
word4 = ['answer', 'base', 'food', 'chain', 'student']

In [36]:
clust0_freq = [nb0[i]/len(nb0) for i in range(len(nb0))]
clust1_freq = [nb1[i]/len(nb1) for i in range(len(nb1))]
clust2_freq = [nb2[i]/len(nb2) for i in range(len(nb2))]
clust3_freq = [nb3[i]/len(nb3) for i in range(len(nb3))]
clust4_freq = [nb4[i]/len(nb4) for i in range(len(nb4))]
clust0_freq

[34.2, 27.2, 26.6, 25.6, 24.0]

In [37]:
def transf(x):
    #transform the interval [0,1] into [20-120]
    return x*2 + 15

In [38]:
print(clust0_freq,clust1_freq,clust2_freq,clust3_freq,clust4_freq)

[34.2, 27.2, 26.6, 25.6, 24.0] [27.2, 4.6, 4.2, 4.0, 3.2] [24.0, 5.2, 2.4, 1.6, 1.2] [31.8, 3.6, 3.4, 2.4, 2.4] [13.2, 11.8, 2.8, 2.6, 2.6]


In [39]:
clust0_freq_norm = []
clust1_freq_norm = []
clust2_freq_norm = []
clust3_freq_norm = []
clust4_freq_norm = []
for element in clust0_freq:
    clust0_freq_norm.append(transf(element))
for element in clust1_freq:
    clust1_freq_norm.append(transf(element))
for element in clust2_freq:
    clust2_freq_norm.append(transf(element))
for element in clust3_freq:
    clust3_freq_norm.append(transf(element))
for element in clust4_freq:
    clust4_freq_norm.append(transf(element))
clust0_freq_norm

clust0_freq_norm = list(reversed(clust0_freq_norm))
clust1_freq_norm = list(reversed(clust1_freq_norm))
clust2_freq_norm = list(reversed(clust2_freq_norm))
clust3_freq_norm = list(reversed(clust3_freq_norm))
clust4_freq_norm = list(reversed(clust4_freq_norm))
word0 = list(reversed(word0))
word1 = list(reversed(word1))
word2 = list(reversed(word2))
word3 = list(reversed(word3))
word4 = list(reversed(word4))

## Visualization

In [40]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go


trace0 = go.Scatter(
    x=[1, 1, 1, 1, 1],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=word0,
    name='Life Science',
    textposition='middle center',
    marker=dict(
        size=clust0_freq_norm,
    color=['rgb(255, 144, 14)','rgb(255, 144, 14)','rgb(255, 144, 14)', 'rgb(255, 144, 14)',
               'rgb(255, 144, 14)'],
    
    )
       
)

        
trace1 = go.Scatter(
    x=[1.2, 1.2, 1.2, 1.2, 1.2],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=word1,
    name='Physics',
    textposition='middle center',
    marker=dict(
        size=clust1_freq_norm,
    color=['rgb(93, 164, 214)','rgb(93, 164, 214)','rgb(93, 164, 214)','rgb(93, 164, 214)','rgb(93, 164, 214)'],
           
    )
)

trace2 = go.Scatter(
    x=[1.4, 1.4, 1.4, 1.4, 1.4],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=word2,
    name='Visualization',
    textposition='middle center',
    marker=dict(
        size=clust2_freq_norm,
    color=['rgb(144, 255, 14)','rgb(144, 255, 14)','rgb(144, 255, 14)', 'rgb(144, 255, 14)',
               'rgb(144, 255, 14)'],
    
    )
       
)

trace3 = go.Scatter(
    x=[1.6, 1.6, 1.6, 1.6, 1.6],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=word3,
    name='Earth Science',
    textposition='middle center',
    marker=dict(
        size=clust3_freq_norm,
    color=['rgb(50, 100, 150)','rgb(50, 100, 150)','rgb(50, 100, 150)', 'rgb(50, 100, 150)',
               'rgb(50, 100, 150)'],
    
    )
       
)

trace4 = go.Scatter(
    x=[1.8, 1.8, 1.8, 1.8, 1.8],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=word4,
    textposition='middle center',
    name='Biased',
    marker=dict(
        size=clust4_freq_norm,
    color=['rgb(200, 150, 0)','rgb(200, 150, 0)','rgb(200, 150, 0)', 'rgb(200, 150, 0)',
               'rgb(200, 150, 0)'],
    
    )
       
)

layout = go.Layout(
    showlegend=False,
    
)


data1 = [trace0,trace1,trace2,trace3,trace4]
py.iplot(data1, filename='bubblechart-size', layout=layout)

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~andyucb/0 or inside your plot.ly account where it is named 'bubblechart-size'


## Observation

It seems that clusters are biased towards words that does not belong to a particular scientific domain such as 'answer', 'show', 'diagram', 'represent'.

Let's try to run the algorythm after removing the more frequent of those words.

## 2nd Visualization without useless words

We first define what words do we want to remove from our clustering and analysis, here they are stored in the list words_not:

In [41]:
words_not = ['science', 'knowledge', 'student', 'form', 'show', 'diagram', 'answer', 'represent', 'table', 'best', 'show', 'find', 'change', 'likely', 'use', 'make', 'small', 'question', 'year', 'information', 'follow', 'student', 'table', 'best', 'different']

In [42]:
data.head()

Unnamed: 0,questionID,AnswerKey,examName,schoolGrade,question,subject,bag_question,cleaned_question,cluster
0,AKDE&ED_2008_4_4,C,Alaska Department of Education & Early Develop...,4,Which organism needs to make its own food? (A)...,Biology,"[food, make, needs, organism]",food make need organism,0
1,AKDE&ED_2008_4_31,D,Alaska Department of Education & Early Develop...,4,Students planted one hundred flower seeds. The...,Biology,"[could, flower, growth, hundred, observations]",could flower growth hundred observation,0
2,AKDE&ED_2012_4_9,C,Alaska Department of Education & Early Develop...,4,Study the data table below. {Alaska-2012-4-10}...,Biology,"[bands, rubber, set, students, test]",band rubber set student test,0
3,AKDE&ED_2012_4_24,B,Alaska Department of Education & Early Develop...,4,The chart shows observations of the Moon. {Ala...,Biology,"[chart, moon, observations, shows]",chart moon observation show,3
4,CSZ_2009_5_18,A,California Standards Test,5,A balloon has a negative charge. A glass rod h...,Biology,"[attracted, balloon, charge, glass, rod]",attract balloon charge glass rod,0


In [43]:
n, c = data.shape

In [44]:
final_words = []
for i in range(n):
    words = data['cleaned_question'][i].split()
    words_plus = [w for w in words if w not in words_not]
    final_words.append(words_plus)

In [45]:
final_cleaned = []
for i in range(n) :
    final_cleaned.append(' '.join(final_words[i]))

In [46]:
data['final_cleaned'] = final_cleaned
data.head()

Unnamed: 0,questionID,AnswerKey,examName,schoolGrade,question,subject,bag_question,cleaned_question,cluster,final_cleaned
0,AKDE&ED_2008_4_4,C,Alaska Department of Education & Early Develop...,4,Which organism needs to make its own food? (A)...,Biology,"[food, make, needs, organism]",food make need organism,0,food need organism
1,AKDE&ED_2008_4_31,D,Alaska Department of Education & Early Develop...,4,Students planted one hundred flower seeds. The...,Biology,"[could, flower, growth, hundred, observations]",could flower growth hundred observation,0,could flower growth hundred observation
2,AKDE&ED_2012_4_9,C,Alaska Department of Education & Early Develop...,4,Study the data table below. {Alaska-2012-4-10}...,Biology,"[bands, rubber, set, students, test]",band rubber set student test,0,band rubber set test
3,AKDE&ED_2012_4_24,B,Alaska Department of Education & Early Develop...,4,The chart shows observations of the Moon. {Ala...,Biology,"[chart, moon, observations, shows]",chart moon observation show,3,chart moon observation
4,CSZ_2009_5_18,A,California Standards Test,5,A balloon has a negative charge. A glass rod h...,Biology,"[attracted, balloon, charge, glass, rod]",attract balloon charge glass rod,0,attract balloon charge glass rod


In [47]:
from sklearn.feature_extraction.text import CountVectorizer
question0 = data['final_cleaned']
vect0 = CountVectorizer()
vect0.fit(question0)
bag0 = vect0.transform(question0)
vect_df0 = pd.DataFrame(bag0.toarray(), columns=vect0.get_feature_names())
vect_df0.head()

Unnamed: 0,aa,abandon,ability,able,abnormal,abnormality,absorb,absorbed,absorbs,absorption,...,wrap,wrinkle,yard,yearly,yearold,yes,young,zinc,zone,zygote
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We run KMean again we this new bag_of_words model:

In [48]:
from sklearn.cluster import KMeans
q, w = vect_df0.shape
vect_question0 = bag0.toarray()
k = 5 # number of clusters
kmeans0 = KMeans(n_clusters=k, init='random')
kmeans0.fit(vect_question0)
#
# Get a list of the questions in each cluster
#
clusters0 = []
for j in range(0,k):
    clusters0.append([])
for i in range(0,n):
    clusters0[kmeans0.labels_[i]].append(i)
#
# Print out clusters
#
for j in range(0,k):
    print(j+1,'\n', clusters0[j], '\n\n')

1 
 [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 102, 103, 104, 105, 106, 107, 108, 110, 111, 112, 113, 114, 115, 116, 117, 118, 122, 126, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 144, 145, 146, 147, 148, 149, 151, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 166, 167, 168, 169, 170, 171, 172, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 193, 194, 195, 196, 197, 198, 199, 200, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 217, 218, 219, 220, 221, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 237, 238, 239, 240, 241, 242, 243, 245, 247, 248, 249, 250, 251, 2

In [49]:
corresponding_cluster0 = []
for i in range(n):
    corresponding_cluster0.append(kmeans0.labels_[i])
data['cluster0'] = corresponding_cluster0

In [50]:
data.head()

Unnamed: 0,questionID,AnswerKey,examName,schoolGrade,question,subject,bag_question,cleaned_question,cluster,final_cleaned,cluster0
0,AKDE&ED_2008_4_4,C,Alaska Department of Education & Early Develop...,4,Which organism needs to make its own food? (A)...,Biology,"[food, make, needs, organism]",food make need organism,0,food need organism,0
1,AKDE&ED_2008_4_31,D,Alaska Department of Education & Early Develop...,4,Students planted one hundred flower seeds. The...,Biology,"[could, flower, growth, hundred, observations]",could flower growth hundred observation,0,could flower growth hundred observation,0
2,AKDE&ED_2012_4_9,C,Alaska Department of Education & Early Develop...,4,Study the data table below. {Alaska-2012-4-10}...,Biology,"[bands, rubber, set, students, test]",band rubber set student test,0,band rubber set test,0
3,AKDE&ED_2012_4_24,B,Alaska Department of Education & Early Develop...,4,The chart shows observations of the Moon. {Ala...,Biology,"[chart, moon, observations, shows]",chart moon observation show,3,chart moon observation,0
4,CSZ_2009_5_18,A,California Standards Test,5,A balloon has a negative charge. A glass rod h...,Biology,"[attracted, balloon, charge, glass, rod]",attract balloon charge glass rod,0,attract balloon charge glass rod,0


In [51]:
data.groupby('cluster0').count()

Unnamed: 0_level_0,questionID,AnswerKey,examName,schoolGrade,question,subject,bag_question,cleaned_question,cluster,final_cleaned
cluster0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2321,2321,2321,2321,2321,2321,2321,2321,2321,2321
1,128,128,128,128,128,128,128,128,128,128
2,57,57,57,57,57,57,57,57,57,57
3,139,139,139,139,139,139,139,139,139,139
4,54,54,54,54,54,54,54,54,54,54


In [52]:
cluster0 = [[],[],[],[],[]]
cluster0_fv = [[],[],[],[],[]]

for i in range(n):
    a = data['cluster0'][i]
    for word in data['final_cleaned'][i].split():
        cluster0[a].append(word) 

for i in range(len(cluster0)):
    cluster0_fv[i].append(' '.join(cluster0[i]))

In [53]:
clust0 = pd.DataFrame(cluster0_fv)
clust0 = clust0.rename({0 : 'cl'}, axis=1)
clust0

Unnamed: 0,cl
0,food need organism could flower growth hundred...
1,describes earth fall motion deposition earth e...
2,amount aquatic crayfish rusty specie amount ba...
3,air atmosphere beach bottom activity air quali...
4,bag le mass pebble volume container four mass ...


In [54]:
from sklearn.feature_extraction.text import CountVectorizer

cl = clust0['cl']
vect = CountVectorizer()
vect.fit(cl)
bag = vect.transform(cl)
vect_df_final = pd.DataFrame(bag.toarray(), columns=vect.get_feature_names())
vect_df_final.head()

Unnamed: 0,aa,abandon,ability,able,abnormal,abnormality,absorb,absorbed,absorbs,absorption,...,wrap,wrinkle,yard,yearly,yearold,yes,young,zinc,zone,zygote
0,2,1,11,19,6,1,22,6,12,9,...,1,1,1,1,2,1,5,1,1,1
1,0,0,1,0,0,0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [55]:
vect_df_final = vect_df_final.transpose()
vect_df_final = vect_df_final.rename(columns={0: "cluster0", 1: "cluster1", 2: "cluster2", 3: "cluster3", 4: "cluster4"})
vect_df_final

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
aa,2,0,0,0,0
abandon,1,0,0,0,0
ability,11,1,0,0,0
able,19,0,0,1,0
abnormal,6,0,0,0,0
abnormality,1,0,0,0,0
absorb,22,2,0,1,0
absorbed,6,0,0,0,0
absorbs,12,1,0,1,0
absorption,9,0,0,0,0


In [56]:
vect_df_final.nlargest(5, 'cluster0')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
water,167,2,9,12,4
plant,148,0,3,4,0
animal,130,1,1,3,0
energy,116,5,7,10,0
cell,102,0,1,0,0


In [57]:
n0 = [11,11,12,13,194]
nm0 = ['ocean', 'light', 'air', 'temperature', 'water']

In [58]:
vect_df_final.nlargest(5, 'cluster1')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
earth,0,134,0,3,1
sun,35,24,0,1,0
moon,24,22,0,1,0
around,18,20,0,1,0
day,23,16,1,3,0


In [59]:
n1 = [4,4,10,12,13]
nm1 = ['fertilization', 'cycle', 'descibes', 'development', 'stage']

In [60]:
vect_df_final.nlargest(5, 'cluster2')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
amount,0,0,57,0,0
water,167,2,9,12,4
energy,116,5,7,10,0
soil,33,0,6,6,0
food,99,0,4,10,1


In [61]:
n2 = [16,20,22,24,136]
nm2 = ['day', 'around', 'moon', 'sun', 'earth']

In [62]:
vect_df_final.nlargest(5, 'cluster3')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
air,0,0,3,139,6
temperature,38,1,1,12,1
water,167,2,9,12,4
energy,116,5,7,10,0
food,99,0,4,10,1


In [63]:
n3 = [55,97,120,121,132]
nm3 = ['cause', 'cell', 'air', 'animal', 'plant']

In [64]:
vect_df_final.nlargest(5, 'cluster4')

Unnamed: 0,cluster0,cluster1,cluster2,cluster3,cluster4
mass,0,0,0,0,54
gram,10,0,0,0,9
air,0,0,3,139,6
color,38,0,0,1,5
volume,8,0,0,0,5


In [65]:
n4 = [20,21,25,112,130]
nm4 = ['electrical', 'chemical', 'chain', 'food', 'energy']

In [66]:
clust0_freq0 = [n0[i]/sum(n0) for i in range(len(n0))]
clust1_freq0 = [n1[i]/sum(n1) for i in range(len(n1))]
clust2_freq0 = [n2[i]/sum(n2) for i in range(len(n2))]
clust3_freq0 = [n3[i]/sum(n3) for i in range(len(n3))]
clust4_freq0 = [n4[i]/sum(n4) for i in range(len(n4))]
clust0_freq0

[0.04564315352697095,
 0.04564315352697095,
 0.04979253112033195,
 0.05394190871369295,
 0.8049792531120332]

In [67]:
def transf1(x):
    return 40 + 70*x

In [68]:
clust0_freq_norm0 = []
clust1_freq_norm0 = []
clust2_freq_norm0 = []
clust3_freq_norm0 = []
clust4_freq_norm0 = []
for element in clust0_freq0:
    clust0_freq_norm0.append(transf1(element))
for element in clust1_freq0:
    clust1_freq_norm0.append(transf1(element))
for element in clust2_freq0:
    clust2_freq_norm0.append(transf1(element))
for element in clust3_freq0:
    clust3_freq_norm0.append(transf1(element))
for element in clust4_freq0:
    clust4_freq_norm0.append(transf1(element))
clust0_freq_norm0

[43.19502074688797,
 43.19502074688797,
 43.48547717842324,
 43.77593360995851,
 96.34854771784232]

In [69]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go


trace0 = go.Scatter(
    x=[1, 1, 1, 1, 1],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=nm0,
    name='Earth Science',
    textposition='middle center',
    marker=dict(
        size=clust0_freq_norm0,
    color=['rgb(255, 144, 14)','rgb(255, 144, 14)','rgb(255, 144, 14)', 'rgb(255, 144, 14)',
               'rgb(255, 144, 14)'],
    
    )
       
)

        
trace1 = go.Scatter(
    x=[1.2, 1.2, 1.2, 1.2, 1.2],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=nm1,
    name='Life Science',
    textposition='middle center',
    marker=dict(
        size=clust1_freq_norm0,
    color=['rgb(93, 164, 214)','rgb(93, 164, 214)','rgb(93, 164, 214)','rgb(93, 164, 214)','rgb(93, 164, 214)'],
           
    )
)

trace2 = go.Scatter(
    x=[1.4, 1.4, 1.4, 1.4, 1.4],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=nm2,
    name='Astronomy',
    textposition='middle center',
    marker=dict(
        size=clust2_freq_norm0,
    color=['rgb(144, 255, 14)','rgb(144, 255, 14)','rgb(144, 255, 14)', 'rgb(144, 255, 14)',
               'rgb(144, 255, 14)'],
    
    )
       
)

trace3 = go.Scatter(
    x=[1.6, 1.6, 1.6, 1.6, 1.6],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=nm3,
    name='Biology',
    textposition='middle center',
    marker=dict(
        size=clust3_freq_norm0,
    color=['rgb(50, 100, 150)','rgb(50, 100, 150)','rgb(50, 100, 150)', 'rgb(50, 100, 150)',
               'rgb(50, 100, 150)'],
    
    )
       
)

trace4 = go.Scatter(
    x=[1.8, 1.8, 1.8, 1.8, 1.8],
    y=[1, 2, 3, 4, 5],
    mode='markers+text',
    text=nm4,
    textposition='middle center',
    name='Physics',
    marker=dict(
        size=clust4_freq_norm0,
    color=['rgb(200, 150, 0)','rgb(200, 150, 0)','rgb(200, 150, 0)', 'rgb(200, 150, 0)',
               'rgb(200, 150, 0)'],
    
    )
       
)

layout = go.Layout(
    showlegend=False,
    
)


data1 = [trace0,trace1,trace2,trace3,trace4]
py.iplot(data1, filename='bubblechart-size', layout=layout)

High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~andyucb/0 or inside your plot.ly account where it is named 'bubblechart-size'


In [70]:
data.to_csv('data_with_clusters.csv', index=False)