In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from scipy import stats
import sklearn

%matplotlib inline
plt.style.use('ggplot')

In [2]:
with open("json/filenames_topics.json", 'r') as f:
        topics_articles = json.load(f)

In [6]:
header = ['article', 'subject']

for subject, articles in topics_articles.iteritems():
    for pub_id, topics in articles.iteritems():
        header.extend(topics)        

print header[:20]

['article', 'subject', u'scale', u'antibody', u'arrow', u'indicate', u'nonspecific', u'mutant', u'receptive', u'signal', u'junction', u'epithelial', u'hand', u'reduced', u'arm', u'higher', u'hand', u'behaviour', u'white', u'matter']


In [7]:
top_data = []

for subject, articles in topics_articles.iteritems():
    for pub_id, topics in articles.iteritems():        
        ct = []
        ct.append(pub_id)
        ct.append(subject)
        tpcs = ['1' if h in topics else '0' for h in header[2:]]
        ct.extend(tpcs)
        top_data.append(ct)
        
topics_data = np.array(top_data)
print topics_data

[[u'elife05116' u'Neuroscience' u'1' ..., u'0' u'0' u'0']
 [u'elife02094' u'Neuroscience' u'0' ..., u'0' u'0' u'0']
 [u'elife01206' u'Neuroscience' u'0' ..., u'0' u'0' u'0']
 ..., 
 [u'elife04810' u'Cell biology' u'0' ..., u'0' u'0' u'0']
 [u'elife02678' u'Cell biology' u'0' ..., u'0' u'0' u'0']
 [u'elife05697' u'Cell biology' u'1' ..., u'1' u'1' u'1']]


In [8]:
## Data matrix: column 3 to the end
X = topics_data[:, 2:2392].astype(int)
## Class vector: column 2
Y = topics_data[:, 1] 

In [14]:
## NAIVE BAYES MODEL:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics, preprocessing
from sklearn.cross_validation import train_test_split

In [19]:
## Encode labels ('Cell biology', 'Neuroscience') as 0 and 1.
le = preprocessing.LabelEncoder()
le.fit(Y)
y_transformed = le.transform(Y)
#print Y
#print y_transformed
## => 1: Neuroscience, 0: Cell biology

## Randomly split the data between training and testing:
X_train, X_test, Y_train, Y_test = train_test_split(X, y_transformed)

nbmodel_train = GaussianNB().fit(X_train, Y_train)
predicted_nb = nbmodel_train.predict(X_test)

In [33]:
## Confusion matrix:
cm_nb = metrics.confusion_matrix(Y_test, predicted_nb)
print "True positive: ", cm_nb[0][0]," - False negative: ", cm_nb[0][1]
print "False positive: ", cm_nb[1][0], " - True negative: ", cm_nb[1][1]

True positive:  12  - False negative:  7
False positive:  5  - True negative:  36


In [20]:
# Precision: fraction of retrieved instances that are relevant, TP / TP + FP
# Recall: fraction of relevant instances that are retrieved
print metrics.classification_report(Y_test, predicted)

# Accuracy: overall correctness of the model
print "Accuracy: ", metrics.accuracy_score(Y_test, predicted) 

             precision    recall  f1-score   support

          0       0.64      0.69      0.67        13
          1       0.91      0.89      0.90        47

avg / total       0.85      0.85      0.85        60

Accuracy:  0.85


In [21]:
## => Test with a high precision and recall for class 1 = Neuroscience

In [23]:
## K NEAREST NEIGHBOUR MODEL
from sklearn import metrics
from sklearn import neighbors

In [31]:
knnk3 = neighbors.KNeighborsClassifier(n_neighbors=3).fit(X, y_transformed)
predicted_k3 = knnk3.predict(X)

knnk5 = neighbors.KNeighborsClassifier(n_neighbors=5).fit(X, y_transformed)
predicted_k5 = knnk5.predict(X)

knnk15 = neighbors.KNeighborsClassifier(n_neighbors=15).fit(X, y_transformed)
predicted_k15 = knnk15.predict(X)

In [30]:
print "Accuracy 3 NN: ", metrics.accuracy_score(y_transformed, predicted_k3)
print "Accuracy 5 NN: ", metrics.accuracy_score(y_transformed, predicted_k5)
print "Accuracy 15 NN: ", metrics.accuracy_score(y_transformed, predicted_k15)

Accuracy 3 NN:  0.882845188285
Accuracy 5 NN:  0.811715481172


In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y_transformed)

knn = neighbors.KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
predicted_knn = knn.predict(X_test)

cm_knn = metrics.confusion_matrix(Y_test, predicted)
print "True positive: ", cm[0][0]," - False negative: ", cm[0][1]
print "False positive: ", cm[1][0], " - True negative: ", cm[1][1]

True positive:  10  - False negative:  6
False positive:  4  - True negative:  40
