In [24]:
import csv
import random
import time

import matplotlib.pyplot as plt
import nltk
import numpy as np
from sklearn import preprocessing
from sklearn import svm
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics.pairwise import linear_kernel

In [3]:
# Fetch the NLTK resources this notebook relies on: "punkt" (for tokenization)
# and the stop-word lists.
for resource in ("punkt", "stopwords"):
    nltk.download(resource)

# Porter stemmer instance and the English stop words as a set.
stemmer = nltk.stem.PorterStemmer()
stpwds = set(nltk.corpus.stopwords.words("english"))

[nltk_data] Downloading package punkt to /home/caykroyd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/caykroyd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Read testing_set.txt through the csv module, one list of fields per line.
# newline="" is what the csv documentation asks for on files handed to
# csv.reader, so that line endings are interpreted by the reader itself.
with open("testing_set.txt", "r", newline="") as f:
    reader = csv.reader(f)
    # csv.reader yields [] for a blank line; dropping those rows keeps the
    # first-field lookup in the next cell from raising IndexError.
    testing_set = [row for row in reader if row]

In [5]:
# Split the first CSV field of every row on single spaces.
# NOTE: this rebinds testing_set, so re-run the loading cell before re-running
# this one.
testing_set = [fields[0].split(" ") for fields in testing_set]

In [6]:
# Read training_set.txt and split the first CSV field of each line on single
# spaces, giving one list of tokens per line.
# newline="" is what the csv documentation asks for on files handed to
# csv.reader, so that line endings are interpreted by the reader itself.
with open("training_set.txt", "r", newline="") as f:
    reader = csv.reader(f)
    # csv.reader yields [] for a blank line; skipping those rows avoids an
    # IndexError on row[0]. Parsing and splitting in one pass also makes the
    # cell safe to re-run.
    training_set = [row[0].split(" ") for row in reader if row]

In [7]:
# Read node_information.csv into a list of rows (each row a list of fields).
# newline="" is what the csv documentation asks for on files handed to
# csv.reader; without it, newlines embedded in quoted fields may be
# misinterpreted.
with open("node_information.csv", "r", newline="") as f:
    reader = csv.reader(f)
    # csv.reader yields [] for a blank line; such rows have no columns to
    # index, so they are left out.
    node_info = [row for row in reader if row]

In [8]:
# First column of every node_info row, in file order.
IDs = [record[0] for record in node_info]

In [9]:
# Build the TF-IDF representation of every paper.
# Column 5 of each node_info row is the text that gets vectorised.
corpus = [record[5] for record in node_info]

# English stop words are removed by the vectoriser itself.
vectorizer = TfidfVectorizer(stop_words="english")

# Row i of the resulting matrix belongs to node_info[i].
features_TFIDF = vectorizer.fit_transform(corpus)

In [10]:
# Fit LSA: a truncated SVD of the TF-IDF matrix, followed by row normalisation.
# algorithm="randomized" is the solver to use for large datasets; it draws
# random numbers, so the seed is pinned to make reruns give the same vectors.
components = 3
lsa = TruncatedSVD(n_components=components, algorithm='randomized', random_state=0)
dtm_lsa = lsa.fit_transform(features_TFIDF)
# copy=False asks the normaliser to work on dtm_lsa directly instead of a copy.
dtm_lsa = preprocessing.Normalizer(copy=False).fit_transform(dtm_lsa)

In [103]:
# Evenly spaced values starting at 0.05 with step 0.05; used as the x axis of
# the F1 plot further down.
radius = np.arange(0.05, 0.5, 0.05)

# Paper ID -> row index (the same row order as node_info / features_TFIDF).
IDs_dic = {paper_id: row for row, paper_id in enumerate(IDs)}

# Spot check on one pair: Jaccard overlap between the sets of TF-IDF columns
# that are non-zero for papers '1001' and '1015'.
set_id1 = set(features_TFIDF[IDs_dic['1001']].nonzero()[1])
set_id2 = set(features_TFIDF[IDs_dic['1015']].nonzero()[1])
jaccard = len(set_id1.intersection(set_id2)) / len(set_id1.union(set_id2))
jaccard

0.023255813953488372

In [105]:
# Evaluate a simple link predictor on training_set: a pair (id1, id2) is
# predicted positive when the Jaccard overlap of their TF-IDF term sets reaches
# jac_eps and the column-1 ordering check below holds. Precision, recall and
# F-score are computed against the '1' / '0' labels in the third field.
F1 = []  # F-scores produced by this cell

jac_eps = 0.01  # minimum Jaccard overlap for a positive prediction
cos_eps = 0.2   # kept for a cosine-similarity variant; not used below
tp = 0 #true positive
fp = 0 #false positive
fn = 0 #false negative

# Row index -> set of non-zero TF-IDF columns. Papers occur in many pairs, so
# caching means each sparse row is sliced once instead of once per pair.
term_sets = {}

for (id1, id2, expected) in training_set:
    row1 = IDs_dic[id1]
    row2 = IDs_dic[id2]

    # True when id1's column-1 value is >= id2's.
    # NOTE(review): column 1 is presumably the publication year, which would
    # make this "id1 is not older than id2" despite the name -- confirm.
    is_earlier = int(node_info[row1][1]) >= int(node_info[row2][1])

    for row in (row1, row2):
        if row not in term_sets:
            term_sets[row] = set(features_TFIDF[row].nonzero()[1])
    set_id1 = term_sets[row1]
    set_id2 = term_sets[row2]

    # Two papers without any TF-IDF term have an empty union; score that
    # pair 0 instead of dividing by zero.
    union_size = len(set_id1 | set_id2)
    jaccard = len(set_id1 & set_id2) / union_size if union_size else 0.0
    predicted = jaccard >= jac_eps and is_earlier

    if predicted and expected == '1':
        tp += 1
    elif predicted and expected == '0':
        fp += 1
    elif expected == '1':
        fn += 1

# Each ratio falls back to 0 when its denominator is 0 (e.g. nothing was
# predicted positive), so the cell still reports a result.
p = tp/(tp+fp) if (tp+fp) else 0.0 #precision
r = tp/(tp+fn) if (tp+fn) else 0.0 #recall
f = (2*p*r)/(p+r) if (p+r) else 0.0
F1.append(f)

# Threshold that was applied, then the counts and the scores.
print(jac_eps, tp,fp,fn,p,r,f)


0.01 326373 123699 8757 0.7251573081640271 0.9738698415540238 0.831309650255603


In [None]:
# Scatter the recorded F-scores against the values in `radius`.
# scatter() needs exactly one y value per x value, so the plot is only drawn
# when F1 holds one score per entry of radius; otherwise say why it was skipped.
if len(F1) == len(radius):
    fig, ax = plt.subplots()
    ax.scatter(radius, F1)
    ax.set_xlabel('radius')
    ax.set_ylabel('F1')
    plt.show()
else:
    print("Plot skipped: F1 has %d value(s) but radius has %d"
          % (len(F1), len(radius)))
F1

In [28]:
# Collect the first three coordinates of every LSA vector into one list per
# axis, ready for plotting.
# (A fourth list would need dtm_lsa to have at least four columns.)
xs, ys, zs = ([point[axis] for point in dtm_lsa] for axis in range(3))

In [29]:
%pylab inline
import matplotlib.pyplot as plt
figure()
plt.scatter(xs, ys)
xlabel('First principal component')
ylabel('Seconf principal component')
show()

Populating the interactive namespace from numpy and matplotlib


AttributeError: module 'matplotlib' has no attribute 'path'

<Figure size 432x288 with 0 Axes>

In [125]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier

# Inputs for the k-NN experiment.
# NOTE(review): `raw_text_dataset` is not defined in any cell visible in this
# notebook, and the training texts come from `corpus` while the training labels
# come from `raw_text_dataset[1]` -- confirm it is loaded beforehand and that
# texts and labels line up one-to-one.
X_train_raw = corpus
y_train_labels = raw_text_dataset[1]
X_test_raw = raw_text_dataset[2]
y_test_labels = raw_text_dataset[3]

# The Reuters dataset consists of ~100 categories. However, we are going to
# simplify this to a binary classification problem. The 'positive class' will
# be the articles related to "acquisitions" (or "acq" in the dataset). All
# other articles will be negative.
# NOTE(review): this description assumes raw_text_dataset holds Reuters
# labels -- confirm.
y_train = ["acq" in y for y in y_train_labels]
y_test = ["acq" in y for y in y_test_labels]


# NOTE: this rebinds `vectorizer` (and `lsa` below), replacing the objects
# created in the earlier TF-IDF / LSA cells.
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                             min_df=2, stop_words='english',
                             use_idf=True)

# Build the tfidf vectorizer from the training data ("fit"), and apply it
# ("transform").
X_train_tfidf = vectorizer.fit_transform(X_train_raw)

print("  Actual number of tfidf features: %d" % X_train_tfidf.shape[1])

print("\nPerforming dimensionality reduction using LSA")
t0 = time.time()

# Project the tfidf vectors onto the first 100 SVD components, then normalise
# each projected row. The seed is pinned because the SVD solver is randomised;
# without it the projected features change from run to run.
svd = TruncatedSVD(n_components=100, random_state=0)
lsa = make_pipeline(svd, Normalizer(copy=False))

# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

print("  done in %.3fsec" % (time.time() - t0))

explained_variance = svd.explained_variance_ratio_.sum()
print("  Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))


# Now apply the already-fitted transformations to the test data as well.
X_test_tfidf = vectorizer.transform(X_test_raw)
X_test_lsa = lsa.transform(X_test_tfidf)


###############################################################################
#  Run classification of the test articles
###############################################################################

def _knn_predict(X_train, train_labels, X_test):
    """Fit a k-NN classifier and label the rows of X_test.

    Uses k = 5 (majority wins), the cosine distance, and brute-force
    calculation of distances.

    Returns a ``(fitted_classifier, predictions)`` tuple.
    """
    knn = KNeighborsClassifier(n_neighbors=5, algorithm='brute', metric='cosine')
    knn.fit(X_train, train_labels)
    return knn, knn.predict(X_test)


def _count_correct(predicted, actual):
    """Return how many predictions equal the label at the same position."""
    return sum(1 for guess, truth in zip(predicted, actual) if guess == truth)


def _print_accuracy(n_correct, n_total):
    """Print the hit count and accuracy percentage (0% when n_total is 0)."""
    pct = float(n_correct) / float(n_total) * 100.0 if n_total else 0.0
    print("  (%d / %d) correct - %.2f%%" % (n_correct, n_total, pct))


print("\nClassifying tfidf vectors...")

# Time this step (fit, predict and accuracy report).
t0 = time.time()

knn_tfidf, p = _knn_predict(X_train_tfidf, y_train, X_test_tfidf)

# Measure accuracy
numRight = _count_correct(p, y_test)
_print_accuracy(numRight, len(y_test))

# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)
print("  done in %.3fsec" % elapsed)


print("\nClassifying LSA vectors...")

# Time this step (fit, predict and accuracy report).
t0 = time.time()

knn_lsa, p = _knn_predict(X_train_lsa, y_train, X_test_lsa)

# Measure accuracy
numRight = _count_correct(p, y_test)
_print_accuracy(numRight, len(y_test))

# Calculate the elapsed time (in seconds)
elapsed = (time.time() - t0)
print("    done in %.3fsec" % elapsed)