Permalink
Browse files

CorpusExport produces matrices for each category (for a faceted grid), for the whole corpus, and for the centroids. The R script to graph all this is updated likewise.
  • Loading branch information...
1 parent 0364120 commit 57fb2f79be1284ce5ea8d2cd6f8f4905f34672fb @joshuaeckroth joshuaeckroth committed Jul 27, 2011
Showing with 142 additions and 51 deletions.
  1. +95 −41 CorpusExport.py
  2. +47 −10 corpus-mds.r
View
@@ -1,60 +1,114 @@
import sys
import re
+from AINewsConfig import paths
+from AINewsCorpus import AINewsCorpus
from AINewsCentroidClassifier import AINewsCentroidClassifier
-def find_max_key(models):
- maxes = map(lambda m: max(m.keys()), models)
- return max(maxes)
+aicorpus = AINewsCorpus()
+
+def dissim(tfidf1, tfidf2, category = None):
+ d = 1.0 - aicorpus.cos_sim(tfidf1, tfidf2, category)
+ if d < 0.1E-10: d = 0.0
+ return d
if __name__ == "__main__":
- ident = sys.argv[1]
- strategy = sys.argv[2]
+ directory = sys.argv[1]
+ ident = sys.argv[2]
- classifier = AINewsCentroidClassifier()
- corpus = classifier.load_corpus(ident, 1.0, strategy)[0]
+ corpus = aicorpus.load_corpus(ident, 1.0)[0]
- classifier.icsd_pow = 0.0
- classifier.csd_pow = 0.0
- classifier.sd_pow = 0.0
+ centroid = AINewsCentroidClassifier(aicorpus)
+ for category in aicorpus.categories:
+ centroid.train_centroid(category, corpus, 'centroid_eval', True)
+ centroid.init_predict(paths['ainews.category_data'] + 'centroid_eval')
- models = classifier.models
+ aicorpus.icsd_pow = 0.0
+ aicorpus.csd_pow = 0.0
+ aicorpus.sd_pow = 0.0
- for c in corpus:
- models["%s %s" % (c[0], c[2])] = classifier.get_tfidf(int(c[0]), c[1])
+ models = {}
+ for cat in aicorpus.categories:
+ models[cat] = []
+ articles = {}
cache = {}
- names = sorted(models.keys())
- for name in names:
- print name,
- if name != names[-1]: print ",",
- print
- for name in names:
- print name + ",",
- for other in names:
- if name in classifier.categories:
- urlidA = name
- else:
- urlidA = re.sub(r'[^\d]*', '', name)
+ for c in corpus:
+ for cat in c[2].split(' '):
+ tfidf = aicorpus.get_tfidf(int(c[0]), c[1])
+ articles[c[0]] = (tfidf, cat)
+ models[cat].append(c[0])
- if other in classifier.categories:
- urlidB = other
- else:
- urlidB = re.sub(r'[^\d]*', '', other)
+ models_csv = open("%s/models.csv" % directory, 'w')
+ ms = sorted(models.keys())
+ for model in ms:
+ models_csv.write(model)
+ if model != ms[-1]: models_csv.write(",")
+ models_csv.write("\n")
+ for model in ms:
+ models_csv.write(model + ",")
+ for other in ms:
+ d = dissim(centroid.models[model], centroid.models[other], model)
+ models_csv.write(str(d))
+ if other != ms[-1]: models_csv.write(",")
+ models_csv.write("\n")
+ models_csv.close()
- if (urlidA,urlidB) in cache:
- print str(cache[(urlidA,urlidB)]),
- elif (urlidB,urlidA) in cache:
- print str(cache[(urlidB,urlidA)]),
- else:
- # go from similarity to dissimilarity
- dissim = 1.0 - classifier.cos_sim(models[name], models[other])
- if dissim < 0.1E-10: dissim = 0.0
- cache[(urlidA,urlidB)] = dissim
- print str(dissim),
- if other != names[-1]: print ",",
- print
+ for model in models:
+ model_csv = open("%s/%s.csv" % (directory, model), 'w')
+ urlids = sorted(models[model])
+ model_csv.write(model + ",")
+ for urlid in urlids:
+ model_csv.write(str(urlid) + " " + model)
+ if urlid != urlids[-1]: model_csv.write(",")
+ model_csv.write("\n")
+ model_csv.write(model + ",0.0,")
+ for urlid in urlids:
+ d = dissim(articles[urlid][0], centroid.models[model], model)
+ cache[(model, urlid)] = d
+ model_csv.write(str(d))
+ if urlid != urlids[-1]: model_csv.write(",")
+ model_csv.write("\n")
+ for urlid in urlids:
+ model_csv.write(str(urlid) + " " + model + ",")
+ model_csv.write(str(cache[(model, urlid)]) + ",")
+ for other in urlids:
+ if (urlid, other) in cache:
+ model_csv.write(str(cache[(urlid, other)]))
+ elif (other, urlid) in cache:
+ model_csv.write(str(cache[(other, urlid)]))
+ else:
+ tfidf_article = articles[urlid][0]
+ tfidf_other = articles[other][0]
+ d = dissim(tfidf_article, tfidf_other, model)
+ cache[(urlid, other)] = d
+ model_csv.write(str(d))
+ if other != urlids[-1]: model_csv.write(",")
+ model_csv.write("\n")
+ model_csv.close()
+ corpus_csv = open("%s/corpus.csv" % directory, 'w')
+ urlids = sorted(articles.keys())
+ for urlid in urlids:
+ corpus_csv.write(str(urlid) + " " + articles[urlid][1])
+ if urlid != urlids[-1]: corpus_csv.write(",")
+ corpus_csv.write("\n")
+ for urlid in urlids:
+ corpus_csv.write(str(urlid) + " " + articles[urlid][1] + ",")
+ for other in urlids:
+ if (urlid, other) in cache:
+ corpus_csv.write(str(cache[(urlid, other)]))
+ elif (other, urlid) in cache:
+ corpus_csv.write(str(cache[(other, urlid)]))
+ else:
+ tfidf_article = articles[urlid][0]
+ tfidf_other = articles[other][0]
+ d = dissim(tfidf_article, tfidf_other, articles[urlid][1])
+ cache[(urlid, other)] = d
+ corpus_csv.write(str(d))
+ if other != urlids[-1]: corpus_csv.write(",")
+ corpus_csv.write("\n")
+ corpus_csv.close()
View
@@ -1,30 +1,67 @@
library(ggplot2)
args <- commandArgs(trailingOnly = T)
-corpus <- read.csv(paste(args[1],".csv", sep=""))
+directory <- args[1]
+corpus <- read.csv(paste(directory,"/corpus.csv", sep=""))
fit <- cmdscale(corpus, k=2)
data <- as.data.frame(fit)
data$Category <- gsub("\\d+ ", "", rownames(data))
-data$URLID <- gsub("(\\d+)?.*", "\\1", rownames(fit))
Category <- factor(gsub("\\d+ ", "", rownames(fit)))
-png(paste(args[1],"-mds.png", sep=""), width=500, height=500, res=100)
+png(paste(directory,"/corpus-mds.png", sep=""), width=500, height=500, res=100)
p <- ggplot(data) +
- geom_point(data=subset(data, URLID != ""),
- aes(x=V1, y=V2, size=1.5, color=Category)) +
- geom_point(data=subset(data, URLID == ""),
- aes(x=V1, y=V2, size=7, shape=c(1), color=Category)) +
+ geom_point(aes(x=V1, y=V2, color=Category)) +
scale_x_continuous("", breaks=NA) +
scale_y_continuous("", breaks=NA) +
opts(axis.text.x = theme_blank(), axis.title.x=theme_blank(),
axis.text.y = theme_blank(), axis.title.y=theme_blank(),
legend.position = "none")
-
p
dev.off()
-png(paste(args[1],"-mds-faceted.png", sep=""), width=500, height=500, res=100)
-p + facet_wrap(~ Category)
+corpus <- read.csv(paste(directory,"/models.csv", sep=""))
+fit <- cmdscale(corpus, k=2)
+data <- as.data.frame(fit)
+data$Category <- rownames(data)
+png(paste(directory,"/corpus-mds-centroids.png", sep=""),
+ width=500, height=500, res=100)
+p <- ggplot(data) +
+ geom_text(aes(x=V1, y=V2, label=Category, size=3, color=Category)) +
+ scale_x_continuous("", breaks=NA) +
+ scale_y_continuous("", breaks=NA) +
+ opts(axis.text.x = theme_blank(), axis.title.x=theme_blank(),
+ axis.text.y = theme_blank(), axis.title.y=theme_blank(),
+ legend.position = "none")
+p
+dev.off()
+
+png(paste(directory,"/corpus-mds-faceted.png", sep=""),
+ width=500, height=500, res=100)
+p <- ggplot(data)
+cats <- as.vector(unique(Category))
+for(cat in cats)
+{
+ corpus <- read.csv(paste(directory,"/",cat,".csv", sep=""))
+ if(nrow(corpus) > 2)
+ {
+ fit <- cmdscale(corpus, k=2)
+ data_cat <- as.data.frame(fit)
+ data_cat$Category <- gsub("\\d+ ", "", rownames(data_cat))
+ data_cat$URLID <- gsub("(\\d+)?.*", "\\1", rownames(fit))
+
+ p <- p + geom_point(data=subset(data_cat, URLID != ""),
+ aes(x=V1, y=V2, size=1.5, color=Category)) +
+ geom_point(data=subset(data_cat, URLID == ""),
+ aes(x=V1, y=V2, size=7, shape=c(1)))
+ }
+}
+p <- p + scale_x_continuous("", breaks=NA) +
+ scale_y_continuous("", breaks=NA) +
+ opts(axis.text.x = theme_blank(), axis.title.x=theme_blank(),
+ axis.text.y = theme_blank(), axis.title.y=theme_blank(),
+ legend.position = "none") +
+ facet_wrap(~ Category)
+p
dev.off()

0 comments on commit 57fb2f7

Please sign in to comment.