In [1]:
from ngram import ngram
import os
import preprocess
from gt_ngram import gt_ngram
from li_ngram import li_ngram
import sys
import operator
import csv
import numpy as linspace

# TODO: delete it when not used

# Path prefixes: the current working directory plus a trailing "/", so that
# relative paths such as "data/classification_task/..." can be appended
# directly. outdir_pre is not referenced by the code visible in this file.
indir_pre = os.getcwd() + "/"
outdir_pre = os.getcwd() + "/"
# Topic directory name -> integer id; the id is what gets written to the
# 'Prediction' column of the result CSV files.
topics = {'atheism':0, 'autos':1, 'graphics':2, 'medicine':3, 'motorcycles':4, 'religion':5, 'space':6}

def random_sentence_ngram(n = 2, sent_pre = "I have"):
    """
    for every topic, build an ngram model from its train_docs and print
    generated sentences for each order k = 1..n: three generated from
    nothing, then three generated with sent_pre passed as the incomplete
    sentence
    """
    samples = 3  # sentences printed per setting

    for topic in topics:
        train_dir = indir_pre + "data/classification_task/{}/train_docs".format(topic)
        model = ngram(preprocess.preprocess_dir(train_dir))
        print("\n\n\nTopic: {}\n".format(topic))

        for order in range(1, n + 1):
            print("[{}-gram]\n".format(order))

            print("Empty sentence")
            for idx in range(1, samples + 1):
                print("[{}]  ".format(idx) + model.generate_sentence(order))

            print("\nWith incomplete sentence: " + "\"{}\"".format(sent_pre))
            for idx in range(1, samples + 1):
                print("[{}]  ".format(idx) + model.generate_sentence(order, sent_pre))


def generate_perplexity_gt_ngram():
    """
    train a Good-Turing ngram on each topic's train_docs and print the
    perplexity of that same training text for n = 1..5
    """
    models = {}  # key: topic
    for topic in topics:
        train_dir = indir_pre + "data/classification_task/{}/train_docs".format(topic)
        corpus = preprocess.preprocess_dir(train_dir)
        model = gt_ngram(corpus)
        models[topic] = model

        print("\nTopic: {}".format(topic))
        for order in range(1, 6):
            perplexity = model.generate_perplexity(order, corpus)
            print("[{}-gram]: {}".format(order, perplexity))


def _gt_predict_topic(models, n, text):
    """
    return the topic whose model assigns text the lowest n-gram perplexity

    models maps topic -> object exposing generate_perplexity(n, text).
    ties keep the first topic met while iterating models; None is returned
    only when models is empty
    """
    best_topic, best_perp = None, None
    for topic in models:
        perp = models[topic].generate_perplexity(n, text)
        # the first candidate is always accepted, so the result never
        # depends on a sentinel value (or on the true label)
        if best_perp is None or perp < best_perp:
            best_topic, best_perp = topic, perp
    return best_topic


def topic_classification_gt_ngram():
    """
    calculate the accuracy for topic classification with different
    n in Good-Turing ngram, then choose the best one to classify files
    in test_for_classification directory, and write results into
    gt_result.csv in classification_task directory

    accuracy is measured per sentence on the held-out test.txt of each
    topic (created by split_train_test() when missing); models are
    trained on train.txt only
    """
    base = indir_pre + "data/classification_task/"

    # get gt_ngram for each topic and read all test data
    gt_ngrams, test_sentences = {}, {}  # key: topic
    for topic in topics:
        train_f = base + "{}/train.txt".format(topic)
        test_f = base + "{}/test.txt".format(topic)
        if not os.path.isfile(train_f) or not os.path.isfile(test_f):
            split_train_test()

        with open(train_f, 'r') as fin:
            gt_ngrams[topic] = gt_ngram(fin.read())
        with open(test_f, 'r') as fin:
            test_text = fin.read()

        # split once, outside the loop over n; the piece after the final
        # '</s>' is blank and must not be scored as a sentence
        test_sentences[topic] = [piece + ' </s>'
                                 for piece in test_text.split('</s>')
                                 if piece.strip()]

    # calculate the accuracy for n-gram and choose the best one
    accuracy = {}  # key: the n in gt_ngram
    for n in range(1, 5):
        total, correct = 0, 0
        for label_topic, sentences in test_sentences.items():
            for sentence in sentences:
                if _gt_predict_topic(gt_ngrams, n, sentence) == label_topic:
                    correct += 1
                total += 1

        # guard against an empty test set instead of dividing by zero
        accuracy[n] = 1.0 * correct / total if total else 0.0
        print("[{}-gram] {}".format(n, accuracy[n]))

    # choose the best n; iterating in sorted order makes the smallest n
    # win when several reach the same accuracy
    best_n = max(sorted(accuracy), key = accuracy.get)

    # get the result for files in test_for_classification directory
    test_dir = base + "test_for_classification"
    csv_f = base + "gt_result.csv"

    with open(csv_f, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames = ['ID', 'Prediction'])
        writer.writeheader()

        for root, dirs, filenames in os.walk(test_dir):
            for f in filenames:
                text = preprocess.preprocess_file(os.path.join(root, f))
                prediction = _gt_predict_topic(gt_ngrams, best_n, text)
                writer.writerow({'ID': f, 'Prediction': '{}'.format(topics[prediction])})


def _split_tokens_at_sentence_end(tokens, ratio = 0.8):
    """
    split tokens into (head, tail) at the first '</s>' found at or after
    ratio of the way through the list; head keeps that '</s>' and tail
    starts with the very next token

    when no '</s>' exists from that position on (including an empty
    list), every token goes to head and tail is empty
    """
    pointer = int(len(tokens) * ratio)
    # bounded scan: stop at the end of the list instead of indexing past it
    while pointer < len(tokens) and tokens[pointer] != '</s>':
        pointer += 1
    return tokens[:pointer + 1], tokens[pointer + 1:]


def split_train_test():
    """
    split train_docs into     training:test = 4:1
    store the preprocessed file train.txt and test.txt in each topic directory

    the cut is made right after the nearest '</s>' token following 80% of
    the tokens, so no sentence is shared between the two files and no
    token is dropped
    """
    for topic in topics:
        topic_dir = indir_pre + "data/classification_task/{}/".format(topic)
        content = preprocess.preprocess_dir(topic_dir + "train_docs")

        train_tokens, test_tokens = _split_tokens_at_sentence_end(content.split())

        # context managers make sure both files are flushed and closed
        with open(topic_dir + "train.txt", 'w') as fout:
            fout.write(' '.join(train_tokens))
        with open(topic_dir + "test.txt", 'w') as fout:
            fout.write(' '.join(test_tokens))


def _li_predict_topic(models, n, text, weights):
    """
    return the topic whose model assigns text the lowest perplexity

    models maps topic -> object exposing generate_perplexity(n, text, r);
    weights is forwarded unchanged as that third argument. ties keep the
    first topic met while iterating models; None is returned only when
    models is empty
    """
    best_topic, best_perp = None, None
    for topic in models:
        perp = models[topic].generate_perplexity(n, text, weights)
        # the first candidate is always accepted, so the result never
        # depends on a sentinel value (or on the true label)
        if best_perp is None or perp < best_perp:
            best_topic, best_perp = topic, perp
    return best_topic


def topic_classification_li_ngram():
    """
    search for the weight list r to use with li_ngram, then classify
    files in test_for_classification directory and write results into
    li_result.csv in classification_task directory

    r holds three weights in steps of 0.1 that sum to 1; every such
    combination is scored by per-sentence accuracy (n = 3) on the
    held-out test.txt of each topic, and the best one is used for the
    final predictions
    """
    # TODO: test once li_ngram is done
    order = 3  # the n passed to generate_perplexity throughout
    base = indir_pre + "data/classification_task/"

    # get li_ngram for each topic and read all test data
    li_ngrams, test_sentences = {}, {}  # key: topic
    for topic in topics:
        train_f = base + "{}/train.txt".format(topic)
        test_f = base + "{}/test.txt".format(topic)
        if not os.path.isfile(train_f) or not os.path.isfile(test_f):
            split_train_test()

        with open(train_f, 'r') as fin:
            li_ngrams[topic] = li_ngram(fin.read())
        with open(test_f, 'r') as fin:
            test_text = fin.read()

        # split once, outside the weight search; the piece after the final
        # '</s>' is blank and must not be scored as a sentence
        test_sentences[topic] = [piece + ' </s>'
                                 for piece in test_text.split('</s>')
                                 if piece.strip()]

    accuracy = {}  # key: tuple of the three weights
    for i in range(0, 11):
        for j in range(0, 11 - i):
            # derive the third weight from integer tenths so it can never
            # come out as -0.0 through float subtraction
            r = [round(i * 0.1, 1),
                 round(j * 0.1, 1),
                 round((10 - i - j) * 0.1, 1)]

            total, correct = 0, 0
            for label_topic, sentences in test_sentences.items():
                for sentence in sentences:
                    if _li_predict_topic(li_ngrams, order, sentence, r) == label_topic:
                        correct += 1
                    total += 1

            # guard against an empty test set instead of dividing by zero
            accuracy[tuple(r)] = 1.0 * correct / total if total else 0.0
            print("{}: {}".format(r, accuracy[tuple(r)]))

    # choose the best r; iterating in sorted order makes the choice
    # deterministic when several weightings reach the same accuracy
    r_tuple = max(sorted(accuracy), key = accuracy.get)
    r = list(r_tuple)
    print("Best: {}: {}".format(list(r_tuple), accuracy[r_tuple]))

    # get the result for files in test_for_classification directory
    test_dir = base + "test_for_classification"
    csv_f = base + "li_result.csv"

    with open(csv_f, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames = ['ID', 'Prediction'])
        writer.writeheader()

        for root, dirs, filenames in os.walk(test_dir):
            for f in filenames:
                text = preprocess.preprocess_file(os.path.join(root, f))
                prediction = _li_predict_topic(li_ngrams, order, text, r)
                writer.writerow({'ID': f, 'Prediction': '{}'.format(topics[prediction])})



def spell_checker_gt_nrgam():
    """
    placeholder: not implemented yet, does nothing and returns None
    """
    return None


def main():
    """
    script entry point: print the Good-Turing ngram perplexities
    for every topic via generate_perplexity_gt_ngram()
    """
    generate_perplexity_gt_ngram()


# run main() only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()



Topic: motorcycles
69460
[1-gram]: 383.367395055
[2-gram]: 50.9102244898
[3-gram]: 20.4338299783
[4-gram]: 21.6217053971
[5-gram]: 32.690829999

Topic: religion
122240
[1-gram]: 374.349757888
[2-gram]: 61.2114565452
[3-gram]: 21.9773614137
[4-gram]: 20.9704505182
[5-gram]: 30.7291032114

Topic: space
98356
[1-gram]: 452.182876807
[2-gram]: 50.2179340294
[3-gram]: 22.1660110232
[4-gram]: 21.8857576873
[5-gram]: 33.7432371164

Topic: atheism
123052
[1-gram]: 352.506629811
[2-gram]: 58.3163221624
[3-gram]: 22.0791240261
[4-gram]: 20.7034454704
[5-gram]: 29.777218442

Topic: autos
75882
[1-gram]: 370.809009053
[2-gram]: 53.9476850901
[3-gram]: 20.8764338813
[4-gram]: 21.2980987217
[5-gram]: 32.6500295317

Topic: graphics
88393
[1-gram]: 431.023923274
[2-gram]: 59.2557115101
[3-gram]: 26.0881617245
[4-gram]: 27.5959418782
[5-gram]: 41.7218570274

Topic: medicine
97481
[1-gram]: 435.307195493
[2-gram]: 57.2599781142
[3-gram]: 23.0292740828
[4-gram]: 23.6068411911
[5-gram]: 37.6671774318
