In [None]:
import regex
import string
from collections import Counter
import csv

In [None]:
import nltk
from nltk.tokenize import casual_tokenize
from nltk.util import ngrams
from nltk.util import bigrams
from nltk.lm import NgramCounter
from nltk.corpus import stopwords

In [None]:
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Turn on IPython's matplotlib integration. No backend argument is given, so
# the default backend is used.
# NOTE(review): name the backend explicitly (e.g. `inline`) if figures are
# expected to render inside the notebook — confirm the intended behaviour.
%matplotlib

In [None]:
# Input location: DATA_DIR and DATA are concatenated to form the path of the
# tab-separated tweet file that the loading cell opens.
DATA_DIR = '../data/'
DATA = 'INPUT.txt'

In [None]:
# Tokens dropped during cleaning: every ASCII punctuation character plus a few
# extra marks, and NLTK's English stop-word list plus 'rt' and 'via'.
punctuation = [*string.punctuation, '…', '...', '’']
stop_words = [*stopwords.words('english'), 'rt', 'via']

In [None]:
# The total number of distinct words (vocabulary)
# The total number of tokens corresponding to the top 10 most frequent words (types) in the vocabulary
def totals(text):
    """Summarise the vocabulary of a sequence of tokens.

    Args:
        text: iterable of hashable tokens (one entry per token occurrence).

    Returns:
        A 3-tuple ``(vocabulary_size, top_ten, top_ten_total)``:
        the number of distinct tokens, the list of ``(token, frequency)``
        pairs for the ten most frequent tokens, and the sum of those ten
        frequencies.
    """
    frequencies = Counter(text)
    top_ten = frequencies.most_common(10)
    top_ten_total = sum(freq for _, freq in top_ten)
    return len(frequencies), top_ten, top_ten_total

In [None]:
# The number of words, characters and av char count for each tweet
def tweet_counts(tweet):
    """Count the tokens and characters of one tokenized tweet.

    Args:
        tweet: list of token strings.

    Returns:
        A 3-tuple ``(word_count, total_chars, av_chars)``: the number of
        tokens, the summed token lengths, and the mean token length.
        ``av_chars`` is ``0.0`` for a tweet with no tokens.
    """
    char_counts = [len(word) for word in tweet]
    total_chars = sum(char_counts)
    # A tweet can lose every token to stop-word/punctuation filtering; report
    # a zero average instead of raising ZeroDivisionError.
    av_chars = total_chars / len(char_counts) if char_counts else 0.0
    return len(tweet), total_chars, av_chars

In [None]:
# calculate standard deviation of characters per token in a tweet
def standard_deviation(text):
    """Sample standard deviation of token lengths in a whitespace-delimited string.

    Args:
        text: string of tokens separated by whitespace.

    Returns:
        The sample standard deviation (n - 1 divisor) of the token lengths,
        or ``0.0`` when the text holds fewer than two tokens.
    """
    # split() without an argument ignores runs of whitespace, so empty strings
    # are never counted as zero-length tokens.
    lengths = [len(word) for word in text.split()]

    # The n - 1 divisor is undefined for zero or one token; there is no spread
    # to report in that case.
    if len(lengths) < 2:
        return 0.0

    average = sum(lengths) / len(lengths)
    squared_diffs = sum((n - average) ** 2 for n in lengths)
    return (squared_diffs / (len(lengths) - 1)) ** 0.5

In [None]:
# The average number and standard deviation of characters per token
# The total number of characters
def token_counts(tweet_list):
    """Aggregate character statistics over every tweet in the corpus.

    Args:
        tweet_list: list of dicts, each providing 'text' (the tweet's tokens
            joined by single spaces), 'word_count' and 'char_count'.

    Returns:
        A 3-tuple ``(total_chars, av_char_count, sd)``: the summed character
        count, the mean number of characters per token, and the standard
        deviation of characters per token across the whole corpus. The mean
        is ``0.0`` when there are no tokens and the deviation is ``0.0`` when
        there are fewer than two.
    """
    total_chars = sum(tweet['char_count'] for tweet in tweet_list)
    total_words = sum(tweet['word_count'] for tweet in tweet_list)

    # The deviation has to cover every tweet. Reading the loop variable after
    # the loop would measure the last tweet only. Empty texts are skipped so
    # the join never produces empty tokens.
    corpus_text = ' '.join(tweet['text'] for tweet in tweet_list if tweet['text'])

    av_char_count = total_chars / total_words if total_words else 0.0
    # A sample standard deviation needs at least two tokens.
    sd = standard_deviation(corpus_text) if len(corpus_text.split()) > 1 else 0.0
    return total_chars, av_char_count, sd

In [None]:
# debugging
def write_counter_to_file(counters):
    """Dump n-gram counters to CSV files in the current working directory.

    The first four counters are written to bigrams.csv, trigrams.csv,
    fourgrams.csv and fivegrams.csv, in that order; any further counter is
    written to '<n>grams.csv'. Each row holds the items of one n-gram followed
    by its count.

    Args:
        counters: iterable of mappings from n-gram tuples to their counts.
    """
    filenames = ['bigrams', 'trigrams', 'fourgrams', 'fivegrams']
    for index, counter in enumerate(counters):
        if index < len(filenames):
            filename = filenames[index]
        else:
            # The first counter holds 2-grams, so position `index` holds (index + 2)-grams.
            filename = str(index + 2) + 'grams'
        # newline='' is what the csv module expects (otherwise extra blank
        # rows appear on Windows); utf-8 keeps non-ASCII tokens writable
        # whatever the locale's default encoding is.
        with open(filename + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            for key, value in counter.items():
                writer.writerow(list(key) + [value])

In [None]:
# The total number of distinct n-grams (of words) that appear in the dataset for n=2,3
# You can use sets or Counters pretty easily
# https://dbader.org/blog/sets-and-multiset-in-python
# Check http://www.nltk.org/api/nltk.html#nltk.util.bigrams
# This also looks promising
# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
def ngram_counts(tweet_list):
    """Count the distinct word n-grams (n = 2 to 5) found across all tweets.

    N-grams are built per tweet from the whitespace-split 'text' field, so an
    n-gram never spans two tweets. As a side effect the four counters are
    dumped to CSV files through write_counter_to_file (debugging aid).

    Args:
        tweet_list: iterable of dicts that each provide a 'text' string.

    Returns:
        A 4-tuple with the number of distinct bigrams, trigrams, four-grams
        and five-grams, in that order.
    """
    orders = (2, 3, 4, 5)
    counters_by_order = {n: Counter() for n in orders}

    for tweet in tweet_list:
        words = tweet['text'].split()
        for n in orders:
            counters_by_order[n].update(nltk.ngrams(words, n))

    # debug... trigrams count slightly higher??
    write_counter_to_file([counters_by_order[n] for n in orders])

    # len() of a Counter is its number of distinct keys, i.e. distinct n-grams.
    return tuple(len(counters_by_order[n]) for n in orders)

Plot a token log frequency. Describe what this plot means and how to interpret it.
A useful way to do this is by plotting log-frequency against the log-rank
You could also plot Heaps' law - types versus tokens
You could also do a log frequency graph of the top n tokens
Describe how it might help you understand coverage when training a model.

In [None]:
def token_log_freq(corpus):
    """Plot token frequency against frequency rank on log-log axes.

    Only the 1000 most frequent tokens are plotted. The figure is drawn on the
    current pyplot figure; nothing is returned.

    Args:
        corpus: iterable of tokens (one entry per token occurrence).
    """
    frequencies = [count for _, count in Counter(corpus).most_common(1000)]
    # Ranks start at 1. With only the y values supplied, matplotlib numbers
    # the points from 0, and x = 0 has no position on a logarithmic axis, so
    # the most frequent token would not appear on the plot.
    ranks = list(range(1, len(frequencies) + 1))
    plt.loglog(ranks, frequencies)
    plt.xlabel('rank')
    plt.ylabel('frequency')

In [None]:
# Load the tab-separated tweet file (id, label, text...) and build:
#   tweet_list   - one dict per tweet with its cleaned text and counts
#   corpus       - every cleaned token of every tweet, in file order
#   total_tweets - number of tweets read
# Sets make the per-token membership tests constant-time instead of list scans.
stop_word_set = set(stop_words)
punctuation_set = set(punctuation)

tweet_list = []
corpus = []

# The encoding is stated explicitly so the result does not depend on the
# platform's default. NOTE(review): assumes the input file is UTF-8 — confirm.
with open(DATA_DIR + DATA, 'r', encoding='utf-8') as data:
    for line in data:
        info = line.strip().split('\t')
        if info == ['']:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue
        # Named tweet_id rather than `id` so the builtin id() is not shadowed.
        # Any tabs inside the tweet text are collapsed to single spaces.
        tweet_id, label, text = info[0], info[1], ' '.join(info[2:])

        tokens = [token.lower() for token in casual_tokenize(text)]
        tokens = [token for token in tokens
                  if token not in stop_word_set and token not in punctuation_set]

        corpus.extend(tokens)  # one giant list

        if tokens:
            word_count, char_count = tweet_counts(tokens)[:2]
        else:
            # Every token was filtered out; keep the tweet with zero counts
            # rather than asking for an average over nothing.
            word_count, char_count = 0, 0

        tweet_list.append({
            'id': tweet_id,
            'label': label,
            'text': ' '.join(tokens),
            'word_count': word_count,
            'char_count': char_count,
        })

# Computed once after the loop so it is defined even when the file is empty.
total_tweets = len(tweet_list)

In [None]:
# Character statistics for the corpus, computed from the per-tweet rows in
# tweet_list: total characters, mean characters per token, and the standard
# deviation value returned by token_counts.
total_chars, av_char_count, sd = token_counts(tweet_list)

In [None]:
# Report the corpus-level totals: number of tweets, number of cleaned tokens,
# and the character statistics returned by token_counts.
print("total tweets = ", total_tweets)
print("total tokens = ", len(corpus))
print("total number of characters = ", total_chars)
print("av chars per token = ", av_char_count)
print("sd for chars per token = ", sd)

In [None]:
# Vocabulary size, the ten most frequent tokens with their frequencies, and
# the number of token occurrences those ten account for.
vocab, most_common, tokens_common = totals(corpus)

In [None]:
# Report the vocabulary statistics returned by totals().
print("vocabulary size = ", vocab)
print("10 most common words = ", most_common)
print("token count of 10 most common words = ", tokens_common)

In [None]:
# The type/token ratio (TTR) of the dataset.
# It is obtained by dividing the number of distinct types (the vocabulary size)
# by the total number of tokens.
# The basic idea is that the higher the number, the more lexically diverse the text.
# The label names the ratio actually computed (types / tokens); an empty corpus
# reports 0.0 instead of dividing by zero.
print("type/token ratio = ", vocab / len(corpus) if corpus else 0.0)

In [None]:
# Number of distinct 2-, 3-, 4- and 5-grams over all tweets. ngram_counts also
# writes the underlying counters to CSV files as a side effect.
bigram_counts, trigram_counts, fourgram_counts, fivegram_counts = ngram_counts(tweet_list)

In [None]:
# Report the n-gram figures.
# NOTE(review): len(corpus) is the TOTAL number of tokens, whereas the four
# n-gram figures are DISTINCT counts (ngram_counts returns len() of each
# counter). Confirm whether the distinct unigram count (`vocab`) was intended
# for the first line.
print("unigram count = ", len(corpus))
print("bigram count = ", bigram_counts)
print("trigram count = ", trigram_counts)
print("four-gram count = ", fourgram_counts)
print("five-gram count = ", fivegram_counts)

In [None]:
# If word counts follow a power law, the log-log plot of frequency against
# rank should look approximately linear.
# According to Zipf's law, the frequency of any word is inversely proportional
# to its rank in the frequency table.
token_log_freq(corpus)

In [None]:
# Sanity check of the n-gram arithmetic, done by hand on the CHARACTER n-grams
# of a fixed string (nltk is not called in this cell).
# For a single sequence the total number of trigrams is always one less than
# the total number of bigrams, but the number of unique trigrams is likely to
# be greater, each with a lower frequency, unless a lot of the bigrams are
# wholly subsumed by the corresponding trigrams.

# Named `sample` rather than `input` so the builtin input() is not overwritten.
sample = "a ab abc abcd abcde abcdef abcdefg."

# Named char_* so this cell does not overwrite the `bigrams` helper imported
# from nltk.util at the top of the notebook.
char_bigrams = Counter()
char_trigrams = Counter()
bigrams_total = 0
trigrams_total = 0

for i in range(len(sample)):
    # A bigram needs one more character after position i.
    if i + 1 < len(sample):
        bigram = sample[i:i+2]
        print("bigram : [" + bigram + "]   length: ", len(bigram))
        char_bigrams[bigram] += 1
        bigrams_total += 1
    # A trigram needs two more characters after position i.
    if i + 2 < len(sample):
        trigram = sample[i:i+3]
        print("trigram: ["+ trigram + "]    length", len(trigram))
        char_trigrams[trigram] += 1
        trigrams_total += 1
print("\n")
print("length of input", len(sample), "\n")
print("bigrams length (unique bigrams): ", len(char_bigrams))
print("bigrams total: ", bigrams_total, "\n")
print("trigrams length (unique trigrams): ", len(char_trigrams))
print("trigrams total: ", trigrams_total, "\n")
print ("\nnum bigrams == len(input) -1")
print ("num trigrams == len(input) -2")