## Importing the required libraries

In [1]:
from __future__ import unicode_literals
import numpy as np # linear algebra
import pandas as pd # data processing
import nltk
from nltk import NaiveBayesClassifier
from nltk.metrics.scores import f_measure, precision, recall
import collections
import re
from itertools import islice
import random

# Cleaning Data Functions

In [2]:
def window(words_seq, n):
    """Returns a sliding window (of width n) over data from the iterable"""
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(words_seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

In [3]:
def remove_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1\1', text)  # keep 2 repeat

In [4]:
def document_features(document, corpus_features):
    document_words = set(document)
    features = {}
    for word in corpus_features:
        features['has({})'.format(word)] = (word in document_words)
    return features

In [5]:
import pyarabic.araby as araby
def remove_diacritics(text):
  return araby.strip_diacritics(text)

In [6]:
def process_text(text, n=1,
                 remove_vowel_marks=False,
                 remove_repeated_chars=False,
                 ):
    clean_text = text
    if remove_vowel_marks:
        clean_text = remove_diacritics(clean_text)
    if remove_repeated_chars:
        clean_text = remove_repeating_char(clean_text)

    if n == 1:
        return clean_text.split()
    else:
        tokens = clean_text.split()
        grams = tokens
        for i in range(2, n + 1):
            grams += [  ' '.join(g) for g in list(window(tokens, i))  ]
        return grams

In [7]:
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

# Load Data

In [8]:
def load_tsv(data_file, n):
    data_features = list()
    data = list()
    infile = open(data_file, encoding='utf-8')
    for line in infile:
        if not line.strip():
            continue
        label, text = line.split('\t')
        text_features = process_text(text, n)
        if text_features:
            data_features += text_features
            data.append((text_features, label))
    return data, data_features

In [9]:
pos_train_file = '/content/train_pos.tsv'
neg_train_file = '/content/train_neg.tsv'
pos_test_file = '/content/test_pos.tsv'
neg_test_file = '/content/test_neg.tsv'

## N-Grams

In [10]:
n=1

In [11]:
pos_train_data, pos_train_feat = load_tsv(pos_train_file, n)
neg_train_data, neg_train_feat = load_tsv(neg_train_file, n)
pos_test_data, pos_test_feat = load_tsv(pos_test_file, n)
neg_test_data, neg_test_feat = load_tsv(neg_test_file, n)

In [12]:
#Merge Train Data
print('Train data info : ')
train_data = pos_train_data + neg_train_data
print('     Train data size', len(train_data))
print('     # of positive', len(pos_train_data))
print('     # of negative', len(neg_train_data))

Train data info : 
     Train data size 34154
     # of positive 14484
     # of negative 19670


# Sample Of Train Data

In [13]:
sample_size = 10
print('{} random tweets .... '.format(sample_size))
for s in random.sample(train_data, sample_size):
    print(s)

10 random tweets .... 
(['ÿµÿ±ÿ™', 'ÿßŸÖŸäŸÑ', 'ŸÑŸÑÿµŸÖÿ™', 'ŸàÿßŸÑÿßÿ≥ÿ™ŸÖÿßÿπ', 'ŸÉÿ´Ÿäÿ±ÿß', 'Ÿã', 'ŸÑŸÖ', 'ÿßÿπÿØ', 'ÿßÿ≠ÿ±ÿµ', 'ÿπ', 'ÿßŸÑÿßŸÖÿ≥ÿßŸÉ', 'ÿ®ÿ≤ŸÖÿßŸÖ', 'ÿßŸÑŸÖÿ¨ŸÑÿ≥', 'ŸàÿßŸÑÿ≠ÿØŸäÿ´', '!', '!', 'ŸäŸÖŸÉŸÜ', 'ÿ™ŸÉŸàŸÜ', 'ÿπÿ≤ŸÑÿ©', 'ÿØÿßÿÆŸÑŸäÿ©', 'ÿßŸà', 'ŸÖŸÑŸÑ', 'ÿßŸà', 'ÿ™ÿ¥ÿ®ÿπ', '.', '.', 'ÿßŸÑÿß', 'ÿπ', '‚Ä¶'], 'pos')
(['ÿ™ŸàŸÇÿπÿßÿ™ŸÉŸÖ', 'Ÿàÿ¥', 'ÿ®ŸäŸÉŸàŸÜ', 'ŸÑŸàŸÜ', 'ÿßŸÑÿ∑ŸÇŸÖ', 'ÿü', 'ÿßÿµŸÅÿ±', 'ŸÉÿßŸÖŸÑ', 'ŸàŸÑÿß', 'ÿ±ŸÖÿßÿØŸä', 'ŸàŸÑÿß', 'ÿßÿ≥ŸàÿØ', 'ŸàŸÑÿß', 'ŸÑŸàŸÜ', 'ÿ¨ÿØŸäÿØ', 'ü§î'], 'neg')
(['¬´', 'üò¥', 'üíî', '¬ª', '-', 'ŸÑÿß', 'ÿ™Ÿáÿ™ŸÖŸÄŸàÿß', '⁄™ÿ´ŸäŸÄŸÄÿ±ÿß', 'ŸàŸÑÿß', 'ÿ™ÿπÿßÿ™ÿ®ŸÄŸÄŸàÿß', 'ÿ£ÿ≠ŸÄŸÄÿØÿß', 'ŸàŸÑÿß', 'ÿ™ÿ±⁄™ÿ∂ŸÄŸàÿß', 'ÿÆŸÑŸÄŸÅ', 'ÿ£ÿ≠ŸÄÿØ', 'ŸÑÿ™ÿ∞⁄™ŸÄÿ±ŸàŸáŸÄŸÄŸÖ', 'ÿ®Ÿàÿ¨ŸàÿØ⁄™ŸÄŸÖ', 'ŸàÿßÿπŸÑŸÖŸÄŸÄŸàÿß', 'ÿ¨ŸäŸÄŸÄÿØÿß', 'ÿ£ŸÜ', 'ŸÖŸÄŸÄŸÜ', 'Ÿäÿ≠ŸÄŸÄÿ®', 'ŸÑÿß', '‚Ä¶'], 'neg')
(['ÿ¨ŸàŸÜ', 'üòó', 'üòó', 'üòó', 'üòó'], 'pos')
(['ŸÖŸàÿ∂Ÿàÿπ', 'ÿßŸÜŸä', 'ÿßÿ≠ÿßŸÅÿ∏', 'ÿπ', 'ÿßŸÜŸàÿ´ÿ™Ÿä', 'ŸàÿßŸÜÿß', 'ÿßŸÜÿ≥ŸàŸáÿß', 

# Test Data


In [14]:
# Merge Test Data
print('Test data info : ')
test_data = pos_test_data + neg_test_data
print('     Test data size : ', len(test_data))
print('     # of positive : ', len(pos_test_data))
print('     # of negative : ', len(neg_test_data))

Test data info : 
     Test data size :  8540
     # of positive :  3622
     # of negative :  4918


In [15]:
sample_size = 10
print('{} random tweets .... '.format(sample_size))
for s in random.sample(test_data, sample_size):
    print(s)

10 random tweets .... 
(['ŸÖÿßŸáŸàŸÖÿ¥', 'ŸÖÿ™ÿßŸÉÿØŸäŸÜ', 'ŸàÿπŸÑŸâ', 'ÿßŸÑÿßÿ∫ŸÑÿ®', 'ŸÇÿßŸÑ', 'ŸÖŸàÿ¥', 'ŸÑŸäŸÜÿß', 'ŸäÿπŸÜŸä', 'ÿ¨ÿßÿ™', 'ÿ®ÿßŸÑÿ∫ŸÑÿ∑', '.', '.', '.', 'ÿ™ŸÅÿ≥Ÿäÿ±', 'ÿßŸÑŸä', 'ŸÖÿß', 'ÿπŸÜÿØŸà', 'ŸÖÿß', 'ŸäŸÇŸàŸÑ', 'üòë'], 'neg')
(['ŸÖÿ≠ÿØÿ¥', 'ŸÉŸÑŸÖŸÜŸâ', 'ÿ¥ŸÉÿ±ÿß', 'ü§î'], 'neg')
(['ÿßÿ™ŸÖŸÜŸä', 'ÿßŸÑŸÜÿßÿ≥', 'ÿßŸÑŸä', 'Ÿàÿ≠ÿ¥ÿßŸÜŸä', 'ŸàŸÖÿ®ÿ¥ŸàŸÅŸáŸÖÿ¥', 'ÿßÿ¥ŸàŸÅŸáŸÖ', 'ÿßŸÑŸÜŸáÿßÿ±ÿØŸá', 'ÿπ', 'ÿßŸÑŸàÿßÿØŸâ', 'Ÿäÿßÿ±Ÿäÿ™', 'ŸäÿßÿπŸÜŸä', 'ü§∑'], 'neg')
(['ÿßŸÑÿ®ÿÆŸÑ', '.', '.', 'ŸÖÿ´ŸÑ', 'ÿßŸÑÿ≥ŸÑÿßÿ≠', 'ÿßŸÑŸÑŸä', 'ÿ®ŸÑÿß', 'ÿ∑ŸÑŸÇŸá', 'ŸÖÿß', 'ŸÅÿßÿØ', 'ÿ±ÿßÿπŸäŸá', '.', '.', 'ŸÑŸà', 'ÿ≤ŸäŸÜ', 'Ÿë', 'Ÿá', ':)', ')', 'ŸÅŸä', 'ŸÑÿ®ÿ≥Ÿá', 'ÿßÿ≠ÿØ', 'Ÿç', 'ÿßŸÑŸäÿß', 'ŸÖŸÜ', 'ÿπÿ∑ÿßŸá', 'ÿå', 'ÿßŸÑŸÑŸá', 'ÿπÿ∑ÿß', 'ÿÆŸÑŸÇŸá', 'Ÿà', 'ÿßÿ≠ÿØ', 'Ÿç', '‚Ä¶'], 'pos')
(['ÿ≠ÿßÿ≥ÿ©', 'ÿßŸÜŸáÿß', 'Ÿáÿ™ÿ®ŸÇŸâ', 'ÿ≠ŸÑŸàÿ©', 'ÿ£ŸàŸâ', 'üòç', 'üòç', 'üòç', 'üòç'], 'pos')
(['ÿ£ÿ≠ÿ®', 'ÿ£ÿ®ÿ¥ÿ±ŸÉ', 'ŸàŸÑÿ≥Ÿá', 'ü§¶', 'üèº', '\u200d', '‚ôÄ', 'Ô∏è', 'üíî'], 'neg')
([

# Getting Features

## Merge All Features

In [16]:
all_features = pos_train_feat + neg_train_feat + \
               pos_test_feat + pos_test_feat
print('Len(all_features):', len(all_features))

Len(all_features): 631249


In [17]:
print('{} sample features ...'.format(sample_size))
print(random.sample(all_features, sample_size))

10 sample features ...
['ŸÑ', '.', 'üîê', 'ÿ®ÿ≥', 'üòç', 'ÿ™ŸÜÿ¥ÿ±Ÿä', 'ÿ£ÿ¨ŸÖŸÑ', 'ŸäŸàŸÖ', 'ÿå', 'ÿßŸÑÿ≥ÿπŸàÿØŸäŸäŸÜ']


## Cleaning Features

In [18]:
for i in all_features:
  i=process_text(i)

In [19]:
all_features

['ÿµÿ®ÿ≠ÿ¨',
 'ÿßŸÑŸÑŸá',
 'ÿ®ÿßŸÑÿÆŸäÿ±',
 'Ÿäÿß',
 'üá∞',
 'üáº',
 'üá∞',
 'üáº',
 'üá∞',
 'üáº',
 'Ÿàÿµÿ®ÿ≠ŸÉŸÖ',
 'ÿßŸÑŸÑŸá',
 'ÿ®ÿßŸÑÿÆŸäÿ±',
 'ÿ≠ÿ®ÿßŸäÿ®Ÿä',
 'üòó',
 'üòó',
 '‚ù§',
 'üåπ',
 'ÿ≥ÿ®ÿ≠ÿßŸÜ',
 'ÿßŸÑŸÑŸá',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 ')',
 'ÿßŸÑÿ≠ŸÖÿØŸÑŸÑŸá',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 ')',
 'ŸÑÿß',
 'ÿßŸÑŸá',
 'ÿßŸÑÿß',
 'ÿßŸÑŸÑŸá',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 ')',
 'ÿßŸÑŸÑŸá',
 'ÿßŸÉÿ®ÿ±',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 ')',
 'ŸÑÿß',
 'ÿ≠ŸàŸÑ',
 'ŸàŸÑÿß',
 'ŸÇŸàÿ©',
 'ÿßŸÑÿß',
 'ÿ®ÿßŸÑŸÑŸá',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 '‚Ä¶',
 'ÿßŸÅÿßÿß',
 'ŸàÿßŸÜÿß',
 'ÿßŸÇÿØÿ±',
 'ŸÖÿßÿßÿ±ÿ™Ÿàÿ™ŸÑŸÉ',
 'üòä',
 'ÿßŸÑŸÜŸàŸÖŸá',
 'ŸÉŸÑŸáÿß',
 'ÿ±ÿßÿ≠ÿ™',
 'ÿ®ÿ£ÿ≠ŸÑÿßŸÖ',
 'ÿ∫ÿ®Ÿäÿ©',
 'üòÄ',
 '.',
 'ÿπŸÑÿßŸÇÿßÿ™',
 'ÿ≥ÿ∑ÿ≠ŸäŸá',
 '=',
 'ÿ≠Ÿäÿßÿ©',
 'ÿµÿ≠Ÿäÿ©',
 'üòÅ',
 'ÿ£ÿ™ŸÖŸÜŸâ',
 'ÿ™ÿ∑ŸÑŸÇŸàŸÜŸá',
 'ŸÖŸÜ',
 'ŸÖŸÜÿ∑ŸÇÿ©',
 'ÿÆÿßŸÑŸäÿ©',
 'ŸÖŸÜ',
 'ÿßŸÑÿ≥ŸÉÿßŸÜ',
 'ÿ®ŸÖÿ≥ÿßŸÅÿ©',
 'ŸÑÿßÿ™ŸÇŸÑ',
 'ÿπŸÜ',
 'ŸÉŸÑŸÖ',
 'ŸÖŸÜ',
 'ŸÉŸÑ',
 'ÿßŸÑÿ•ÿ™ÿ¨ÿßŸáÿßÿ™',
 'ŸÖŸÜ',
 'ÿ®ÿßÿ®',
 'ÿßŸÑÿ

In [20]:
cleaned_features=[]
for feature in all_features:
  feature=remove_emojis(feature)
  cleaned_features.append(feature)

In [21]:
cleaned_features=list(filter(('').__ne__, cleaned_features))
cleaned_features=list(filter(('.').__ne__, cleaned_features))

In [22]:
cleaned_features

['ÿµÿ®ÿ≠ÿ¨',
 'ÿßŸÑŸÑŸá',
 'ÿ®ÿßŸÑÿÆŸäÿ±',
 'Ÿäÿß',
 'Ÿàÿµÿ®ÿ≠ŸÉŸÖ',
 'ÿßŸÑŸÑŸá',
 'ÿ®ÿßŸÑÿÆŸäÿ±',
 'ÿ≠ÿ®ÿßŸäÿ®Ÿä',
 'ÿ≥ÿ®ÿ≠ÿßŸÜ',
 'ÿßŸÑŸÑŸá',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 ')',
 'ÿßŸÑÿ≠ŸÖÿØŸÑŸÑŸá',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 ')',
 'ŸÑÿß',
 'ÿßŸÑŸá',
 'ÿßŸÑÿß',
 'ÿßŸÑŸÑŸá',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 ')',
 'ÿßŸÑŸÑŸá',
 'ÿßŸÉÿ®ÿ±',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 ')',
 'ŸÑÿß',
 'ÿ≠ŸàŸÑ',
 'ŸàŸÑÿß',
 'ŸÇŸàÿ©',
 'ÿßŸÑÿß',
 'ÿ®ÿßŸÑŸÑŸá',
 '(',
 '3',
 'ŸÖÿ±ÿßÿ™',
 '‚Ä¶',
 'ÿßŸÅÿßÿß',
 'ŸàÿßŸÜÿß',
 'ÿßŸÇÿØÿ±',
 'ŸÖÿßÿßÿ±ÿ™Ÿàÿ™ŸÑŸÉ',
 'ÿßŸÑŸÜŸàŸÖŸá',
 'ŸÉŸÑŸáÿß',
 'ÿ±ÿßÿ≠ÿ™',
 'ÿ®ÿ£ÿ≠ŸÑÿßŸÖ',
 'ÿ∫ÿ®Ÿäÿ©',
 'ÿπŸÑÿßŸÇÿßÿ™',
 'ÿ≥ÿ∑ÿ≠ŸäŸá',
 '=',
 'ÿ≠Ÿäÿßÿ©',
 'ÿµÿ≠Ÿäÿ©',
 'ÿ£ÿ™ŸÖŸÜŸâ',
 'ÿ™ÿ∑ŸÑŸÇŸàŸÜŸá',
 'ŸÖŸÜ',
 'ŸÖŸÜÿ∑ŸÇÿ©',
 'ÿÆÿßŸÑŸäÿ©',
 'ŸÖŸÜ',
 'ÿßŸÑÿ≥ŸÉÿßŸÜ',
 'ÿ®ŸÖÿ≥ÿßŸÅÿ©',
 'ŸÑÿßÿ™ŸÇŸÑ',
 'ÿπŸÜ',
 'ŸÉŸÑŸÖ',
 'ŸÖŸÜ',
 'ŸÉŸÑ',
 'ÿßŸÑÿ•ÿ™ÿ¨ÿßŸáÿßÿ™',
 'ŸÖŸÜ',
 'ÿ®ÿßÿ®',
 'ÿßŸÑÿ•ÿ≠ÿ™Ÿäÿßÿ∑',
 'ÿ®ÿ≥',
 'ŸÅÿØÿßÿßÿ™ŸÖ',
 'ÿßÿ∞ÿßÿßŸÑŸÇÿ∑Ÿà',
 'ÿßŸÑÿ¥Ÿäÿ±ÿßÿ≤Ÿä',
 'ŸÖÿπÿßŸá',
 'ÿØŸÅÿ™ÿ±',
 'ÿ™ÿ∑ÿπŸäŸÖÿß

## Compute Frequencies


In [23]:
all_features_count = {}
for w in all_features:
    all_features_count[w] = all_features_count.get(w, 0) + 1

In [24]:
print('Sample frequencies')
print(random.sample(list(all_features_count.items()), 30))
word = 'ŸÅŸä'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'ŸÅŸâ'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'ŸÖŸÜ'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))

Sample frequencies
[('ŸàŸÉÿßŸÑÿ≥ÿßÿπÿ©', 1), ('‚àö', 6), ('ÿßÿ®ÿ≠ÿ´ŸÄŸÄ', 3), ('ÿØŸÖÿ±ÿ™', 2), ('ÿßŸÖŸÑŸÜÿß', 2), ('ÿßŸÑÿ≥Ÿáÿ±ÿ©', 1), ('üéº', 59), ('ÿßŸÑÿ™ÿ∫Ÿäÿ±ÿ±', 2), ('ŸÉÿ®ÿßÿ±Ÿàÿµÿ∫ÿßÿ±ÿ≠ÿ™Ÿâ', 2), ('ÿ£ÿ¥ÿßÿØ', 2), ('ŸÖŸÜÿµ', 1), ('Ÿäÿ≠ŸäŸäŸáÿß', 1), ('ÿßŸÑŸÖÿ±', 4), ('ÿ≥ÿ≠ÿ®Ÿà', 1), ('ÿ®ÿØŸÅÿπŸáÿß', 1), ('ÿßŸÑŸÜÿ®ŸàŸäÿ©', 1), ('ÿ®ÿ™ŸÅŸÇÿπ', 1), ('üîó', 3), ('ŸÑÿ£ÿ™ÿ®ÿßÿπ', 1), ('ÿßŸÑÿ®ŸÑÿßÿ∫ÿßÿ™', 1), ('ÿ®ÿß', 38), ('Ÿàÿ±ÿ≠ŸÑÿ©', 1), ('ÿßŸÖŸàÿ™ÿ™', 1), ('ŸÖÿ®ŸÜÿß', 2), ('ÿπŸÑ€å', 3), ('ŸÉÿ∞ÿßÿ®ŸäŸÜ', 4), ('ŸÖÿÆŸÉ', 3), ('ÿßÿ≠ŸÖÿØ', 22), ('ÿßÿ∞ÿß', 606), ('ÿØÿ±ÿßŸàŸäÿ¥Ÿá', 6)]
freq of word ŸÅŸä is 5168
freq of word ŸÅŸâ is 306
freq of word ŸÖŸÜ is 7836


## Compute Threshold

In [25]:
print('Size of training data:',  len(train_data))
min_df = int(0.001 * len(train_data))
max_df = int(0.98 * len(train_data))
print('min document frequency:', min_df)
print('max document frequency:', max_df)

Size of training data: 34154
min document frequency: 34
max document frequency: 33470


## Select Features

In [26]:
my_features = set([word for word, freq in all_features_count.items() if  max_df > freq > min_df ])
print(len(my_features), 'are kept out of', len(all_features))

1785 are kept out of 631249


In [27]:
print('{} sample of selected features:'.format(sample_size))
print(random.sample(list(my_features), sample_size))

10 sample of selected features:
['ÿßŸÑÿ®Ÿäÿ™', 'ÿπŸÑ', 'üí´', 'ÿßŸÑŸÉŸÑ', 'ÿ¢ŸÑÿ®ŸÄÿπÿØ', 'Ÿê', 'Ôªò', 'ŸäÿßŸÇŸÑÿ®Ÿä', 'ÿßŸÑÿ∫ÿ∂ÿ®', 'ÿµÿßÿ±']


## generating features for training documents ....

In [28]:
feature_sets = [(document_features(d, my_features), c) for (d, c) in train_data]

In [29]:
# Training
classifier = nltk.NaiveBayesClassifier.train(feature_sets)
print('training is done...')

training is done...


## Most informative features

In [30]:
classifier.show_most_informative_features(10)

Most Informative Features
               has(ÿ¥ÿßÿ±ŸÉ) = True              neg : pos    =    339.5 : 1.0
             has(ŸÑŸÑÿ™ÿ∑Ÿàÿπ) = True              neg : pos    =    338.0 : 1.0
              has(ÿßŸÑÿ™ÿ≤ŸÖ) = True              neg : pos    =    238.8 : 1.0
             has(ÿ™ÿßÿ®ÿπŸàÿß) = True              neg : pos    =    238.8 : 1.0
                  has(‚Åß) = True              neg : pos    =    213.3 : 1.0
                 has(:)) = True              pos : neg    =    201.8 : 1.0
                  has(üíµ) = True              neg : pos    =    148.2 : 1.0
                has(ŸÜÿ¥ÿ±) = True              neg : pos    =    146.8 : 1.0
           has(ŸÑŸÑÿ™ÿ∫ÿ±ŸäÿØÿ©) = True              neg : pos    =    144.9 : 1.0
                  has(üîï) = True              neg : pos    =    131.6 : 1.0


## Testing

In [31]:
test_features = [(document_features(d, my_features), c) for (d, c) in test_data]

In [32]:
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_features):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)

# Conclusion

In [33]:
print('accuracy: ', nltk.classify.accuracy(classifier, test_features))
print('pos precision: ', precision(ref_sets['pos'], test_sets['pos']))
print('pos recall:', recall(ref_sets['pos'], test_sets['pos']))
print('neg precision: ', precision(ref_sets['neg'], test_sets['neg']))
print('neg recall:', recall(ref_sets['neg'], test_sets['neg']))
print('positive f-score:', f_measure(ref_sets['pos'], test_sets['pos']))
print('negative f-score:', f_measure(ref_sets['neg'], test_sets['neg']))

accuracy:  0.8953161592505855
pos precision:  0.8467208947635994
pos recall: 0.9196576477084484
neg precision:  0.9368215371254885
neg recall: 0.8773891825945507
positive f-score: 0.8816834303864478
negative f-score: 0.9061318773624528
