## Importing the required libraries

In [1]:
from __future__ import unicode_literals
import numpy as np # linear algebra
import pandas as pd # data processing
import nltk
from nltk import NaiveBayesClassifier
from nltk.metrics.scores import f_measure, precision, recall
import collections
import re
from itertools import islice
import random

# Cleaning Data Functions

In [2]:
def window(words_seq, n):
    """Returns a sliding window (of width n) over data from the iterable"""
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(words_seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result

In [3]:
def remove_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1\1', text)  # keep 2 repeat

In [4]:
def document_features(document, corpus_features):
    document_words = set(document)
    features = {}
    for word in corpus_features:
        features['has({})'.format(word)] = (word in document_words)
    return features

In [5]:
import pyarabic.araby as araby
def remove_diacritics(text):
  return araby.strip_diacritics(text)

In [6]:
def process_text(text, n=1,
                 remove_vowel_marks=False,
                 remove_repeated_chars=False,
                 ):
    clean_text = text
    if remove_vowel_marks:
        clean_text = remove_diacritics(clean_text)
    if remove_repeated_chars:
        clean_text = remove_repeating_char(clean_text)

    if n == 1:
        return clean_text.split()
    else:
        tokens = clean_text.split()
        grams = tokens
        for i in range(2, n + 1):
            grams += [  ' '.join(g) for g in list(window(tokens, i))  ]
        return grams

In [7]:
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.sub(emoj, '', data)

# Load Data

In [8]:
def load_tsv(data_file, n):
    data_features = list()
    data = list()
    infile = open(data_file, encoding='utf-8')
    for line in infile:
        if not line.strip():
            continue
        label, text = line.split('\t')
        text_features = process_text(text, n)
        if text_features:
            data_features += text_features
            data.append((text_features, label))
    return data, data_features

In [9]:
pos_train_file = '/content/train_pos.tsv'
neg_train_file = '/content/train_neg.tsv'
pos_test_file = '/content/test_pos.tsv'
neg_test_file = '/content/test_neg.tsv'

## N-Grams

In [10]:
n=1

In [11]:
pos_train_data, pos_train_feat = load_tsv(pos_train_file, n)
neg_train_data, neg_train_feat = load_tsv(neg_train_file, n)
pos_test_data, pos_test_feat = load_tsv(pos_test_file, n)
neg_test_data, neg_test_feat = load_tsv(neg_test_file, n)

In [12]:
#Merge Train Data
print('Train data info : ')
train_data = pos_train_data + neg_train_data
print('     Train data size', len(train_data))
print('     # of positive', len(pos_train_data))
print('     # of negative', len(neg_train_data))

Train data info : 
     Train data size 34154
     # of positive 14484
     # of negative 19670


# Sample Of Train Data

In [13]:
sample_size = 10
print('{} random tweets .... '.format(sample_size))
for s in random.sample(train_data, sample_size):
    print(s)

10 random tweets .... 
(['و', 'لكنك', 'تعلم', 'بأن', 'النظر', 'إليك', 'نجاتي', '.', '❤', '❤', '🎼'], 'pos')
(['فخامة', 'وملوك', '👌', '💙', '…'], 'pos')
(['يعني', 'قاعد', 'يسلك', 'لي', '🤐'], 'neg')
(['الله', 'يسعدك', 'يا', 'سيف', 'الاهلي', '"', 'يا', 'غلاك', 'بقلوبنا', 'شاركت', 'او', 'لم', 'تشارك', 'فرحتك', 'مع', 'كل', 'هدف', 'لاهلينا', 'تأكد', 'انك', 'لست', 'مجرد', 'منتمي', 'بل', 'انت', 'عاشق', 'وفي', '…'], 'pos')
(['يجي', 'يدلع', 'معانا', '😐'], 'neg')
(['المشكلة', 'اني', 'عاذرتهم', 'و', 'ما', 'اقدر', 'اقولهم', 'شي', 'كلبة', 'وحدة', '😔', '💙'], 'neg')
(['بنت', 'دلوعة', '😌', 'ما', 'تآكل', 'لحم', 'الجمل', 'سألوها', ':', 'ليش', '.', '.', 'ماتآكلين', 'لحم', 'الجمل', '؟', '؟', 'قالت', 'وعع', 'يمشــي', 'حافي', 'ياحياتي', 'انتي', '،', '😍', '😍', 'يعني', 'الدجاجة', '…'], 'pos')
(['تامر', 'حسني', '😍', '😩', '!', '!', '!'], 'neg')
(['قطر', 'وتركيا', 'والأخوان', 'المجرمين', 'مثل', 'النائحة', 'المستأجرة', 'صوتها', 'أعلى', 'من', 'المكلومه', '👍', '😎', '😁', '😁', '😁', '😁', '😁', '😁'], 'pos')
(['موفقة', 'يا'

# Test Data


In [14]:
# Merge Test Data
print('Test data info : ')
test_data = pos_test_data + neg_test_data
print('     Test data size : ', len(test_data))
print('     # of positive : ', len(pos_test_data))
print('     # of negative : ', len(neg_test_data))

Test data info : 
     Test data size :  8540
     # of positive :  3622
     # of negative :  4918


In [15]:
sample_size = 10
print('{} random tweets .... '.format(sample_size))
for s in random.sample(test_data, sample_size):
    print(s)

10 random tweets .... 
(['سفهاء', 'الدنيا', '😊', '👏', '🏻', 'لاحول', 'لله', 'ولاقوة', 'الا', 'يالله'], 'pos')
(['وانا', 'اخلص', 'المطبخ', 'كله', 'في', 'يوم', '😙'], 'pos')
(['🔴', '🔴', '🔴', '🔴', '🔴', '🔴', '🔴', 'العلاقة', 'بين', 'والتطعيمات', '!', '!', 'تعقيب', 'مهم', 'من', '🔽', '🔽', '🔽', '🔽', '🔽', '…'], 'neg')
(['🦗', '🐝', '🦗', '🐝', '🦗', 'قريبا', 'ً', 'في', 'فصل', 'الشتاء', 'يستلم', 'الذباب', 'صباحا', 'ويسلم', 'المغرب', '😎', 'ويستلم', 'البعوض', 'المغرب', 'ويسلم', 'صباحا', '😀', '😀', 'يعني', '…'], 'pos')
(['الحلو', 'بيأكل', 'الحلو', '❤', '️', '😋'], 'pos')
(['قللبي', 'انتي', 'والله', '😢', '😢', '😢', 'انا', 'بععد', 'اشتقت', 'لك', 'جددا', 'ً', ':('], 'neg')
(['ناقصين', 'فلسفه', 'حنا', '😔'], 'neg')
(['وانتت', 'ي', 'محمد', 'بعد', 'مغرورر', 'خل', 'نششوف', 'وجهكك', '🙁'], 'neg')
(['وهتسبيني', 'لوحدي', '؟', '!', '🤷'], 'neg')
(['والي', 'يتغلى', 'يربط', 'حزام', 'فرقاه', 'رحله', 'بلا', 'رجعه', 'و', 'فرصه', 'سعيده', '😉'], 'pos')


# Getting Features

## Merge All Features

In [16]:
all_features = pos_train_feat + neg_train_feat + \
               pos_test_feat + pos_test_feat
print('Len(all_features):', len(all_features))

Len(all_features): 631249


In [17]:
print('{} sample features ...'.format(sample_size))
print(random.sample(all_features, sample_size))

10 sample features ...
['️', 'َ', 'المبدعه', 'اليوم', '😂', 'الحق', 'النوم', 'دون', '💔', 'منها']


## Cleaning Features

In [18]:
for i in all_features:
  i=process_text(i)

In [19]:
all_features

['صبحج',
 'الله',
 'بالخير',
 'يا',
 '🇰',
 '🇼',
 '🇰',
 '🇼',
 '🇰',
 '🇼',
 'وصبحكم',
 'الله',
 'بالخير',
 'حبايبي',
 '😗',
 '😗',
 '❤',
 '🌹',
 'سبحان',
 'الله',
 '(',
 '3',
 'مرات',
 ')',
 'الحمدلله',
 '(',
 '3',
 'مرات',
 ')',
 'لا',
 'اله',
 'الا',
 'الله',
 '(',
 '3',
 'مرات',
 ')',
 'الله',
 'اكبر',
 '(',
 '3',
 'مرات',
 ')',
 'لا',
 'حول',
 'ولا',
 'قوة',
 'الا',
 'بالله',
 '(',
 '3',
 'مرات',
 '…',
 'افاا',
 'وانا',
 'اقدر',
 'ماارتوتلك',
 '😊',
 'النومه',
 'كلها',
 'راحت',
 'بأحلام',
 'غبية',
 '😀',
 '.',
 'علاقات',
 'سطحيه',
 '=',
 'حياة',
 'صحية',
 '😁',
 'أتمنى',
 'تطلقونه',
 'من',
 'منطقة',
 'خالية',
 'من',
 'السكان',
 'بمسافة',
 'لاتقل',
 'عن',
 'كلم',
 'من',
 'كل',
 'الإتجاهات',
 'من',
 'باب',
 'الإحتياط',
 'بس',
 '🙂',
 'فدااتم',
 '😗',
 '😗',
 '😗',
 '😗',
 'اذاالقطو',
 'الشيرازي',
 'معاه',
 'دفتر',
 'تطعيماته',
 'تروني',
 'شراي',
 '😂',
 '😎',
 'وقالت',
 'علا',
 ':',
 '"',
 'زي',
 'القمر',
 'ما',
 'شاء',
 'الله',
 '"',
 '.',
 '.',
 'مافي',
 'إعاقة',
 '.',
 '.',
 'الإعاقة',
 'إعاقة',


In [20]:
cleaned_features=[]
for feature in all_features:
  feature=remove_emojis(feature)
  cleaned_features.append(feature)

In [21]:
cleaned_features=list(filter(('').__ne__, cleaned_features))
cleaned_features=list(filter(('.').__ne__, cleaned_features))

In [22]:
cleaned_features

['صبحج',
 'الله',
 'بالخير',
 'يا',
 'وصبحكم',
 'الله',
 'بالخير',
 'حبايبي',
 'سبحان',
 'الله',
 '(',
 '3',
 'مرات',
 ')',
 'الحمدلله',
 '(',
 '3',
 'مرات',
 ')',
 'لا',
 'اله',
 'الا',
 'الله',
 '(',
 '3',
 'مرات',
 ')',
 'الله',
 'اكبر',
 '(',
 '3',
 'مرات',
 ')',
 'لا',
 'حول',
 'ولا',
 'قوة',
 'الا',
 'بالله',
 '(',
 '3',
 'مرات',
 '…',
 'افاا',
 'وانا',
 'اقدر',
 'ماارتوتلك',
 'النومه',
 'كلها',
 'راحت',
 'بأحلام',
 'غبية',
 'علاقات',
 'سطحيه',
 '=',
 'حياة',
 'صحية',
 'أتمنى',
 'تطلقونه',
 'من',
 'منطقة',
 'خالية',
 'من',
 'السكان',
 'بمسافة',
 'لاتقل',
 'عن',
 'كلم',
 'من',
 'كل',
 'الإتجاهات',
 'من',
 'باب',
 'الإحتياط',
 'بس',
 'فدااتم',
 'اذاالقطو',
 'الشيرازي',
 'معاه',
 'دفتر',
 'تطعيماته',
 'تروني',
 'شراي',
 'وقالت',
 'علا',
 ':',
 '"',
 'زي',
 'القمر',
 'ما',
 'شاء',
 'الله',
 '"',
 'مافي',
 'إعاقة',
 'الإعاقة',
 'إعاقة',
 'الفكر',
 'إعاقته',
 'عن',
 'التقدم',
 'والإنجاز',
 'والنجاح',
 'وح',
 'ُ',
 'ب',
 'ومساعدة',
 'الغير',
 'لا',
 'داعي',
 'لجسدك',
 'مهما',
 'كان',
 '

## Compute Frequencies


In [19]:
all_features_count = {}
for w in all_features:
    all_features_count[w] = all_features_count.get(w, 0) + 1

In [20]:
print('Sample frequencies')
print(random.sample(list(all_features_count.items()), 30))
word = 'في'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'فى'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'من'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))

Sample frequencies
[('سـواها', 1), ('ومرحبا', 2), ('بالج', 2), ('المرسم', 2), ('تناديني', 1), ('يأدون', 2), ('تتغذي', 2), ('غـفو', 1), ('👏', 468), ('مانويت', 3), ('كرييم', 1), ('تساوي', 8), ('هريرة', 4), ('اگول', 1), ('يضحي', 1), ('دحلان', 17), ('صبورة', 1), ('كيوتة', 1), ('بتوجهات', 1), ('باباه', 1), ('غيرران', 2), ('الامان', 13), ('الموجب', 1), ('يمرضها', 1), ('تفاهة', 3), ('علسك', 1), ('يامشلحط', 1), ('ويتكلم', 6), ('كريستين', 2), ('بالسماء', 4)]
freq of word في is 5168
freq of word فى is 306
freq of word من is 7836


## Compute Threshold

In [21]:
print('Size of training data:',  len(train_data))
min_df = int(0.001 * len(train_data))
max_df = int(0.98 * len(train_data))
print('min document frequency:', min_df)
print('max document frequency:', max_df)

Size of training data: 34154
min document frequency: 34
max document frequency: 33470


## Select Features

In [22]:
my_features = set([word for word, freq in all_features_count.items() if  max_df > freq > min_df ])
print(len(my_features), 'are kept out of', len(all_features))

1785 are kept out of 631249


In [23]:
print('{} sample of selected features:'.format(sample_size))
print(random.sample(list(my_features), sample_size))

10 sample of selected features:
['اجل', 'الايميل', 'عشق', '😭', 'فعلا', 'ح', 'ماهو', 'يبي', 'شيخ', 'يصير']


## Sklearn Feature Extractions

In [28]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
vec=CountVectorizer(binary=True)
vec.fit(all_features)

CountVectorizer(binary=True)

### Binary

In [29]:
vocabulary=sorted (vec.vocabulary_.keys())
[w for w in vocabulary]

['00',
 '04',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '1440',
 '15',
 '16',
 '160',
 '17',
 '179',
 '18',
 '181105',
 '181130',
 '19',
 '200',
 '2011',
 '2014',
 '2016',
 '2018',
 '2145',
 '22',
 '2233',
 '25',
 '27',
 '28',
 '30',
 '32',
 '33',
 '35',
 '36',
 '365',
 '38',
 '39',
 '40',
 '400',
 '44',
 '45',
 '47',
 '50',
 '500',
 '511',
 '52',
 '53',
 '554516',
 '59',
 '61',
 '63',
 '80',
 '95',
 '96',
 '_4_',
 '__',
 '__اتت',
 '__ترى',
 '_أ',
 '_ارطغر',
 '_اطلقوا',
 '_الحمدلله',
 '_الخيانه',
 '_الكذب',
 '_المعجب',
 '_الن',
 '_انا',
 '_اه',
 '_اهبط_الاتحاد٢٠',
 '_بـآگـر',
 '_جميل',
 '_دووق',
 '_س',
 '_سحلب',
 '_شاى',
 '_عبثا',
 '_علىﷺ_',
 '_غزاوي',
 '_فجأة',
 '_قراءة',
 '_قهوه',
 '_كيف',
 '_للنقاش',
 '_لون',
 '_نسكافيه',
 '_هما',
 '_و',
 '_ولا',
 'ءء',
 'آء',
 'آآخ',
 'آئ',
 'آااخ',
 'آاهاا',
 'آب',
 'آبآدر',
 'آبتسامه',
 'آبد',
 'آبدآع',
 'آبدا',
 'آبعـد',
 'آبـتـسـم',
 'آبـرهــن',
 'آبن',
 'آبهاا',
 'آبوكم',
 'آبي',
 'آبيك',
 'آت',
 'آتاك',
 'آتاهم',
 'آتخيل',
 'آتصـآله',
 'آتض

In [30]:
features=vec.transform(my_features)
print(len(my_features))

1785


In [31]:
pd.DataFrame(features.toarray(),columns=vocabulary)

Unnamed: 0,00,04,10,100,11,12,13,14,1440,15,...,ﻳﻠﺘﻔﺖ,ﻳﻤﻜﻨﻪ,ﻳﻨﺴﺤﺐ,ﻳﻮﺍﺟﻪ,ﻷلمك,ﻷنفسنا,ﻷنني,ﻷنها,ﻷﻱ,ﻹﺻﺤﺎﺏ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1783,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Counting

In [32]:
vec2=CountVectorizer(binary=False)
vec2.fit(all_features)
pd.DataFrame(vec2.transform(my_features).toarray(),columns=vocabulary)

Unnamed: 0,00,04,10,100,11,12,13,14,1440,15,...,ﻳﻠﺘﻔﺖ,ﻳﻤﻜﻨﻪ,ﻳﻨﺴﺤﺐ,ﻳﻮﺍﺟﻪ,ﻷلمك,ﻷنفسنا,ﻷنني,ﻷنها,ﻷﻱ,ﻹﺻﺤﺎﺏ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1781,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1783,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
features2=vec2.transform(my_features)

### TF- DIF

In [34]:
vec3=TfidfVectorizer()
vec3.fit(all_features)
pd.DataFrame(vec3.transform(my_features).toarray(),columns=vocabulary)

Unnamed: 0,00,04,10,100,11,12,13,14,1440,15,...,ﻳﻠﺘﻔﺖ,ﻳﻤﻜﻨﻪ,ﻳﻨﺴﺤﺐ,ﻳﻮﺍﺟﻪ,ﻷلمك,ﻷنفسنا,ﻷنني,ﻷنها,ﻷﻱ,ﻹﺻﺤﺎﺏ
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## generating features for training documents ....

In [24]:
feature_sets = [(document_features(d, my_features), c) for (d, c) in train_data]

In [25]:
# Training
classifier = nltk.NaiveBayesClassifier.train(feature_sets)
print('training is done...')

training is done...


## Most informative features

In [26]:
classifier.show_most_informative_features(10)

Most Informative Features
               has(شارك) = True              neg : pos    =    339.5 : 1.0
             has(للتطوع) = True              neg : pos    =    338.0 : 1.0
             has(تابعوا) = True              neg : pos    =    238.8 : 1.0
              has(التزم) = True              neg : pos    =    238.8 : 1.0
                  has(⁧) = True              neg : pos    =    213.3 : 1.0
                 has(:)) = True              pos : neg    =    201.8 : 1.0
                  has(💵) = True              neg : pos    =    148.2 : 1.0
                has(نشر) = True              neg : pos    =    146.8 : 1.0
           has(للتغريدة) = True              neg : pos    =    144.9 : 1.0
                  has(🔕) = True              neg : pos    =    131.6 : 1.0


## Testing

In [27]:
test_features = [(document_features(d, my_features), c) for (d, c) in test_data]

In [28]:
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_features):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)

# Conclusion

In [29]:
print('accuracy: ', nltk.classify.accuracy(classifier, test_features))
print('pos precision: ', precision(ref_sets['pos'], test_sets['pos']))
print('pos recall:', recall(ref_sets['pos'], test_sets['pos']))
print('neg precision: ', precision(ref_sets['neg'], test_sets['neg']))
print('neg recall:', recall(ref_sets['neg'], test_sets['neg']))
print('positive f-score:', f_measure(ref_sets['pos'], test_sets['pos']))
print('negative f-score:', f_measure(ref_sets['neg'], test_sets['neg']))

accuracy:  0.8953161592505855
pos precision:  0.8467208947635994
pos recall: 0.9196576477084484
neg precision:  0.9368215371254885
neg recall: 0.8773891825945507
positive f-score: 0.8816834303864478
negative f-score: 0.9061318773624528
