<a href="https://colab.research.google.com/github/MohamadElnomrossie/Omdena-seniment-analysis/blob/rest_ML_models(Naive_Bayes)/arabic_sentiment_analysis_in_tweets_nb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Arabic Sentiment Analysis in tweets using Naive Bayes Machine learning Algorithm 


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import nltk
from nltk import NaiveBayesClassifier
from nltk.metrics.scores import f_measure, precision, recall
import collections


# Input data files are available in the "input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
path= "/content/input/"
for filename in os.listdir(path):
    print(filename)

# Any results you write to the current directory are saved as output.

test_Arabic_tweets_negative_20190413.tsv
train_Arabic_tweets_negative_20190413.tsv
test_Arabic_tweets_positive_20190413.tsv
train_Arabic_tweets_positive_20190413.tsv


# define functions 

In [3]:
import re
from itertools import islice

def load_tsv(data_file, n):
    data_features = list()
    data = list()
    infile = open(data_file, encoding='utf-8')
    for line in infile:
        if not line.strip():
            continue
        label, text = line.split('\t')
        text_features = process_text(text, n)
        if text_features:
            data_features += text_features
            data.append((text_features, label))
    return data, data_features

def process_text(text, n=1,
                 remove_vowel_marks=False,
                 remove_repeated_chars=False,
                 ):
    clean_text = text
    if remove_vowel_marks:
        clean_text = remove_diacritics(clean_text)
    if remove_repeated_chars:
        clean_text = remove_repeating_char(clean_text)

    if n == 1:
        return clean_text.split()
    else:
        tokens = clean_text.split()
        grams = tokens
        for i in range(2, n + 1):
            grams += [  ' '.join(g) for g in list(window(tokens, i))  ]
        return grams



def window(words_seq, n):
    """Returns a sliding window (of width n) over data from the iterable"""
    "   s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ...                   "
    it = iter(words_seq)
    result = tuple(islice(it, n))
    if len(result) == n:
        yield result
    for elem in it:
        result = result[1:] + (elem,)
        yield result


def remove_repeating_char(text):
    # return re.sub(r'(.)\1+', r'\1', text)     # keep only 1 repeat
    return re.sub(r'(.)\1+', r'\1\1', text)  # keep 2 repeat

def document_features(document, corpus_features):
    document_words = set(document)
    features = {}
    for word in corpus_features:
        features['has({})'.format(word)] = (word in document_words)
    return features

# Load corpus

In [4]:
pos_train_file = 'input/train_Arabic_tweets_positive_20190413.tsv'
neg_train_file = 'input/train_Arabic_tweets_negative_20190413.tsv'

pos_test_file = 'input/test_Arabic_tweets_positive_20190413.tsv'
neg_test_file = 'input/test_Arabic_tweets_negative_20190413.tsv'
print('data files')
print('train file (pos)', pos_train_file)
print('train file (neg)', neg_train_file)
print('test file (pos)', pos_test_file)
print('test file (neg)', neg_test_file)

data files
train file (pos) input/train_Arabic_tweets_positive_20190413.tsv
train file (neg) input/train_Arabic_tweets_negative_20190413.tsv
test file (pos) input/test_Arabic_tweets_positive_20190413.tsv
test file (neg) input/test_Arabic_tweets_negative_20190413.tsv


# Parameters (ngrams)

In [5]:
print('parameters')
n = 1
print('n grams:', n)

parameters
n grams: 1


# loading train data .... 

In [6]:
print('loading train data ....')
pos_train_data, pos_train_feat = load_tsv(pos_train_file, n)
neg_train_data, neg_train_feat = load_tsv(neg_train_file, n)
print('loading test data ....')
pos_test_data, pos_test_feat = load_tsv(pos_test_file, n)
neg_test_data, neg_test_feat = load_tsv(neg_test_file, n)

loading train data ....
loading test data ....


# Training data information

In [7]:
print('train data info')
train_data = pos_train_data + neg_train_data
print('train data size', len(train_data))
print('# of positive', len(pos_train_data))
print('# of negative', len(neg_train_data))

train data info
train data size 47000
# of positive 23879
# of negative 23121


# Sample training data 

In [8]:
import random
sample_size = 100
print('{} random tweets .... '.format(sample_size))
for s in random.sample(train_data, sample_size):
    print(s)

100 random tweets .... 
(['صباح', 'الخير', '..🌻', 'ينام', 'العبد', 'على', 'أمر', 'قد', 'يئس', 'منه', 'و', 'يستيقظ', 'على', 'انفراجه', '،', 'دع', 'الخالق', 'يجبر', 'كسر', 'قلبك', 'و', 'اجعل', 'همومك', 'وراء', 'ظه…'], 'pos')
(['الحمد', 'لله', 'على', 'الفوز', 'و', 'الصدارة', 'المؤقته', 'بتكاتف', 'الجميع', 'و', 'عدم', 'التفريط', 'في', 'اي', 'نقطه', 'بإذن', 'الله', 'نحقق', 'المطلوب', '💙'], 'pos')
(['بدون', 'أخيك', 'ستأكلك', 'الضباع', 'حتى', 'لو', 'كنت', 'أسدا', '..', 'فلا', 'تفرط', 'ذات', 'يوم', 'بأخيك', '💔', '✨'], 'neg')
(['سبحان', 'الله', '💙'], 'pos')
(['عادي', 'شاعرها', 'أو', 'شاعرك', '🤔', '#Malak'], 'neg')
(['كل', 'المتابعين', 'نايمين', 'لا', 'احساس', 'ولا', 'ضمير', 'ضايفه', 'محلات', 'أنا', '😜', 'ما', 'تفتح', 'الا', 'عقب', 'العشر', '😳'], 'neg')
(['منتظرتك', 'تنامين', 'نامي', '😡'], 'neg')
(['مالتي', 'ماكا👉', 'منشط', 'طبيعي🐝', '💯💯', 'وخالى', 'من', 'اي', 'كيماويات', 'وبديل', 'للفياجرا', '👋', 'يعالج', 'سرعة', 'القذف', 'المبكر🌿', 'يحسن', 'من…'], 'pos')
(['الآرمي', 'مبدعين', '😰', 'ناويين', 'ي

# Test data info

In [9]:
print('test data info')
test_data = pos_test_data + neg_test_data
print('test data size', len(train_data))
print('# of positive', len(pos_test_data))
print('# of negative', len(neg_test_data))

test data info
test data size 47000
# of positive 5970
# of negative 5781


# merging all features ...

In [10]:
print('merging all features ... ')
all_features = pos_train_feat + neg_train_feat + \
               pos_test_feat + pos_test_feat
print('len(all_features):', len(all_features))

merging all features ... 
len(all_features): 770508


# Sample features 

In [11]:
print('{} sample features ...'.format(sample_size))
print(random.sample(all_features, sample_size))

100 sample features ...
['أنفيلد', 'وطن', '💙🙏', '🌹', 'هه', 'النقاء', 'من', 'رتويت', 'لله💙', 'ما', 'عزله', 'حتى', 'آلهية', 'نبينا', '/', 'أقابل', 'لا…', 'آلقلب', 'ابيي', 'أصدق', 'فيارب', 'علاوي', 'القارية', 'صرف', 'صباح', 'أي', 'للناس', 'ق…', 'علينا', 'يبھجون', '،', 'الله', 'العذر', 'يطقطقون', 'دي', 'هذه', 'سنن', 'ندق', 'الساعة', 'ملگية♛', 'خفيف', 'حتى', 'وزيره', 'شاهد', 'شخصا', '😷', 'تقول', '#نبض_الامل_للدعم', 'وخير', 'من', 'صباح', 'بالاضافه', 'المفرووض', 'لم', 'حيا', '↓❁🌸', '💕', 'أحلامي', 'اللي', 'نجمتي', '😂', 'تثبت', 'هذا', 'لكل', 'في', 'بعدد', 'إن', 'مش', 'قلبك', 'حول', 'و', 'مابه', 'الجراك', 'عنها', 'يا', 'اشفي', 'يوفقك', '|', 'الأخيره', 'الوضع', 'من', 'لزقة…', 'أفضل', 'كل', 'ألف', '..', 'بعدين', 'تسمع', 'معقوله', 'نفس', 'يسالونك', 'حساب', 'جماهير', 'عين', 'لكني', '"تلاتين', 'دهشة', 'اشلع', 'يا', 'خي']


# compute frequencies

In [12]:
all_features_count = {}
for w in all_features:
    all_features_count[w] = all_features_count.get(w, 0) + 1

# Sample Frequency

In [13]:
print('sample frequencies')
print(random.sample(list(all_features_count.items()), 30))
word = 'في'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'فى'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))
word = 'من'
print('freq of word {} is {}'.format(word, all_features_count.get(word, 0)))

sample frequencies
[('المماراة', 2), ('انشيلوتي', 1), ('..💙💭', 1), ('للإتي', 2), ('يداريني', 1), ('إستجابه', 5), ('ده؟😁💔🚶', 1), ('حرووف', 2), ('لوتسوي', 1), ('والنسيم', 1), ('ورديتها', 1), ('اليوسى', 1), ('وعندما', 30), ('نجاة', 1), ('القصة؟', 1), ('k+', 1), ('يغلبني', 4), ('للمبتسمين', 7), ('ديال', 2), ('اللامبالاه', 2), ('سعه،', 1), ('ابيج', 2), ('متانة..', 1), ('DADDY', 1), ('حسافة', 7), ('ياضحكھ', 1), ('ييما', 1), ('ورضا..', 1), ('عمكم', 2), ('(النسخة', 1)]
freq of word في is 9550
freq of word فى is 220
freq of word من is 12655


# Compute Threshold

In [14]:
print('size of training data:',  len(train_data))
min_df = int(0.001 * len(train_data))
max_df = int(0.98 * len(train_data))
print('min document frequency:', min_df)
print('max document frequency:', max_df)

size of training data: 47000
min document frequency: 47
max document frequency: 46060


# Selecting Features 

In [15]:
# remove features that have frequency below/above the threshold
my_features = set([word for word, freq in all_features_count.items() if  max_df > freq > min_df ])
print(len(my_features), 'are kept out of', len(all_features))

1961 are kept out of 770508


# Sample of selected features 

In [16]:
print('{} sample of selected features:'.format(sample_size))
print(random.sample(list(my_features), sample_size))

100 sample of selected features:
['نبينا', '×', 'فضلت', '😳', 'تبارك', 'تعال', 'ﻣﻦ', 'والسرور', 'من', 'والشوق', 'احب', 'عمرنا', 'الذنوب', 'عندنا', 'أسعد', 'الحلوه', 'يد', 'المهم', 'الفجر', 'نفرح', 'اكتب', 'امطار', 'اليوم', 'كانوا', 'وكيف', 'يرد', 'خبر', 'القصبي', 'ماوحشتك', 'القدر', 'وفوق', '؟!', 'فيك', 'إشراقة', 'والي', 'رحمتك', 'الجمعه', 'رجال', '😕', 'اجازه', 'تركت', '🎀', 'السحب', 'لين', 'معاه', 'يقولون', 'تصير', 'التغريده', '#الفيفا_يرد_احتجاج_الهلال_صحيح', 'يارب', 'برحيل', '#زلزل_الملعب_نصرنا_بيلعب', 'به', 'يتحدثون', 'وعن', 'الهم', "'", 'صارت', 'قالوا', 'العالم', 'بسبب', 'المعروف', 'الطيب', 'بيني', 'كم', 'القيم', 'لك؟', 'فلا', 'فيه', 'موسم', '#الأهلي_الهلال', 'يابو', 'بلغنا', 'داود', 'التي', 'ترد', 'ال…', 'قدر', 'توقفون', 'مدريد', 'وليس', '☆', 'الحي', 'صبح', 'كثير', 'حسين', 'رجعنا', 'ﻭﻓﻲ', 'كان', 'قبل', 'ღ♬', 'عشي', 'وطبق', 'العسل', 'شوي', 'او', '👌', 'العافيه', 'منه', 'المتاريس']


# generating features for training documents ...

In [None]:
feature_sets = [(document_features(d, my_features), c) for (d, c) in train_data]

# training ...

In [None]:
classifier = nltk.NaiveBayesClassifier.train(feature_sets)
print('training is done')

# Most informative features 

In [None]:
classifier.show_most_informative_features(40)

# generating features for test documents ...

In [None]:
test_features = [(document_features(d, my_features), c) for (d, c) in test_data]

# classify test instances 

In [None]:
ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_features):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)

# Results 

In [None]:
print('accuracy: ', nltk.classify.accuracy(classifier, test_features))
print('pos precision: ', precision(ref_sets['pos'], test_sets['pos']))
print('pos recall:', recall(ref_sets['pos'], test_sets['pos']))
print('neg precision: ', precision(ref_sets['neg'], test_sets['neg']))
print('neg recall:', recall(ref_sets['neg'], test_sets['neg']))
print('positive f-score:', f_measure(ref_sets['pos'], test_sets['pos']))
print('negative f-score:', f_measure(ref_sets['neg'], test_sets['neg']))