In [107]:
import nltk
from nltk.stem import isri
from nltk.corpus import stopwords

import gensim
import numpy as np
import pandas as pd
import pyLDAvis
import spacy

### Data Loading

In [242]:
data = pd.read_csv('Data/posts.csv', encoding='utf-8',error_bad_lines=False);
docs=data['body'].drop_duplicates().values.tolist()
data_text = data[['body']]
data_text['id'] = data_text.index
documents = data_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


## Data Preprocessing

In [266]:
STOPWORDS = set(stopwords.words('english')+stopwords.words('arabic'))

tobespace_re = r'[\/\\(){}\[\]\|\"\'،…—.;,-=+$%^&*:÷×؛~٪؟?!_#@]'
arabic_diacritics = re.compile("""
                             ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

def normalize(text):
    text = re.sub("[إأآ]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("[ؤئ]", "ء", text)
    text = re.sub("هة", "ه", text)
    text = re.sub("[ة]", "ه", text)
    text = re.sub("گ", "ك", text)
    text = re.sub(arabic_diacritics, '', text)
    return text

def clean(text):
    text = re.sub(r'http\S+', '', text)    #لإزالة الروابط
    text = re.sub(r'\S+@\S+\s?', '', text)  #لإزالة البريد الإلكتروني  
    text = re.sub(r'@\S+', '', text)       #لإزالة المنشنات
    text = re.sub(r'pic.\S+', '', text)    #لإزالة الصور
#     stemmer = isri.ISRIStemmer()
#     text = " ".join(stemmer.stem(w) for w in text.split() if ( (w not in STOPWORDS) and (len(w)>3) ))
    text = " ".join(w for w in text.split() if w not in STOPWORDS)
    text = normalize(text)
    text = re.sub(r'(.)\1+', r'\1\1', text)         # إزالة الأحرف المكررة أكثر من مرتين
    text = re.sub(r'(.)(.)(\1\2)+', r'\1\2', text)  #إزالة ثنائية الأحرف المكررة
    text = re.sub(tobespace_re, ' ', text)          # الإستبدال بفراغ
    text = re.sub(r'\s+', ' ', text)                #إزالة الفراغات المكررة 
    text = text.lower()
    clean = [w for w in text.split()]
    return clean

In [267]:
processed_docs = documents['body'].map(clean)
processed_docs[:10]

0    [ابن, يالسبيعيابنتي, بدا, كشف, غرز, جنس, خااءف...
1    [لمذ, لماذ, لمذ, احس, الظ, اخر, قلب, اسا, الظ,...
2    [خمس, عشر, عنا, نقذ, فوت, اون, بسم, الل, رحم, ...
3    [ف, انصات, كيف, علم, انص, رات, ناس, تحس, عمل, ...
4    [بكء, دكتور, سبيعيالسلام, علي, رحم, الل, وبر, ...
5    [طءر, سجد, هطر, سجد, منقول, عجب, كرم, قصه, نقل...
6    [علم, خفف, غرب, نوع, مامعلوم, خفف, غرب, نوع, ع...
7    [فته, خرج, انف, مسك, تغسيلهافتاه, خرج, انف, مس...
8    [موسيقي, اسمع, وتمتعليش, سمع, نست, انه, حرم, ي...
9    [علاج, حسد, لايفوت, م, جدا, بسم, الل, رحم, رحم...
Name: body, dtype: object

#### Bag of Words on the Data set

In [268]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [269]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 ابن
1 اثر
2 اخه
3 انه
4 بان
5 بدا
6 بدت
7 ترك
8 تعد
9 جنس
10 خااءفه


Filter out tokens that appear in

>less than 15 documents (absolute number) or  
>more than 0.5 documents (fraction of total corpus size, not absolute number).   
>after the above two steps, keep only the first 100000 most frequent tokens.    

In [270]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

Gensim doc2bow  
For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [271]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(1, 1),
 (2, 4),
 (3, 2),
 (5, 1),
 (16, 9),
 (21, 1),
 (23, 1),
 (26, 1),
 (29, 4),
 (30, 1),
 (32, 1),
 (37, 24),
 (46, 2),
 (54, 1),
 (55, 1),
 (71, 4),
 (85, 1),
 (96, 1),
 (121, 1),
 (123, 1),
 (142, 5),
 (143, 2),
 (147, 1),
 (158, 2),
 (164, 2),
 (168, 1),
 (174, 1),
 (179, 2),
 (180, 4),
 (185, 1),
 (190, 2),
 (198, 5),
 (211, 4),
 (218, 3),
 (222, 1),
 (225, 1),
 (226, 3),
 (230, 1),
 (237, 1),
 (244, 1),
 (248, 1),
 (249, 1),
 (250, 10),
 (256, 1),
 (258, 1),
 (260, 1),
 (264, 1),
 (276, 2),
 (279, 1),
 (282, 4),
 (291, 1),
 (294, 5),
 (335, 1),
 (337, 1),
 (349, 1),
 (352, 2),
 (353, 1),
 (361, 1),
 (376, 1),
 (378, 1),
 (381, 2),
 (402, 18),
 (411, 1),
 (418, 2),
 (421, 2),
 (429, 1),
 (437, 4),
 (440, 1),
 (442, 3),
 (451, 1),
 (453, 2),
 (455, 1),
 (457, 2),
 (463, 15),
 (468, 1),
 (479, 1),
 (483, 1),
 (506, 1),
 (528, 1),
 (530, 2),
 (534, 1),
 (553, 1),
 (558, 1),
 (559, 1),
 (562, 3),
 (564, 1),
 (596, 3),
 (600, 1),
 (602, 1),
 (630, 2),
 (640, 1),
 (664, 2),
 (665,

Preview Bag Of Words for our sample preprocessed document.

In [272]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 1 ("اثر") appears 1 time.
Word 2 ("اخه") appears 4 time.
Word 3 ("انه") appears 2 time.
Word 5 ("بدا") appears 1 time.
Word 16 ("عمل") appears 9 time.
Word 21 ("كشف") appears 1 time.
Word 23 ("لدي") appears 1 time.
Word 26 ("مثل") appears 1 time.
Word 29 ("وحد") appears 4 time.
Word 30 ("ولك") appears 1 time.
Word 32 ("احس") appears 1 time.
Word 37 ("امر") appears 24 time.
Word 46 ("حول") appears 2 time.
Word 54 ("رجل") appears 1 time.
Word 55 ("رحم") appears 1 time.
Word 71 ("كانت") appears 4 time.
Word 85 ("وجد") appears 1 time.
Word 96 ("ارض") appears 1 time.
Word 121 ("بغر") appears 1 time.
Word 123 ("بلغ") appears 1 time.
Word 142 ("جمع") appears 5 time.
Word 143 ("جنب") appears 2 time.
Word 147 ("حرب") appears 1 time.
Word 158 ("خرج") appears 2 time.
Word 164 ("خلل") appears 2 time.
Word 168 ("دخل") appears 1 time.
Word 174 ("ذكر") appears 1 time.
Word 179 ("رسل") appears 2 time.
Word 180 ("رفع") appears 4 time.
Word 185 ("سبق") appears 1 time.
Word 190 ("سلم") appears 2 tim

Word 5502 ("زرقاء") appears 1 time.
Word 5579 ("وبغ") appears 1 time.
Word 5627 ("تسق") appears 1 time.
Word 5743 ("جيت") appears 1 time.
Word 5853 ("وطر") appears 1 time.
Word 6002 ("هال") appears 1 time.
Word 6422 ("حمز") appears 1 time.
Word 6759 ("ارو") appears 1 time.
Word 6904 ("فزز") appears 1 time.
Word 7137 ("شلك") appears 2 time.
Word 7227 ("طقم") appears 2 time.
Word 7236 ("وغط") appears 1 time.
Word 7255 ("يستخدمون") appears 1 time.
Word 7265 ("نحز") appears 3 time.
Word 7379 ("منطقه") appears 2 time.


TF-IDF

Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [273]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.11926844028978215),
 (1, 0.09951728522556409),
 (2, 0.1279280216482939),
 (3, 0.07853007524450244),
 (4, 0.20330284308410904),
 (5, 0.0935026632782139),
 (6, 0.1794926550969176),
 (7, 0.13671062809845977),
 (8, 0.12466424474741321),
 (9, 0.1391418226264802),
 (10, 0.12139416359477628),
 (11, 0.10967040093886413),
 (12, 0.11268044184862377),
 (13, 0.08291217117391671),
 (14, 0.11719880674787139),
 (15, 0.07525290064735914),
 (16, 0.05271547087421837),
 (17, 0.18407775509636765),
 (18, 0.6575550913008688),
 (19, 0.09676246994489064),
 (20, 0.12218912497976632),
 (21, 0.22543690855202006),
 (22, 0.11882914388214565),
 (23, 0.13287568274541597),
 (24, 0.17761765617407252),
 (25, 0.15583860788437154),
 (26, 0.09876375046061575),
 (27, 0.18181706588166663),
 (28, 0.22276949897694923),
 (29, 0.06188522727343813),
 (30, 0.09530642448233898),
 (31, 0.12084156013786333)]


Running LDA using Bag of Words

Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [274]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

For each topic, we will explore the words occuring in that topic and its relative weight.

In [275]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.019*"سلم" + 0.018*"زوج" + 0.014*"رسل" + 0.007*"كلم" + 0.006*"رحم" + 0.006*"حمد" + 0.005*"علم" + 0.005*"علي" + 0.005*"فقل" + 0.005*"حدث"
Topic: 1 
Words: 0.017*"علم" + 0.010*"شكل" + 0.007*"كثر" + 0.007*"فكر" + 0.007*"وضع" + 0.006*"كلم" + 0.006*"عرف" + 0.005*"جمع" + 0.005*"دكتور" + 0.005*"كتب"
Topic: 2 
Words: 0.014*"سلم" + 0.011*"علم" + 0.010*"قلب" + 0.006*"ذكر" + 0.006*"عمل" + 0.006*"جمع" + 0.006*"امر" + 0.005*"عبد" + 0.005*"انس" + 0.004*"رحم"
Topic: 3 
Words: 0.008*"علم" + 0.008*"زوج" + 0.007*"عرف" + 0.007*"سلم" + 0.007*"كلم" + 0.006*"عمل" + 0.006*"امر" + 0.006*"جمع" + 0.006*"دخل" + 0.005*"ولد"
Topic: 4 
Words: 0.010*"شعر" + 0.008*"عمل" + 0.007*"فكر" + 0.006*"كثر" + 0.006*"علم" + 0.006*"حدث" + 0.006*"فعل" + 0.005*"كلم" + 0.005*"شخص" + 0.005*"عرف"
Topic: 5 
Words: 0.012*"سلم" + 0.012*"رحم" + 0.011*"جمع" + 0.009*"راه" + 0.008*"كتب" + 0.007*"علم" + 0.007*"رجل" + 0.007*"درس" + 0.006*"سعد" + 0.006*"علي"
Topic: 6 
Words: 0.010*"شعر" + 0.008*"نظر" + 0.007*"قلب" + 0.005*"حو

### Running LDA using TF-IDF

In [276]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.003*"رحم" + 0.003*"سلم" + 0.002*"قلب" + 0.002*"رسل" + 0.002*"علم" + 0.002*"جمع" + 0.002*"عرف" + 0.002*"صور" + 0.002*"حضر" + 0.002*"علي"
Topic: 1 Word: 0.004*"رحم" + 0.004*"ندي" + 0.003*"وبر" + 0.003*"دكتور" + 0.003*"سلم" + 0.003*"علي" + 0.003*"مني" + 0.003*"شكر" + 0.003*"عضء" + 0.003*"رجو"
Topic: 2 Word: 0.003*"رحم" + 0.003*"سلم" + 0.003*"زوج" + 0.002*"لحب" + 0.002*"دكتور" + 0.002*"رسل" + 0.002*"عرف" + 0.002*"جمل" + 0.002*"شكر" + 0.002*"وبر"
Topic: 3 Word: 0.003*"زوج" + 0.003*"شكل" + 0.003*"علم" + 0.002*"سلم" + 0.002*"عمل" + 0.002*"كلم" + 0.002*"شعر" + 0.002*"رحم" + 0.002*"كثر" + 0.002*"عرف"
Topic: 4 Word: 0.005*"رحم" + 0.004*"ندي" + 0.004*"شكل" + 0.004*"زوج" + 0.004*"وبر" + 0.004*"شكر" + 0.003*"علي" + 0.003*"رجو" + 0.003*"دكتور" + 0.003*"سلم"
Topic: 5 Word: 0.003*"سلم" + 0.003*"رحم" + 0.003*"رسل" + 0.002*"زوج" + 0.002*"شكر" + 0.002*"علم" + 0.002*"شكل" + 0.002*"علي" + 0.002*"عرف" + 0.002*"جمع"
Topic: 6 Word: 0.003*"سلم" + 0.003*"رحم" + 0.002*"رسل" + 0.002*"عضه" + 0.002

### Performance evaluation by classifying sample document using LDA Bag of Words model

We will check where our test document would be classified.

In [277]:
processed_docs[4310]

['عمل',
 'كماندوز',
 'امر',
 'حال',
 'عمل',
 'كماندوز',
 'امر',
 'غنس',
 'عمل',
 'كماندوز',
 'امر',
 'ءمن',
 'قندهار',
 'عمل',
 'كماندوز',
 'طار',
 'سفر',
 'جنب',
 'هلمند',
 'عمل',
 'كماندوز',
 'عسكر',
 'اخه',
 'بلش',
 'قلم',
 'عدل',
 'عمل',
 'وقع',
 'يدن',
 'درت',
 'ميد',
 'نتج',
 'سجل',
 'لدي',
 'سجل',
 'امر',
 'ونح',
 'تحد',
 'قيد',
 'عسكر',
 'امر',
 'وزر',
 'دفع',
 'رامسفيلد',
 'ذكر',
 'شعب',
 'امر',
 'علي',
 'كار',
 'قبل',
 'بدا',
 'بطل',
 'جهد',
 'امر',
 'شكل',
 'سرع',
 'عدد',
 'جهد',
 'عرب',
 'غنس',
 'خلل',
 'حرب',
 'علم',
 'شعب',
 'سلم',
 'خصه',
 'شعب',
 'علم',
 'شعب',
 'امر',
 'هول',
 'كاذيب',
 'سوق',
 'نتج',
 'ورج',
 'علم',
 'امر',
 'سد',
 'جهد',
 'قلع',
 'جنك',
 'شمل',
 'غنس',
 'جهد',
 'عرب',
 'شهد',
 'كمل',
 'جهد',
 'شمل',
 'كبل',
 'قرب',
 'جهد',
 'عرب',
 'جهد',
 'قندهار',
 'جهد',
 'عرب',
 'وصل',
 'اخه',
 'نحز',
 'كبل',
 'الي',
 'جهد',
 'خوس',
 'جهد',
 'عرب',
 'وصل',
 'اخه',
 'نحز',
 'كبل',
 'الي',
 'جهد',
 'تور',
 'بور',
 'جهد',
 'عرب',
 'وصل',
 'اخه',
 'نحز',
 'كبل',
 'ال

In [278]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.4781235456466675	 
Topic: 0.017*"علم" + 0.010*"شكل" + 0.007*"كثر" + 0.007*"فكر" + 0.007*"وضع" + 0.006*"كلم" + 0.006*"عرف" + 0.005*"جمع" + 0.005*"دكتور" + 0.005*"كتب"

Score: 0.3324850797653198	 
Topic: 0.014*"سلم" + 0.011*"علم" + 0.010*"قلب" + 0.006*"ذكر" + 0.006*"عمل" + 0.006*"جمع" + 0.006*"امر" + 0.005*"عبد" + 0.005*"انس" + 0.004*"رحم"

Score: 0.18659639358520508	 
Topic: 0.008*"علم" + 0.008*"زوج" + 0.007*"عرف" + 0.007*"سلم" + 0.007*"كلم" + 0.006*"عمل" + 0.006*"امر" + 0.006*"جمع" + 0.006*"دخل" + 0.005*"ولد"


Our test document has the highest probability to be part of the topic that our model assigned, which is the accurate classification.

Performance evaluation by classifying sample document using LDA TF-IDF model.

In [279]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9058488011360168	 
Topic: 0.003*"زوج" + 0.003*"شكل" + 0.003*"علم" + 0.002*"سلم" + 0.002*"عمل" + 0.002*"كلم" + 0.002*"شعر" + 0.002*"رحم" + 0.002*"كثر" + 0.002*"عرف"

Score: 0.09245883673429489	 
Topic: 0.004*"زوج" + 0.003*"علج" + 0.003*"رحم" + 0.003*"شكل" + 0.002*"سلم" + 0.002*"جمع" + 0.002*"دكتور" + 0.002*"شكر" + 0.002*"عمل" + 0.002*"شعر"


Our test document has the highest probability to be part of the topic that our model assigned, which is the accurate classification.

Testing model on unseen document

In [280]:
unseen_document = 'أنا أعاني من الإكتئاب بسبب أنني أتعرض للإهانة لماذا أنا يا رب  '
bow_vector = dictionary.doc2bow(clean(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.8874394297599792	 Topic: 0.013*"شكل" + 0.013*"علج" + 0.011*"رحم" + 0.009*"عمل" + 0.009*"جمع"
Score: 0.012509438209235668	 Topic: 0.010*"شعر" + 0.008*"عمل" + 0.007*"فكر" + 0.006*"كثر" + 0.006*"علم"
Score: 0.01250746101140976	 Topic: 0.008*"علم" + 0.008*"زوج" + 0.007*"عرف" + 0.007*"سلم" + 0.007*"كلم"
Score: 0.012507245875895023	 Topic: 0.013*"شكل" + 0.012*"طفل" + 0.012*"علم" + 0.009*"جمع" + 0.008*"عرف"
Score: 0.012506572529673576	 Topic: 0.021*"رحم" + 0.014*"علي" + 0.011*"سلم" + 0.007*"علم" + 0.007*"صدق"
Score: 0.012506390921771526	 Topic: 0.010*"شعر" + 0.008*"نظر" + 0.007*"قلب" + 0.005*"حول" + 0.005*"علم"
Score: 0.012506315484642982	 Topic: 0.017*"علم" + 0.010*"شكل" + 0.007*"كثر" + 0.007*"فكر" + 0.007*"وضع"
Score: 0.012506249360740185	 Topic: 0.012*"سلم" + 0.012*"رحم" + 0.011*"جمع" + 0.009*"راه" + 0.008*"كتب"
Score: 0.012505576945841312	 Topic: 0.014*"سلم" + 0.011*"علم" + 0.010*"قلب" + 0.006*"ذكر" + 0.006*"عمل"
Score: 0.012505319900810719	 Topic: 0.019*"سلم" + 0.018*"زوج" + 0.0