In [1]:
from gensim.models import Word2Vec, KeyedVectors
from multiprocessing import cpu_count
from gensim.test.utils import get_tmpfile
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, strip_non_alphanum, \
    strip_multiple_whitespaces, strip_tags, strip_short
from os import listdir
import pandas as pd
import numpy as np

In [2]:
# Number of CPUs reported by the OS; train_model passes it as `workers=` when it builds a new Word2Vec model.
core_count = cpu_count()

In [3]:
# Corpus sub-folder and the path derived from it.
# NOTE(review): `path` is not referenced by any other cell visible in this notebook — confirm it is still needed.
folder = "news"
path = f"Thesis/{folder}"

In [4]:
# Context window size; train_model passes it as `window=` when it builds a new Word2Vec model.
WINDOW_SIZE = 5

In [5]:
def _prepossess_data(u_path, language):
    """Yield one list of tokens for every non-empty line of every file in ``u_path``.

    Args:
        u_path: Directory whose entries are opened as UTF-8 text files.
        language: ``"AR"`` splits each line on whitespace and keeps the tokens
            unchanged. Any other value runs the line through the gensim
            cleaning helpers (whitespace, tags, non-alphanumerics, digits,
            stop words, tokens shorter than 2 characters) and lowercases it.

    Yields:
        list[str]: the tokens of one line; lines that end up with no tokens
        are skipped.
    """
    # listdir() returns entries in arbitrary order; sorting keeps the sentence
    # order (and therefore training) reproducible between runs.
    for f_name in sorted(listdir(u_path)):
        with open(f"{u_path}/{f_name}", "r", encoding="utf-8") as file:
            # Stream the file line by line instead of loading it whole.
            for line in file:
                sentence = line.rstrip("\n")
                if language == "AR":
                    # split() already discards surrounding whitespace and empty tokens.
                    tokens = sentence.split()
                else:
                    stripped = strip_numeric(
                        strip_non_alphanum(strip_tags(strip_multiple_whitespaces(sentence))))
                    # Lowercase BEFORE stop-word removal: the stop-word match is
                    # case-sensitive, so "The" would otherwise survive as "the".
                    cleaned = strip_short(remove_stopwords(stripped.lower()), minsize=2)
                    tokens = cleaned.split()
                if tokens:
                    yield tokens

In [17]:
def train_model(name, u_path, language, epochs=50):
    """Train, or incrementally update, the Word2Vec model stored under ``name``.

    Every sentence produced by ``_prepossess_data(u_path, language)`` is
    rewritten in place before training:

    * a token found in the global ``mapped`` dict is replaced by its mapped
      (original) term;
    * otherwise a token found in the global ``word2em`` dict is replaced by one
      of its emotion labels, cycling through that token's labels round-robin on
      each further occurrence. Tokens whose label list is empty are left as is.

    If ``../models/{name}.model`` can be loaded, its vocabulary is extended and
    training continues on the new sentences; otherwise a new skip-gram model is
    built. The model and its keyed vectors are then saved under ``../models``.

    Args:
        name: Base file name of the model (``{name}.model`` / ``{name}.kv``).
        u_path: Directory with the corpus files.
        language: Language code forwarded to ``_prepossess_data``.
        epochs: Number of training passes (default 50).

    Returns:
        bool: True when the model was trained and saved; False when the corpus
        was empty or training/saving raised (the error is printed, not raised).
    """
    sentences = list(_prepossess_data(u_path, language))

    # token -> index of the emotion label to use at its next occurrence
    next_emotion = dict()
    for sentence in sentences:
        for index, word in enumerate(sentence):
            if word in mapped:
                # Generated word form: restore the original term.
                sentence[index] = mapped[word]
            elif word in word2em:
                emotions_arr = word2em[word]
                if not emotions_arr:
                    continue
                position = next_emotion.get(word, 0)
                sentence[index] = emotions_arr[position]
                next_emotion[word] = (position + 1) % len(emotions_arr)

    if not sentences:
        print(f"model {name} training failed due to the following error: provided files are empty!")
        return False

    model_path = f"../models/{name}.model"
    vectors_path = f"../models/{name}.kv"
    try:
        try:
            model = Word2Vec.load(model_path)
        except FileNotFoundError:
            # No saved model yet: train from scratch.
            # NOTE(review): `iter` and `size` are the gensim 3.x keyword names
            # (gensim 4 calls them `epochs` and `vector_size`) — confirm the
            # installed version before upgrading.
            model = Word2Vec(sentences, iter=epochs, workers=core_count, min_count=1,
                             window=WINDOW_SIZE, sg=1, size=300)
        else:
            # Existing model: add the new vocabulary, then continue training.
            model.build_vocab(sentences, update=True)
            model.train(sentences, total_examples=len(sentences), epochs=epochs)
        model.save(model_path)
        model.wv.save(vectors_path)
    except Exception as err:
        # Best effort: report the problem and signal failure to the caller.
        print(f"model {name} training failed due to the following error: {err}")
        return False

    print(f"finished training model {name}")
    return True

In [7]:
def fetch_results(name, words, nbers, language):
    """Look up the similarity between every word and every term in a saved model.

    Only the keyed vectors (``../models/{name}.kv``) are loaded; the full
    ``.model`` file is not needed for similarity queries.

    Args:
        name: Base file name of the saved vectors.
        words: Iterable of words; each is lowercased and becomes an outer key.
        nbers: Iterable of terms compared against every word. They are
            lowercased unless ``language`` is ``"AR"``.
        language: Language code controlling how terms are normalised.

    Returns:
        dict: ``{lowered_word: {term: similarity}}``. A pair for which the
        vectors raise ``KeyError`` (word or term not in the vocabulary) gets 0.0.

    Raises:
        FileNotFoundError: if the keyed-vectors file does not exist.
    """
    try:
        wv = KeyedVectors.load(f"../models/{name}.kv", mmap='r')
    except FileNotFoundError as err:
        # Keep the original cause attached for easier debugging.
        raise FileNotFoundError(f"model with name {name} does not exist") from err

    results = dict()
    for word in words:
        lowered = word.lower()
        similarities = dict()
        for term in nbers:
            decoded = term if language == "AR" else term.lower()
            try:
                similarities[decoded] = wv.similarity(lowered, decoded)
            except KeyError:
                similarities[decoded] = 0.0
        results[lowered] = similarities

    return results

In [8]:
def extract_emotions(mapper, table):
    """Average, per mapper key, the table columns listed for that key.

    Args:
        mapper: dict mapping a key to an iterable of column names.
        table: DataFrame whose columns are looked up by those names.

    Returns:
        dict: key -> Series (default integer index, one entry per table row)
        holding the mean of the listed columns that exist in ``table``; a
        Series of zeros when none of them exist.
    """
    known_columns = set(table.columns)
    row_count = table.shape[0]
    averaged = dict()
    for key, names in mapper.items():
        # Names absent from the table are ignored (repeats are counted each time).
        present = [column for column in names if column in known_columns]
        total = pd.Series([0.0] * row_count)
        for column in present:
            total = total + table[column].values
        averaged[key] = total / len(present) if present else total
    return averaged

In [9]:
def create_results(terms_frequencies, col, model_name, language="AR"):
    """Build the similarity table for a saved model, without all-zero rows/columns.

    Args:
        terms_frequencies: Iterable of words passed to ``fetch_results`` as
            ``words``; they become the columns of the table.
        col: Iterable of terms passed to ``fetch_results`` as ``nbers``; they
            become the rows of the table.
        model_name: Base file name of the saved model.
        language: Language code forwarded to ``fetch_results`` (default "AR",
            the value that used to be hard-coded).

    Returns:
        pandas.DataFrame: similarities with every all-zero column, then every
        all-zero row, removed.
    """
    similarities = fetch_results(model_name, terms_frequencies, col, language)
    df = pd.DataFrame(data=similarities)
    # Drop columns, then rows, that contain nothing but zeros.
    df = df.loc[:, (df != 0).any(axis=0)]
    df = df.loc[(df != 0).any(axis=1), :]
    return df

In [2]:
# Build two lookup tables from the emotion lexicon folder (one file per emotion):
#   em2words: emotion label (file name up to the first ".") -> list of its words
#   word2em:  word -> list of the emotion labels whose files contain it
em2words = dict()
word2em = dict()
for f_name in listdir("../emotion-lexicon-master/arb"):
    emotion = f_name.split(".")[0]
    # Register the emotion up front so its key exists even for a file without words.
    em2words.setdefault(emotion, [])
    with open(f"../emotion-lexicon-master/arb/{f_name}", encoding="utf-8") as file:
        for word in file.read().split(" "):
            w = word.strip()
            if not w:
                # Repeated or trailing separators yield empty tokens; they are not words.
                continue
            emotions = word2em.setdefault(w, [])
            if emotion not in emotions:
                emotions.append(emotion)
            # Every word is appended, including the first one of each file
            # (initialising the list with [] on first sight used to drop it).
            em2words[emotion].append(w)
            

In [3]:
import json
# Load the pre-computed lookup tables from JSON. Both names are rebound here, so
# any `word2em` built earlier in the session is replaced by the file's contents.
word2em = None
mapped = None
# mapped: consulted by train_model to replace a token with its mapped term.
with open("../transformed/json/mapper.json", "r", encoding="utf-8") as mp:
    mapped = json.load(mp)
    
# word2em: consulted by train_model to replace a token with one of its emotion labels.
with open("../transformed/json/word2em.json", "r", encoding="utf-8") as fp:
    word2em = json.load(fp)

In [4]:
# Replace each emotion's word list with a set, which removes duplicate words.
# Note the loop names are swapped: `v` is the key (emotion) and `k` is the value (its words).
for v, k in em2words.items():
    em2words[v] = set(k)

In [13]:
# Controlled vocabulary spreadsheet (trust / political term columns are read from it below).
# Plain string: the literal has no placeholders, so no f-prefix is needed.
controled_vocab = pd.read_excel("../Thesis/emotions.xlsx")

In [14]:
# Preview the first rows of the controlled-vocabulary sheet.
controled_vocab.head()

Unnamed: 0,Emotion_Terms,Unnamed: 1,Unnamed: 2,Unnamed: 3,Trust_Terms,Unnamed: 5,Unnamed: 6,Unnamed: 7,Trust_Terms_stemmed,Unnamed: 9,...,Unnamed: 13,political_Terms,political_stemmed,Unnamed: 16,political_generated,Unnamed: 18,filtered_political,Unnamed: 20,trsut,distrust
0,,,,,,,,,,,...,لجنه,حكومة,حكومه,,اامحفظه=محافظه=,,اخوان,,استقامه,احتيال
1,إنبساط,InbsAT,high spirits,הנאה,استقامة,AstqAmp,,,استقامه,,...,مملكه,السياسة,سياسه,,اامحفظهك=محافظه=,,اسلام,,احلاف,اضلال
2,الفة,Alfp,Friendship,חברות,إحلاف,IHlAf (?),Honesty,יושרה,احلاف,,...,مستعمره,سياسي,سياسي,,اامحفظهكم=محافظه=,,جمهوريه,,امانه,اغراء
3,امومة,Amwmp,Motherhood,אמהות,اَمانة,>mAnp,Confidence/security,יושר,امانه,,...,ملك,حزب,حزب,,اامحفظهكما=محافظه=,,جيش,,ائتمان,اغواء
4,اهتمام,AhtmAm,consideration,דאגה,ائتمان,A<tmAn,Reliance,הסתמכות,ائتمان,,...,امير,حركة,حركه,,اامحفظهكن=محافظه=,,حاكم,,اتكال,تامر


In [15]:
# add all terms for mapping
# trusted_terms = controled_vocab.Trust_Terms_stemmed.dropna()
# col = set([term.strip() for term in trusted_terms])
# col.update([term.strip() for term in controled_vocab.political_stemmed.dropna()])
# mapped = dict()
# for item in col:
#     mapped[item] = item
# for values in controled_vocab.political_generated.dropna():
#     key, value, _ = values.split("=")
#     mapped[key] = value
# for values in controled_vocab.Trust_Terms_generated.dropna():
#     try:
#         key, value, _ = values.split("=")
#         mapped[key] = value
#     except Exception as s:
#         s

In [16]:
# Union of the stemmed trust terms and the stemmed political terms, whitespace-trimmed.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and removed in 2.0.
trusted_terms = pd.concat([
    controled_vocab.Trust_Terms_stemmed.dropna(),
    controled_vocab.political_stemmed.dropna(),
])
c = {term.strip() for term in trusted_terms}

In [17]:
# Display the combined term set (used as the row terms of the exported similarity tables).
c

{'ائتلافيه',
 'ائتمان',
 'اتحاديه',
 'اتكال',
 'احتيال',
 'احلاف',
 'اخلاص',
 'اخوان',
 'اداره',
 'استقامه',
 'اسلام',
 'اشتراكيه',
 'اضلال',
 'اعتماد',
 'اغراء',
 'اغواء',
 'اقتصاد',
 'امانه',
 'امبراطوريه',
 'ايمان',
 'برلمان',
 'بيروقراطيه',
 'تامر',
 'تدليس',
 'تزوير',
 'تشريعي',
 'تصديق',
 'تضليل',
 'تعريض',
 'تعهد',
 'تلاعب',
 'تنفيذي',
 'ثقه',
 'جاسوس',
 'جمعيه',
 'جمهوريه',
 'جيش',
 'حاكم',
 'حركه',
 'حريه',
 'حزب',
 'حكومه',
 'حنث',
 'ختل',
 'خدع',
 'خدعه',
 'خيانه',
 'داهيه',
 'دستور',
 'دهاء',
 'دوله',
 'ديكتاتوريه',
 'ديموقراطيه',
 'رئيس',
 'راسماليه',
 'رده',
 'زائف',
 'زاره',
 'زعيم',
 'زير',
 'سياسه',
 'سياسي',
 'شريعه',
 'شموليه',
 'شيخ',
 'شيوخ',
 'شيوعيه',
 'صدق',
 'عضو',
 'عمده',
 'عهد',
 'غدر',
 'غر',
 'غرر',
 'غش',
 'فاء',
 'فاسد',
 'فساد',
 'فسخ',
 'فصيله',
 'قائد',
 'قانون',
 'قضائي',
 'قوميه',
 'كذب',
 'كيل',
 'لاء',
 'مؤامره',
 'مؤسسه',
 'مجلس',
 'محافظ',
 'مخابرات',
 'مخاتله',
 'مداهنه',
 'مرشح',
 'مسؤول',
 'مستشار',
 'مسلم',
 'مسلمون',
 'مسلمين',
 'مصداقيه',


In [28]:
# Train (or update) the "Al_Wafd 2000_2013" model from its folder in the split news corpus.
# NOTE(review): the output saved with this cell names a different model ("Addustour 2000_2013");
# it looks stale — re-run to confirm.
folder = "news"
train_model("Al_Wafd 2000_2013", f"../Thesis/{folder}/split/Al_Wafd 2000_2013", "AR")

finished training model Addustour 2000_2013


True

In [17]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "Addustour"
train_model(f"{folder}", f"../Thesis/news/out/{folder}", "AR")

finished training model Addustour


True

In [19]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "Akhbar_el_Yom"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model Akhbar_el_Yom


True

In [21]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "Al Ahali"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model Al Ahali


True

In [23]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "Al_Masry_Alyoum"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model Al_Masry_Alyoum


True

In [25]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "Al_Shorouk"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model Al_Shorouk


True

In [34]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "Al_Wafd"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model Al_Wafd


True

In [36]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "Alfagr"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model Alfagr


True

In [21]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "AlQuds"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model AlQuds


True

In [38]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "AlRay"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model AlRay


True

In [40]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "EG_AlDostour"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model EG_AlDostour


True

In [42]:
# Train (or update) the model named after this folder from ../Thesis/news/out/<folder>, language code "AR".
folder = "Youm 7"
train_model(folder, f"../Thesis/news/out/{folder}", "AR")

finished training model Youm 7


True

In [18]:
# Train (or update) the model for this split, then export its similarity table to Excel:
# rows are the terms in `c`, columns are the emotion labels (keys of em2words),
# with all-zero rows and columns dropped by create_results.
folder = "Muslims_2014_2020_2"
name = f"w2v_lexicon_{folder}.xlsx"
train_model(folder, f"../Thesis/news/split/{folder}", "AR")
create_results(em2words.keys(), c, folder).to_excel(f"../lexicons/{name}")

finished training model Muslims_2014_2020_2


In [19]:
# Train (or update) the model for this split, then export its similarity table to Excel:
# rows are the terms in `c`, columns are the emotion labels (keys of em2words),
# with all-zero rows and columns dropped by create_results.
folder = "Muslims_2013"
name = f"w2v_lexicon_{folder}.xlsx"
train_model(folder, f"../Thesis/news/split/{folder}", "AR")
create_results(em2words.keys(), c, folder).to_excel(f"../lexicons/{name}")

finished training model Muslims_2013


In [20]:
# Train (or update) the model for this split, then export its similarity table to Excel:
# rows are the terms in `c`, columns are the emotion labels (keys of em2words),
# with all-zero rows and columns dropped by create_results.
folder = "Goverment 2013_2020"
name = f"w2v_lexicon_{folder}.xlsx"
train_model(folder, f"../Thesis/news/split/{folder}", "AR")
create_results(em2words.keys(), c, folder).to_excel(f"../lexicons/{name}")

finished training model Goverment 2013_2020


In [22]:
# Train (or update) the model for this split, then export its similarity table to Excel:
# rows are the terms in `c`, columns are the emotion labels (keys of em2words),
# with all-zero rows and columns dropped by create_results.
folder = "Goverment_2013"
name = f"w2v_lexicon_{folder}.xlsx"
train_model(folder, f"../Thesis/news/split/{folder}", "AR")
create_results(em2words.keys(), c, folder).to_excel(f"../lexicons/{name}")

finished training model Goverment_2013


In [18]:
# Train (or update) the model named "all", then export its similarity table to Excel.
# NOTE(review): the model is called "all" but the corpus path is hard-coded to the
# Muslims_2013 split only — confirm this is intended.
folder = "all"
name = f"w2v_lexicon_{folder}.xlsx"
train_model(folder, f"../Thesis/news/split/Muslims_2013", "AR")
create_results(em2words.keys(), c, folder).to_excel(f"../lexicons/{name}")

finished training model all


# Train (or update) the "Muslims_2013" model from its folder in the split news corpus.
# NOTE(review): `name` is assigned here but not used in this cell.
folder = "news"
name = f"w2v_lexicon_Muslims_2013.xlsx"
train_model("Muslims_2013", f"../Thesis/{folder}/split/Muslims_2013", "AR")


In [21]:
# Export the similarity table of the saved "Muslims_2013" model to Excel.
create_results(em2words.keys(), c, "Muslims_2013").to_excel(f"../lexicons/Muslims_2013.xlsx")

In [19]:
# Scale every row (one term's similarities) of the table to unit Euclidean length, then export it.
df = create_results(em2words.keys(), c, "Goverment 2013_2020")
for index in df.index:
    x = df.loc[index]
    # create_results already removed all-zero rows, so the norm is non-zero here.
    df.loc[index] = x/np.linalg.norm(x)
df.to_excel(f"../lexicons/unit/Goverment 2013_2020.xlsx")

In [18]:
# Train (or update) the model named "aaa", then export its similarity table to Excel.
# NOTE(review): the corpus path is an absolute, machine-specific location; this cell will
# not run elsewhere — consider a path relative to the project directory.
folder = "aaa"
name = f"w2v_lexicon_{folder}.xlsx"
train_model(folder, f"C:/Users/Amjad Nassar/Desktop/aaa", "AR")
create_results(em2words.keys(), c, folder).to_excel(f"../lexicons/{name}")

finished training model aaa
