In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Corpus subset to process; every derived name below is built from it.
folder = "all"

# Inputs: directory of split news articles and the controlled-vocabulary workbook.
path = f"../Thesis/news/split/{folder}"
vocab_path = f"../lexicons/w2v_lexicon_{folder}.xlsx"
# Column of the workbook that lists the terms (presumably the header pandas
# gives to an unnamed first column -- confirm against the workbook).
vocab_column = "Unnamed: 0"

# File name of the lexicon workbook and of the exported dataset.
lex_name = f"w2v_lexicon_{folder}.xlsx"
export_file = f"w2v_dataset_{folder}.xlsx"

In [3]:
def load_terms(path, colname):
    """Build term <-> index lookups from a controlled-vocabulary workbook.

    Parameters
    ----------
    path : str
        Location of the Excel workbook that holds the vocabulary.
    colname : str
        Name of the workbook column that lists the terms.

    Returns
    -------
    tuple of (dict, dict)
        ``terms2index`` maps each cleaned term to an integer index and
        ``index2terms`` is its exact inverse. Indices are contiguous from 0
        and follow the order in which terms first appear in the column, so
        the mapping is the same on every run.
    """
    controlled_vocab = pd.read_excel(path)

    terms2index = dict()
    index2terms = dict()
    for raw_term in controlled_vocab[colname].dropna():
        # Clean *before* de-duplicating: two cells that differ only by
        # surrounding whitespace must share one index, otherwise the two
        # dictionaries disagree and indices exceed len(terms2index).
        # str() keeps non-text cells (e.g. numbers) from raising on .strip().
        term = str(raw_term).strip()
        # Whitespace-only cells can never match a token; repeats keep the
        # index they were given first.
        if not term or term in terms2index:
            continue
        index = len(terms2index)
        terms2index[term] = index
        index2terms[index] = term

    return terms2index, index2terms

In [4]:
def generate_documents_by_terms(terms, path):
    """Build a document-by-term matrix of relative term frequencies.

    Every regular file directly inside ``path`` is read as UTF-8 text and
    split on whitespace. Tokens that exactly match a key of ``terms`` are
    counted, and each document's counts are divided by its total number of
    matched tokens, so a non-empty row sums to 1.

    Parameters
    ----------
    terms : dict
        Maps a term to its integer index (as returned by ``load_terms``).
    path : str
        Directory holding one text file per document.

    Returns
    -------
    pandas.DataFrame
        Rows are file names, columns are terms ordered by their index value.
        Terms that occur in no document and documents that contain no term
        are dropped.
    """
    # Lay the columns out by index value rather than by dict insertion order,
    # so a count can never end up under another term's label, and use the
    # rank as the array position so gaps in the index values cannot overflow.
    ordered_terms = sorted(terms, key=terms.get)
    position = {term: pos for pos, term in enumerate(ordered_terms)}
    num_terms = len(ordered_terms)

    d_termf = dict()
    # Sorted so the row order does not depend on the file system.
    for filename in sorted(os.listdir(path)):
        filepath = os.path.join(path, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints); open() would fail.
        if not os.path.isfile(filepath):
            continue

        counts = np.zeros(num_terms)
        with open(filepath, "r", encoding="utf-8") as file:
            # split() without arguments already breaks on newlines.
            for token in file.read().split():
                pos = position.get(token)
                if pos is not None:
                    counts[pos] += 1

        # Normalize by the number of matched tokens in this document.
        matched = counts.sum()
        if matched > 0:
            counts = counts / matched
        d_termf[filename] = counts

    # from_dict keeps the column labels even when no file was read.
    data = pd.DataFrame.from_dict(d_termf, orient="index", columns=ordered_terms)
    data = data.loc[:, (data != 0).any(axis=0)]
    data = data.loc[(data != 0).any(axis=1), :]

    return data

In [5]:
def generate_documents_by_emotions(lex_name, df, index2col):
    """Project a document-by-term matrix onto the lexicon's dimensions.

    For each document the lexicon vector of every term with a non-zero
    weight is multiplied by that weight, and the results are summed.

    Parameters
    ----------
    lex_name : str
        File name of the lexicon workbook inside ``../lexicons/``. Its first
        column is used as the index (the terms); the remaining columns are
        the values that get summed.
    df : pandas.DataFrame
        One row per document, one column per term, weights as values.
    index2col : dict
        Maps an integer term index to its term. Only consulted for columns
        of ``df`` whose label is not itself a lexicon term.

    Returns
    -------
    pandas.DataFrame
        Rows are the documents of ``df``. Columns are integer positions that
        follow the lexicon's column order. All-zero columns and all-zero
        rows are dropped.
    """
    lex_df = pd.read_excel(f"../lexicons/{lex_name}", index_col=[0])
    num_emotions = len(lex_df.columns)

    # Resolve each column of df to its lexicon vector once, up front.
    # Lookup goes by column *label*: df may have had its all-zero columns
    # removed, so a column's position no longer equals its term index.
    term_vectors = dict()
    for position, label in enumerate(df.columns):
        term = label
        if term not in lex_df.index:
            # Fallback for frames whose columns are the integer term indices.
            try:
                term = index2col[label]
            except (KeyError, IndexError, TypeError):
                continue  # not a term we can place in the lexicon
            if term not in lex_df.index:
                continue
        try:
            vector = np.asarray(lex_df.loc[term], dtype=float)
        except (TypeError, ValueError):
            continue  # non-numeric lexicon row: skipped, as before
        # A term listed more than once in the lexicon yields a 2-D block;
        # such terms contribute nothing (they were silently skipped before).
        if vector.shape != (num_emotions,):
            continue
        term_vectors[position] = vector

    d_termf = dict()
    for doc, weights in zip(df.index, df.to_numpy()):
        summ = np.zeros(num_emotions)
        for position, vector in term_vectors.items():
            weight = weights[position]
            if weight != 0.0:
                summ += vector * weight
        d_termf[doc] = summ

    data = pd.DataFrame.from_dict(d_termf, orient="index")
    data = data.loc[:, (data != 0).any(axis=0)]
    data = data.loc[(data != 0).any(axis=1), :]
    return data

In [6]:
# Build the term <-> index lookups from the controlled vocabulary.
terms2index, index2terms = load_terms(path=vocab_path, colname=vocab_column)

In [7]:
# Term frequencies per article, normalized by each article's matched-token count.
docs_by_terms = generate_documents_by_terms(terms=terms2index, path=path)

In [8]:
# Preview the first rows of the document-by-term matrix.
docs_by_terms.head(n=5)

Unnamed: 0,حركه,شموليه,ديموقراطيه,مخاتله,اشتراكيه,تصديق,اخلاص,بيروقراطيه,فاسد,لاء,...,كيل,ديكتاتوريه,ختل,دوله,جمعيه,خدعه,نصاب,كذب,زعيم,دستور
‏ مصفحة للتأمين بعد الإعلان عن مسيرة رابعة إلى الحرس الجمهورى اليوم السابع.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
% من القراء يتوقعون استجابة أنصار مرسى لدعوات مصالحة الأزهر اليوم السابع.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
« مصر للطيران » تسمح للمضيفات بارتداء الحجاب.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
«17» محافظة تحسم مصير مشـروع الدستور في مصـر.txt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25
«اخوان مصر» التزوير في الانتخابات يفوق الخيال.txt,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# Persist the document-by-term matrix. The output folder is created first so
# the cell also works on a fresh checkout where ../raw_sets does not exist yet.
raw_sets_dir = "../raw_sets"
os.makedirs(raw_sets_dir, exist_ok=True)
docs_by_terms.to_excel(f"{raw_sets_dir}/w2v_df_{folder}.xlsx")

In [None]:
# Weight each term's lexicon vector by its frequency and sum per article.
docs_by_emotion = generate_documents_by_emotions(
    lex_name=lex_name,
    df=docs_by_terms,
    index2col=index2terms,
)

In [None]:
# TODO: clustering -- 2 sets, 1 for ... (note left unfinished; complete or remove)

In [None]:
# Preview the first rows of the document-by-emotion matrix.
docs_by_emotion.head(n=5)

In [45]:
# Persist the document-by-emotion matrix. The output folder is created first so
# the cell also works on a fresh checkout where ../datasets does not exist yet.
datasets_dir = "../datasets"
os.makedirs(datasets_dir, exist_ok=True)
docs_by_emotion.to_excel(f"{datasets_dir}/1_{export_file}")