### Word Frequency and N-Grams

#### Import modules

In [1]:
import sys, os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from util import Util

  "\-"
  "\*"
  "\_"


In [2]:
# Directory the cleaned input CSV is read from (relative to this notebook).
cleaned_dir = "../data/cleaned"
# Directory the word-frequency and n-gram result CSVs are written to.
final_dir = "../data/final"
# Base name (no extension) shared by the input CSV and both output files.
file_name = "አዲስ ነገር መረጃ"
# Helper object from the project's `util` module (found via the ../scripts path
# added above). NOTE(review): no cell visible here uses `util` — confirm it is needed.
util = Util()

In [3]:
# Load the cleaned messages; the CSV's 'id' column becomes the DataFrame index.
df = pd.read_csv(f"{cleaned_dir}/{file_name}.csv", index_col='id')
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0_level_0,text,date,hashtags,emojis,symbols,links,mentions
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11233,ህወሀት እንደ ቺቺኒያ ያሉ ለመኖርያ ተብለው በማስተር ፕላኑ የተቀመጡ ሰፈ...,2021-01-04T11:27:32,[],,_,[],['Addis']
11258,የኤርትራ ውጭ ጉ/ሚንስትር እና የፕሬዝዳንት ኢሳይያስ አማካሪ የተካተቱበት...,2021-01-05T23:27:06,[],,_,[],['Addis']
11259,ግብፃውያኑ በአባይ (በህዳሴ) ግድብ ጉዳይ ከመጨነቃቸው የተነሳ በቴሌቪዥን...,2021-01-06T08:23:56,[],📍✒♦♦📌,"""""""""""_",[],['Addis']
11261,በአዲስ አበባ ፖፖላሬ አካባቢ በደረሰ የእሳት አደጋ ሁለት ተሽከርካሪዎች ...,2021-01-06T09:22:51,['#ዳጉ_ጆርናል'],,_,[],['Addis']
11262,የገና ሥጦታ ሎተሪ እጣ ውጤት የገና ሥጦታ ሎተሪ ትላንት ማክሰኞ ታህሳስ...,2021-01-06T09:23:52,[],,_,[],['Addis']


#### Word Frequency

In [4]:
# function to split every text into words and flatten them into a single list
from functools import reduce
import operator


def term_freq(x):
    """Return every whitespace-separated token of every text as one flat list.

    Args:
        x: A pandas Series (anything exposing ``tolist()``) or any other
            iterable of texts. Each value is passed through ``str()`` before
            splitting, so non-string values are tokenized by their string
            form (a missing value stored as NaN contributes the token 'nan').

    Returns:
        list[str]: All tokens in input order, duplicates kept. An empty input
        yields an empty list.
    """
    texts = x.tolist() if hasattr(x, "tolist") else x

    # Flatten in a single pass. Folding with reduce(operator.add, ...) copied
    # the growing list on every step and raised TypeError on empty input.
    return [word for text in texts for word in str(text).split()]

In [5]:
# function to count the term/word and create a dataframe for frequencies
from collections import Counter

def counting(x, y):
  """Build a frequency table for the items of `x`.

  Args:
    x: Iterable of hashable items (e.g. the word list from term_freq).
    y: Accepted for compatibility with existing callers; not used.

  Returns:
    pandas.DataFrame with columns 'Word' and 'Count', one row per distinct
    item, ordered from most to least frequent.
  """
  ranked_pairs = Counter(x).most_common()
  return pd.DataFrame(ranked_pairs, columns=['Word', 'Count'])


In [6]:
# Count every whitespace-separated token of the 'text' column. The second
# argument (2) is required by counting()'s signature but is not used by it.
df_frequency = counting(term_freq(df['text']), 2)
# Show the most frequent words (the table is ordered by descending count).
df_frequency.head()

Unnamed: 0,Word,Count
0,News,20603
1,እና,19791
2,ላይ,16174
3,ወደ,7934
4,ነው,6095


#### Save word frequency

In [7]:
# Write the full frequency table; with to_csv's defaults the integer row index
# is written as the first, unnamed column.
df_frequency.to_csv(f"{final_dir}/{file_name}_frequency.csv")

#### n-grams

In [8]:
import re


def list_and_tokenize(data):
    """Split the string form of `data` on whitespace and return the tokens.

    Any value is accepted because it is converted with ``str()`` first; an
    empty or whitespace-only string gives an empty list.
    """
    as_text = str(data)
    tokens = as_text.split()
    return tokens

In [9]:
# function to prepare n-grams
import collections

def count_n_grams(lines, min_length=2, max_length=4):
    """Count all n-grams of size min_length..max_length in a stream of texts.

    Every element of `lines` is tokenized with `list_and_tokenize` and the
    tokens are pushed through one sliding window. The window is not reset
    between elements, so n-grams can span the boundary between two
    consecutive lines.

    Args:
        lines: Iterable of texts (e.g. a pandas Series of messages).
        min_length: Smallest n-gram size to count.
        max_length: Largest n-gram size to count; also the window size.

    Returns:
        dict mapping each size n in [min_length, max_length] to a
        collections.Counter keyed by n-length tuples of tokens.
    """
    lengths = range(min_length, max_length + 1)
    n_grams = {length: collections.Counter() for length in lengths}
    queue = collections.deque(maxlen=max_length)

    # Helper function to add n-grams at start of current queue to dict
    def add_queue():
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                n_grams[length][current[:length]] += 1

    # Loop through all lines and words and add n-grams to dict
    for line in lines:
        for word in list_and_tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # With fewer than max_length tokens in total the window never filled, so
    # the n-grams starting at the very first token have not been counted yet.
    if 0 < len(queue) < max_length:
        add_queue()

    # Make sure we get the n-grams at the tail end of the queue
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return n_grams

In [10]:
# Result tables for the most frequent 2-, 3- and 4-grams. They start empty and
# are rebound by print_most_freq_ng() on every call.
bigram_to_df = pd.DataFrame({'2-grams': [], '2-grams freq': []})
trigram_to_df = pd.DataFrame({'3-grams': [], '3-grams freq': []})
quadgram_to_df = pd.DataFrame({'4-grams': [], '4-grams freq': []})

# Column buffers behind the tables above: n-gram tuples and their counts.
bigram = {'2-grams': [], '2-grams freq': []}
trigram = {'3-grams': [], '3-grams freq': []}
quadgram = {'4-grams': [], '4-grams freq': []}


def print_most_freq_ng(n_grams, num=30):
    """Store the `num` most frequent 2-, 3- and 4-grams in the module-level tables.

    Despite its name this function prints nothing. It refills the `bigram`,
    `trigram` and `quadgram` buffers and rebinds `bigram_to_df`,
    `trigram_to_df` and `quadgram_to_df` to DataFrames built from them.

    Args:
        n_grams: Mapping from n-gram size to a collections.Counter of n-gram
            tuples (the structure returned by count_n_grams). Sizes other
            than 2, 3 and 4 are ignored.
        num: Maximum number of n-grams kept per size, most frequent first.

    Returns:
        None. Results are exposed through the module-level names above.
    """
    global bigram_to_df, trigram_to_df, quadgram_to_df

    buffers = {2: bigram, 3: trigram, 4: quadgram}

    # Start from empty columns on every call so that re-running the calling
    # cell replaces the previous results instead of appending duplicates.
    for size, buffer in buffers.items():
        buffer[f'{size}-grams'] = []
        buffer[f'{size}-grams freq'] = []

    for size in sorted(n_grams):
        buffer = buffers.get(size)
        if buffer is None:
            # No table exists for this size; do not file it under 4-grams.
            continue
        for gram, count in n_grams[size].most_common(num):
            buffer[f'{size}-grams'].append(gram)
            buffer[f'{size}-grams freq'].append(count)

    bigram_to_df = pd.DataFrame({'2-grams': bigram['2-grams'], '2-grams freq': bigram['2-grams freq']})
    trigram_to_df = pd.DataFrame({'3-grams': trigram['3-grams'], '3-grams freq': trigram['3-grams freq']})
    quadgram_to_df = pd.DataFrame({'4-grams': quadgram['4-grams'], '4-grams freq': quadgram['4-grams freq']})

In [11]:
# Fill bigram_to_df / trigram_to_df / quadgram_to_df; print_most_freq_ng
# rebinds these module-level names and returns nothing.
print_most_freq_ng(count_n_grams(df['text']))
# Put the three rankings side by side: rows are aligned by rank position, so
# the n-grams in one row are not related to each other.
n_gram_df = pd.concat([bigram_to_df, trigram_to_df, quadgram_to_df], axis=1)

n_gram_df

Unnamed: 0,2-grams,2-grams freq,3-grams,3-grams freq,4-grams,4-grams freq
0,"(News, News)",9689,"(ቀን, 2015, ኣም)",373,"(አአ, ላላችሁትም, በእራሳችን, transport)",279
1,"(ምክር, ቤት)",1231,"(የአዲስ, አበባ, ከተማ)",339,"(ላላችሁትም, በእራሳችን, transport, ያሉበት)",279
2,"(በቁጥጥር, ስር)",1052,"(ጠቅላይ, ሚኒስትር, አቢይ)",324,"(በእራሳችን, transport, ያሉበት, ድረስ)",279
3,"(የአዲስ, አበባ)",1009,"(ቀን, 2013, ኣም)",312,"(transport, ያሉበት, ድረስ, በ)",279
4,"(news, news)",950,"(የአዲስ, አበባ, ፖሊስ)",291,"(ያሉበት, ድረስ, በ, 30)",279
5,"(በአዲስ, አበባ)",933,"(ወደ, ክልል, ከተሞች)",282,"(ድረስ, በ, 30, ደቂቃ)",279
6,"(ፍርድ, ቤት)",860,"(በ, 30, ደቂቃ)",281,"(በ, 30, ደቂቃ, እናደርሳለን)",279
7,"(አዲስ, አበባ)",746,"(አአ, ላላችሁትም, በእራሳችን)",279,"(30, ደቂቃ, እናደርሳለን, ወደ)",279
8,"(ጠቅላይ, ሚኒስትር)",716,"(ላላችሁትም, በእራሳችን, transport)",279,"(ደቂቃ, እናደርሳለን, ወደ, ክልል)",279
9,"(አበባ, ከተማ)",706,"(በእራሳችን, transport, ያሉበት)",279,"(እናደርሳለን, ወደ, ክልል, ከተሞች)",279


#### Save n-grams

In [12]:
# Write the combined n-gram table; with to_csv's defaults the row index is
# written as the first, unnamed column.
n_gram_df.to_csv(f"{final_dir}/{file_name}_n_gram.csv")