In [1]:
!pip install git+https://github.com/dabeaz/sly
from sly import Lexer

import io
import re
import glob
import pandas as pd

# mount google drive to access files
from google.colab import drive
path = "/gdrive/My Drive/Colab Notebooks/nlp"

drive.mount('/gdrive')

Collecting git+https://github.com/dabeaz/sly
  Cloning https://github.com/dabeaz/sly to /tmp/pip-req-build-8vli1rfd
  Running command git clone -q https://github.com/dabeaz/sly /tmp/pip-req-build-8vli1rfd
Building wheels for collected packages: sly
  Building wheel for sly (setup.py) ... [?25l[?25hdone
  Created wheel for sly: filename=sly-0.4-cp36-none-any.whl size=28331 sha256=3e14c7bebb5640dbc8bbb416d177eaac7e3ed1cac145ac34a21cd90a29aa62ae
  Stored in directory: /tmp/pip-ephem-wheel-cache-rtj_a0zf/wheels/fd/a1/a0/07789f27b5fa3cab050b37f60193d3b28f26b2d6497cbf4097
Successfully built sly
Installing collected packages: sly
Successfully installed sly-0.4
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdriv

In [0]:
def convert_ar_characters(input_str):
    """
    Converts Arabic chars to related Persian unicode char
    :param input_str: String contains Arabic chars
    :return: New str with converted arabic chars
    """
    mapping = {
        'ك' : 'ک',
        'دِ' : 'د',
        'بِ' : 'ب',
        'زِ' : 'ز',
        'ذِ' : 'ذ',
        'شِ' : 'ش',
        'سِ' : 'س',
        'ى' : 'ی',
        'ي' : 'ی',
        '“' : '"',
        '”' : '"',
        ')' : '(',
        '(' : ')',
        ')' : '(',
        '٠' : '0', # arabic numbers
        '١' : '1',
        '٢' : '2',
        '٣' : '3',
        '٤' : '4',
        '٥' : '5',
        '٦' : '6',
        '٧' : '7',
        '٨' : '8',
        '٩' : '9',
        '۰' : '0', # persian numbers
        '۱' : '1',
        '۲' : '2',
        '۳' : '3',
        '۴' : '4',
        '۵' : '5',
        '۶' : '6',
        '۷' : '7',
        '۸' : '8',
        '۹' : '9',
        'ـ' : '',
        '–' : '-',
        'ة' : 'ه',
        'ؤ' : 'و',
        'ي' : 'ی',
        'إ' : 'ا',
        'أ' : 'ا',
        'ئ' : 'ی',
        'ۀ' : 'ه',
        'هٔ' : 'ه\u200cی',
        '\u200e' : '\u200c',
        '\u064e' : '',
        '\u0650' : '',
        '\u064b' : '',
        '\u0652' : '',
        '\u064e' : '',
        '\u0650' : '',
        '\u064d' : '',
        '\u064c' : '',
        '\u064b' : '',
        '\u064f' : '',
        '٬' : ',',
    }
    return _multiple_replace(mapping, input_str)


def _multiple_replace(mapping, text):
    """
    Internal function for replace all mapping keys for a input string
    :param mapping: replacing mapping keys
    :param text: user input string
    :return: New string with converted mapping keys to values
    """
    pattern = "|".join(map(re.escape, mapping.keys()))
    return re.sub(pattern, lambda m: mapping[m.group()], str(text))

In [0]:
class Tokenizer(Lexer):
    tokens = { EMOJI, ABBREVIATION, DATE, TIME, IP, SPACIALNAMES, EMAIL, URL, TAG, ENGLISH, PERSIAN, PERCENT, FLOAT, THUSANDSCOMMA, NUMBER, MATHCHAR, LPAREN, RPAREN, LBRAKET, RBRAKET, LOTHER, ROTHER, PUNCTION, INVALID }
    ignore = ' \t\u200f\ufeff\ufe0f'

    # Tokens
    EMOJI = r'[\U0001F1E0-\U0001F1FF\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]([\u200d][\U0001F1E0-\U0001F1FF\U0001F300-\U0001F5FF\U0001F600-\U0001F64F\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251])*'
    ABBREVIATION = r'([A-Z]\.){2,}'
    DATE = r'^([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])(\.|-|/)([1-9]|0[1-9]|1[0-2])(\.|-|/)([0-9][0-9]|[0-9][0-9][0-9]|[0-9][0-9][0-9][0-9])$|^([0-9][0-9]|[0-9][0-9][0-9]|[0-9][0-9][0-9][0-9])(\.|-|/)([1-9]|0[1-9]|1[0-2])(\.|-|/)([1-9]|0[1-9]|1[0-9]|2[0-9]|3[0-1])$'
    TIME = r'^(1[0-2]|0?[1-9]):([0-5]?[0-9])(●?[AP]M)?$|^(2[0-3]|[01]?[0-9]):([0-5]?[0-9])$|^(1[0-2]|0?[1-9]):([0-5]?[0-9]):([0-5]?[0-9])(●?[AP]M)?$|^(2[0-3]|[01]?[0-9]):([0-5]?[0-9]):([0-5]?[0-9])$'
    IP = r'((telnet://)|)[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}(/[0-9]{1,2})?'
    SPACIALNAMES = r'((\.(net)|(NET)|(Net))|([Cc]#|[Cc]\+\+)|(3[Gg])|(4[Gg][+]?))'
    EMAIL = r'[a-zA-Z0-9_.+-]+@[ا-یآa-zA-Z0-9-]+\.(?:com|net|org|edu|gov|ir|co|info|blog|club|llc|us|shop|opr|app|inc|website|uk|de|icu|me|tr|mil|arpa|gl|gr|ly)+'
    URL = r'(((http(s)?://)|(ftp://)|(ldap://)|(mailto:)|(news:)|)[?آا-یa-zA-Z0-9%\.\~\(\)\*\'&\+=!@_\#-]+(?:\.com|\.net|\.org|\.edu|\.gov|\.ir|\.co|\.info|\.blog|\.club|\.llc|\.us|\.shop|\.opr|\.app|\.inc|\.website|\.uk|\.de|\.icu|\.me|\.tr|\.mil|\.arpa|\.gl|\.gr|\.ly)+([/][آا-یa-zA-Z0-9%\.\~\(\)\*\'&\+=!@_\#-?]*)*)'
    TAG = r'[\#@][?آا-یa-zA-Z0-9%\.\~\(\)\*\'&\+=!@_-]+'
    ENGLISH = r'([a-zA-Z]+([-_][a-zA-Z]+)*)[a-zA-Z][™©®]?'
    PERSIAN = r'[ئا-یآء][ئا-ی\u200cّءآ_]*[ئا-یآء]'
    PERCENT = r'[-]?\d*[.\\]?d+%'
    FLOAT = r'[-]?\d*[\.\\]\d+'
    THUSANDSCOMMA = r'(?<!,)\b(\d{1,3}(?:,\d{3})*)\b(?!,)'
    NUMBER = r'[-]?\d+'

    # Special symbols
    MATHCHAR = r'[\+\-\*/=\^%]'
    LPAREN = r'\('
    RPAREN = r'\)'
    LBRAKET = r'\['
    RBRAKET = r'\]'
    LOTHER = r'[«{→]'
    ROTHER = r'[»}←]'
    PUNCTION = r'[,\.\':;.،!:?؛؟"•▪…|@$#~`]'

    INVALID = r'.'


    # Ignored pattern
    ignore_newline = r'\n+'

    # Extra action for newlines
    def ignore_newline(self, t):
        self.lineno += t.value.count('\n')

    def error(self, t):
        print("Illegal character '%s'" % t.value[0])
        self.index += 1

In [13]:
# load all files in directory
files = glob.glob(f'{path}/big/*.txt')
fileCount = len(files)
# create place holder data frame to hold information of all files
data = pd.DataFrame(columns=['Word'])

# proccess each file
for i, filename in enumerate(files):
   with io.open(filename, mode="r", encoding="utf-8") as f:
    text = ''
    for x in f:
      text = text + x

    text = convert_ar_characters(text)
    tokenizer = Tokenizer()      
    dic = {}
    prev = ''
    prevType = ''
    for token in tokenizer.tokenize(text):
      index = -1 
      while True:
          index = token.value.find('\u200c', index + 1)
          if index == -1:
              break
          if token.value[index-1] in {'ا', 'د', 'ذ', 'ر', 'ز'}:
            token.value = token.value[:index] + token.value[index+1:]
      if token.value in {'ها', 'های', 'ها\u200cی', 'تر', 'ترین', 'ی'} and prevType == 'PERSIAN':
        if prev[-1] in {'ا', 'د', 'ذ', 'ر', 'ز'}:
          prev = prev + token.value
        else:
          prev = prev + '\u200c' + token.value
        try:
          dic[prev] = dic[prev] + 1
        except:
          dic[prev] = 1
        prev = ''
        prevType = ''
      elif prev in {'می', 'نمی'} and token.type == 'PERSIAN':
        prev = prev + '\u200c' + token.value
        prevType = 'PERSIAN'
      else:
        if prev != '':
          if prev[-1] == '\u200c':
            prev = prev[:-1]
          try:
            dic[prev] = dic[prev] + 1
          except:
            dic[prev] = 1
        prev = token.value
        prevType = token.type
    if prev != '':
          if prev[-1] == '\u200c':
            prev = prev[:-1]
          try:
            dic[prev] = dic[prev] + 1
          except:
            dic[prev] = 1
    # convert dictionary to data frame for better management      
    df = pd.DataFrame(list(dic.items()))
    # rename columns
    df = df.rename(columns={0:'Word', 1:f'{i+1} TF'})
    # join data frame of current file with data frame of all files
    data = pd.merge(data, df, on = ['Word'], how = 'outer').fillna(0)

data

Unnamed: 0,Word,1 TF
0,Text,1
1,🔸,8354
2,شان,98
3,اسپایسر,3
4,سخنگوی,288
...,...,...
34875,می‌انجامید,1
34876,پایه‌ی,1
34877,مقنّنه,1
34878,1954,1


In [14]:
# convert data to boolean and sum them on rows to get number of docs with that word (minus one is to reduce word itself)
data["DF"] = data.astype(bool).sum(axis=1) - 1
for i in range(1, fileCount+1):
  data[f'{i} TF-IDF'] = data[f'{i} TF'] / data['DF']

data = data[data['DF'] != 0]
data = data.sort_values(by=['Word'])
data

Unnamed: 0,Word,1 TF,DF,1 TF-IDF
31893,,2,1,2.0
350,!,2469,1,2469.0
32901,!!!!twitter.com/AadamEbneHavva/status/87236257...,5,1,5.0
31341,!!!twitter.com/a_a_kh1981/status/8716736383360...,1,1,1.0
11597,!#قیام_طبرستانRajanews.com,1,1,1.0
...,...,...,...,...
6408,🤼‍♂,1,1,1.0
29269,🥀,1,1,1.0
21250,🥇,3,1,3.0
21251,🥈,3,1,3.0


In [0]:
data.to_csv(f'{path}/output.csv', index=False, encoding='utf-8-sig')