Imports and Loading google drive

In [39]:
import io
import re
import glob
import pandas as pd

# mount google drive to access files
from google.colab import drive
path = "/gdrive/My Drive/Colab Notebooks/nlp"

drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


Functions to split text on spacing and post proccess text to replace incorrect space with half space

In [0]:
def convert_ar_characters(input_str):
    """
    Converts Arabic chars to related Persian unicode char
    :param input_str: String contains Arabic chars
    :return: New str with converted arabic chars
    """
    mapping = {
        'ك' : 'ک',
        'دِ' : 'د',
        'بِ' : 'ب',
        'زِ' : 'ز',
        'ذِ' : 'ذ',
        'شِ' : 'ش',
        'سِ' : 'س',
        'ى' : 'ی',
        'ي' : 'ی',
        '“' : '"',
        '”' : '"',
        ')' : '(',
        '(' : ')',
        ')' : '(',
        '٠' : '0', # arabic numbers
        '١' : '1',
        '٢' : '2',
        '٣' : '3',
        '٤' : '4',
        '٥' : '5',
        '٦' : '6',
        '٧' : '7',
        '٨' : '8',
        '٩' : '9',
        '۰' : '0', # persian numbers
        '۱' : '1',
        '۲' : '2',
        '۳' : '3',
        '۴' : '4',
        '۵' : '5',
        '۶' : '6',
        '۷' : '7',
        '۸' : '8',
        '۹' : '9',
        'ـ' : ''
    }
    return _multiple_replace(mapping, input_str)


def _multiple_replace(mapping, text):
    """
    Internal function for replace all mapping keys for a input string
    :param mapping: replacing mapping keys
    :param text: user input string
    :return: New string with converted mapping keys to values
    """
    pattern = "|".join(map(re.escape, mapping.keys()))
    return re.sub(pattern, lambda m: mapping[m.group()], str(text))

In [0]:
def split(text):
  # punctuation pattern
  pattern = re.compile(u'([.,;\':="«»→←?!.،:؟؛\(\)\[\]])', re.UNICODE)
  # add spacing around punctuation
  text = re.sub(pattern , ' \1 'r' \1 ', text)
  # remove extra spacing
  text = text.replace('\n', ' ')
  text = text.replace('\t', ' ')
  text = text.replace('\u200f', ' ')
  text = text.split(' ')

  return(text)

def postProccess(text):
  i=0
  # replace incorrect space with half space
  while(True):
      if text[i] == 'می':
          text = text[:i] + [(u'می‌' + str(text[i+1]))] + text[i+2:]
      if text[i] == 'نمی':
          text = text[:i] + [(u'نمی‌' + str(text[i+1]))] + text[i+2:]
      if text[i] == 'ها':
          text = text[:i-1] + [(str(text[i-1]) + u'‌ها')] + text[i+1:]
          i-=1
      if text[i] == 'های':
          text = text[:i-1] + [(str(text[i-1]) + u'‌های')] + text[i+1:]
          i-=1
      elif text[i] == '':
          text = text[:i] + text[i+1:]
          i-=1
      if text[i][-1] == '' or text[i][-1] == '\u200c':
        text[i] = text[i][:-1]
        i-=1
      i+=1
      if(i==len(text)):
          break
          
  return text

Load files and calculate term frequency

In [59]:
# load all files in directory
files = glob.glob(f'{path}/texts/*.txt')
fileCount = len(files)
# create place holder data frame to hold information of all files
data = pd.DataFrame(columns=['Word'])

# proccess each file
for i, filename in enumerate(files):
   with io.open(filename, mode="r", encoding="utf-8") as f:
    text = ''
    for x in f:
      text = text + x

    text = convert_ar_characters(text)
    text = split(text)
    text = postProccess(text)
    
    dic = {}
    for x in text:
      try:
          dic[x] = dic[x] + 1
      except:
          dic[x] = 1

    # convert dictionary to data frame for better management      
    df = pd.DataFrame(list(dic.items()))
    # rename columns
    df = df.rename(columns={0:'Word', 1:f'{i+1} TF'})
    # join data frame of current file with data frame of all files
    data = pd.merge(data, df, on = ['Word'], how = 'outer').fillna(0)

data

Unnamed: 0,Word,1 TF,2 TF,3 TF,4 TF,5 TF,6 TF
0,آدامس,44.0,0.0,0.0,0.0,0.0,0.0
1,از,44.0,147.0,66.0,46.0,16.0,15.0
2,ویکی‌پدیا,1.0,0.0,0.0,0.0,0.0,0.0
3,,218.0,574.0,204.0,186.0,201.0,64.0
4,،,66.0,141.0,82.0,54.0,20.0,10.0
...,...,...,...,...,...,...,...
2783,وعده,0.0,0.0,0.0,0.0,0.0,1.0
2784,هیچ,0.0,0.0,0.0,0.0,0.0,1.0
2785,تلاشی,0.0,0.0,0.0,0.0,0.0,1.0
2786,فروگذار,0.0,0.0,0.0,0.0,0.0,1.0


Calculate TF-IDF using following formula:

TF-IDF = TF*IDF = TF/DF = (Term Frequency)/(Document Frequency)

In [0]:
# convert data to boolean and sum them on rows to get number of docs with that word (minus one is to reduce word itself)
data["DF"] = data.astype(bool).sum(axis=1) - 1
for i in range(1, fileCount+1):
  data[f'{i} TF-IDF'] = data[f'{i} TF'] / data['DF']
  
data = data.sort_values(by=['Word'])

Export to CSV for future usage and better display

In [0]:
data.to_csv(f'{path}/output.csv', index=False, encoding='utf-8-sig')

In [38]:
data.iloc[[2819]]

IndexError: ignored

In [25]:
data.iloc[[2819]].values[0]

array(['پرس', 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 1, 0.0, 2.0, 0.0, 0.0, 0.0,
       0.0], dtype=object)