In [1]:
# %pip install pandas hazm langdetect CPVI tqdm

### Loading DataFrame & Keep Persian Texts

In [None]:
from __future__ import unicode_literals
from langdetect import detect

import pandas as pd
import random as ra

# Load the raw comments CSV; the cells below read its `commentText` and `feeling` columns.
df = pd.read_csv('./comment-feeling-1650617369.csv')

def is_persian(text):
    """Return True when langdetect identifies ``text`` as Persian ('fa').

    Any error raised by ``detect`` is treated as "not Persian" and yields
    False, so one undetectable row does not abort the whole filter.
    """
    try:
        return detect(text) == 'fa'
    except Exception:
        # `except Exception` instead of a bare `except` keeps the best-effort
        # behaviour while letting KeyboardInterrupt / SystemExit propagate,
        # so a long-running cell can still be interrupted.
        return False
    
# Keep only the rows detected as Persian. `.copy()` makes the result an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning on a boolean-mask slice.
df = df[df.commentText.apply(is_persian)].copy()

### Scenario 1: Normalize The Text Of The Comments

In [None]:
from hazm import Normalizer

normalizer = Normalizer()
# Normalise every comment with hazm and trim surrounding whitespace.
df['normalized'] = df.commentText.apply(lambda x: normalizer.normalize(x).strip())

# Keep only comments that contain at least one space. `.copy()` detaches the
# filtered frame so that adding columns to it later does not raise
# SettingWithCopyWarning.
df = df[df.normalized.str.count(' ') > 0].copy()

In [None]:
# Show ten random normalised comments, one per line.
print(*df['normalized'].sample(n=10), sep='\n')

### Scenario 2: Normalize & Delete Stopwords

In [None]:
from hazm import stopwords_list, word_tokenize

stopwords = set(stopwords_list())
# Tokenise the *normalised* text, as this scenario's title ("Normalize &
# Delete Stopwords") and the recap cell further down do; the raw
# `commentText` was used here before.
df['no_stopwords'] = df.normalized.apply(
    lambda x: ' '.join(y for y in word_tokenize(x) if y not in stopwords))

# Keep only comments that still contain at least one space after the removal.
# `.copy()` detaches the filtered frame so later column assignments do not
# raise SettingWithCopyWarning.
df = df[df.no_stopwords.str.count(' ') > 0].copy()

In [None]:
# Show ten random stopword-free comments, one per line.
print(*df['no_stopwords'].sample(n=10), sep='\n')

### Scenario 3: Stemming & lemmatizing

In [None]:
from hazm import Lemmatizer, Stemmer, word_tokenize

# hazm stemmer and lemmatizer instances used by context_exchanger below.
stemmer = Stemmer()
lemmatizer = Lemmatizer()

def context_exchanger(word):
    """Reduce ``word`` to a lemma part when one is available, else to its stem.

    When the lemmatizer output contains '#', the second '#'-separated field
    is returned; 'ن' is prepended when ``word`` starts with 'ن' but that
    field does not. Otherwise the stemmer output is returned.
    """
    lemma = lemmatizer.lemmatize(word)
    stem = stemmer.stem(word)

    if '#' not in lemma:
        return stem

    # NOTE(review): presumably hazm encodes verb lemmas as 'past#present',
    # which would make this the present stem -- confirm against hazm docs.
    second_field = lemma.split('#')[1]
    keep_prefix = word.startswith('ن') and not second_field.startswith('ن')
    return ('ن' if keep_prefix else '') + second_field

# Rebuild every raw comment from the lemma/stem of each of its tokens.
df['lemz_or_stem'] = df['commentText'].apply(
    lambda comment: ' '.join(map(context_exchanger, word_tokenize(comment))))

In [None]:
# Show ten random lemmatised/stemmed comments, one per line.
print(*df['lemz_or_stem'].sample(n=10), sep='\n')

### Scenario 4: Using POSTagger & Chunker

In [None]:
from hazm import POSTagger, word_tokenize
from pprint import pprint

# Load the POS-tagging model from the shared resources folder.
tagger = POSTagger(model='../resources/postagger.model')

# Print one randomly sampled raw comment followed by its tagger output.
for text in df['commentText'].sample(n=1):
    pprint(text)
    pprint(tagger.tag(word_tokenize(text)))

In [None]:
from hazm import POSTagger, Chunker, tree2brackets, word_tokenize

# Load the tagging and chunking models from the shared resources folder.
tagger = POSTagger(model='../resources/postagger.model')
chunker = Chunker(model='../resources/chunker.model')

# Print one randomly sampled raw comment and the bracketed form of its chunk parse.
for text in df['commentText'].sample(n=1):
    pprint(text)
    tagged = tagger.tag(word_tokenize(text))
    chunk_tree = chunker.parse(tagged)
    pprint(tree2brackets(chunk_tree))

### Scenario 5: Changing Shape Of Verb

In [None]:
import random 
from CPVI import CPVI


def flatten_list(irregular_list):
    """Flatten arbitrarily nested ``list`` objects into one flat list.

    Anything that is not exactly a ``list`` (including tuples and list
    subclasses) is treated as a leaf and returned wrapped in a list.
    """
    if type(irregular_list) is not list:
        return [irregular_list]

    flat = []
    for item in irregular_list:
        flat.extend(flatten_list(item))
    return flat

def get_allshapes(verb):
    """Collect groups of conjugated forms of ``verb`` from its CPVI profile.

    Walks the Persian paradigm of the profile for both registers (informal,
    formal), both polarities (affirmative, negative) and three tenses
    (present, past, future), in that order. Every non-empty entry found under
    a tense becomes one group: the flattened list of that entry's values.

    Returns a list of such groups.
    """
    paradigm = CPVI().profiling(verb)['paradigm']

    groups = []
    for register in ('informal', 'formal'):
        for polarity in ('affirmative', 'negative'):
            for tense in ('present', 'past', 'future'):
                entries = paradigm[register]['Persian'][polarity][tense].values()
                for entry in entries:
                    # Empty / missing entries carry no forms and are skipped.
                    if entry:
                        groups.append(flatten_list(list(entry.values())))

    return groups

groups = get_allshapes('هست')
# Show one random group. `random.randint(0, len(groups))` includes the upper
# bound, so it could produce the out-of-range index len(groups) and raise
# IndexError; `random.choice` always picks an existing element.
random.choice(groups)

In [None]:
from hazm import POSTagger, word_tokenize

# POS tagger used by get_with_allverbs below to pick out the tokens tagged 'V'.
tagger = POSTagger(model='../resources/postagger.model')

def get_with_allverbs(text):
    """Generate variants of ``text`` with each verb swapped for related forms.

    Every token tagged 'V' is looked up with ``get_allshapes``. For each
    returned group that contains the token itself, one variant of ``text``
    is produced per form in that group.

    Returns the list of variant texts (empty when nothing matches).
    """
    tagged = tagger.tag(word_tokenize(text))
    verb_tokens = [entry[0] for entry in tagged if entry[1] == 'V']

    variants = []
    for verb in verb_tokens:
        for shapes in get_allshapes(verb):
            if verb in shapes:
                # NOTE(review): str.replace substitutes every occurrence of
                # the verb as a substring, including inside longer words.
                variants.extend(text.replace(verb, shape) for shape in shapes)

    return variants

# Print one random raw comment followed by its numbered verb variants.
for text in df['commentText'].sample(n=1):
    new_texts = [
        '%s: %s' % (number, variant)
        for number, variant in enumerate(get_with_allverbs(text), start=1)
    ]
    print('orig:', text, *new_texts, sep='\n')

### Scenario 6: Normalize Unconventional Words

In [None]:
from hazm import word_tokenize
from collections import Counter

# Count how often each token occurs across all stopword-free comments.
counter = Counter(
    token
    for comment in df['no_stopwords']
    for token in word_tokenize(comment)
)

counter.most_common(20)

In [None]:
# Vocabulary made of the more frequent half of all distinct tokens.
half_commens = {token for token, _count in counter.most_common(len(counter) // 2)}

In [None]:
import re

def normalize_badwords(word):
    """Collapse elongated (repeated) characters in a single token.

    Every run of two or more identical characters is reduced to a single
    character. Runs of 'ی' are ambiguous (the proper spelling may need one
    or two of them), so for those the spelling is looked up in the
    module-level ``half_commens`` vocabulary: spellings with fewer 'ی' are
    tried first and the first one found is returned; when none is found,
    two 'ی' are kept in every such run.

    The previous implementation only touched the first run, so a word with
    several elongations was left partly elongated. Words with a single run
    are handled exactly as before.
    """
    from itertools import product

    runs = list(re.finditer(r'(.)\1+', word))
    if not runs:
        return word

    # Alternate literal segments with the spellings each run may collapse to
    # (preferred spelling first).
    pieces = []
    cursor = 0
    for run in runs:
        char = run.group(1)
        pieces.append((word[cursor:run.start()],))
        pieces.append((char, char * 2) if char == 'ی' else (char,))
        cursor = run.end()
    pieces.append((word[cursor:],))

    ambiguous = sum(len(options) > 1 for options in pieces)
    if ambiguous == 0:
        # No 'ی' run: there is one possible result and no lookup is needed.
        return ''.join(options[0] for options in pieces)

    fallback = ''.join(options[-1] for options in pieces)

    # The candidate count doubles with every ambiguous run; skip the lookup
    # for pathological tokens instead of enumerating thousands of spellings.
    max_ambiguous_runs = 10
    if ambiguous > max_ambiguous_runs:
        return fallback

    for candidate in product(*pieces):
        spelling = ''.join(candidate)
        if spelling in half_commens:
            return spelling
    return fallback

# Demo: two elongated words whose repeated character is 'ی', so the result depends on half_commens.
print(normalize_badwords('عالیییییه'), ' و ', normalize_badwords('پاییییین'))

In [None]:
from hazm import word_tokenize

def normalize_badtexts(text):
    """Tokenise ``text``, normalise each token with ``normalize_badwords``
    and return the tokens joined by single spaces."""
    cleaned_tokens = map(normalize_badwords, word_tokenize(text))
    return ' '.join(cleaned_tokens)

# Demo: normalise a whole sentence that contains several elongated tokens.
print(normalize_badtexts('سلاااااام به‌به!!!! چه تی‌ای گلییییییی. درخدمت باشییییم'))

### The Final Check For Achieving The Right Data

In [None]:
from hazm import Normalizer, stopwords_list, word_tokenize, Lemmatizer, Stemmer

# hazm components for the recap: normalizer, stemmer and lemmatizer.
normalizer = Normalizer()
stemmer = Stemmer()
lemmatizer = Lemmatizer()

# Normalised and whitespace-trimmed version of every raw comment.
df['normalized'] = df['commentText'].apply(
    lambda comment: normalizer.normalize(comment).strip())

def context_exchanger(word):
    """Reduce ``word`` to a lemma part when one is available, else to its stem.

    When the lemmatizer output contains '#', the second '#'-separated field
    is returned; 'ن' is prepended when ``word`` starts with 'ن' but that
    field does not. Otherwise the stemmer output is returned.
    """
    lemma = lemmatizer.lemmatize(word)
    stem = stemmer.stem(word)

    if '#' not in lemma:
        return stem

    # NOTE(review): presumably hazm encodes verb lemmas as 'past#present',
    # which would make this the present stem -- confirm against hazm docs.
    second_field = lemma.split('#')[1]
    keep_prefix = word.startswith('ن') and not second_field.startswith('ن')
    return ('ن' if keep_prefix else '') + second_field

# Lemma/stem version of every normalised comment.
df['lemz_or_stem'] = df['normalized'].apply(
    lambda comment: ' '.join(map(context_exchanger, word_tokenize(comment))))

# NOTE(review): the empty list looks like a placeholder for extra,
# project-specific stopwords -- confirm with the author.
stopwords = set(stopwords_list() + [])
# Stopword-free version of every normalised comment.
df['no_stopwords'] = df['normalized'].apply(
    lambda comment: ' '.join(
        token for token in word_tokenize(comment) if token not in stopwords))

In [None]:
from pandas import option_context

# Show ten random rows of each text variant next to its label, with wide
# columns so the comments are not truncated.
with option_context('display.max_colwidth', 400):
    display(df[['no_stopwords', 'feeling']].sample(n=10))
    display(df[['lemz_or_stem', 'feeling']].sample(n=10))

# Construction Of Final Dataset According To The Scenarios 

In [1]:
from hazm import POSTagger, word_tokenize, Lemmatizer, Normalizer, stopwords_list
from collections import Counter
from langdetect import detect

import re
import pandas as pd

In [2]:
# Reload the raw comments CSV so the final pipeline starts from unmodified data.
df = pd.read_csv('./comment-feeling-1650617369.csv')

def is_persian(text):
    """Return True when langdetect identifies ``text`` as Persian ('fa').

    Any error raised by ``detect`` is treated as "not Persian" and yields
    False, so one undetectable row does not abort the whole filter.
    """
    try:
        return detect(text) == 'fa'
    except Exception:
        # `except Exception` instead of a bare `except` keeps the best-effort
        # behaviour while letting KeyboardInterrupt / SystemExit propagate,
        # so a long-running cell can still be interrupted.
        return False
    
# Keep only the rows detected as Persian. `.copy()` makes the result an
# independent frame, so the column assignments in the following cells do not
# trigger pandas' SettingWithCopyWarning on a boolean-mask slice.
df = df[df.commentText.apply(is_persian)].copy()

In [3]:
stopwords = set(stopwords_list())
# Stopword-free version of every raw comment.
df['no_stopwords'] = df.commentText.apply(
    lambda x: ' '.join(y for y in word_tokenize(x) if y not in stopwords))

# Keep only comments that still contain at least one space. `.copy()`
# detaches the filtered frame so the step_* columns added later do not raise
# SettingWithCopyWarning.
df = df[df.no_stopwords.str.count(' ') > 0].copy()

# Count tokens directly instead of calling `Series.apply` purely for its side
# effect on the counter (which also built a throwaway Series of None).
counter = Counter(
    token for comment in df.no_stopwords for token in word_tokenize(comment))
# Vocabulary made of the more frequent half of all distinct tokens; consulted
# by normalize_badwords.
half_commens = {token for token, _count in counter.most_common(len(counter) // 2)}

In [4]:
def normalize_badwords(word):
    """Collapse elongated (repeated) characters in a single token.

    Every run of two or more identical characters is reduced to a single
    character. Runs of 'ی' are ambiguous (the proper spelling may need one
    or two of them), so for those the spelling is looked up in the
    module-level ``half_commens`` vocabulary: spellings with fewer 'ی' are
    tried first and the first one found is returned; when none is found,
    two 'ی' are kept in every such run.

    The previous implementation only touched the first run, so a word with
    several elongations was left partly elongated. Words with a single run
    are handled exactly as before.
    """
    from itertools import product

    runs = list(re.finditer(r'(.)\1+', word))
    if not runs:
        return word

    # Alternate literal segments with the spellings each run may collapse to
    # (preferred spelling first).
    pieces = []
    cursor = 0
    for run in runs:
        char = run.group(1)
        pieces.append((word[cursor:run.start()],))
        pieces.append((char, char * 2) if char == 'ی' else (char,))
        cursor = run.end()
    pieces.append((word[cursor:],))

    ambiguous = sum(len(options) > 1 for options in pieces)
    if ambiguous == 0:
        # No 'ی' run: there is one possible result and no lookup is needed.
        return ''.join(options[0] for options in pieces)

    fallback = ''.join(options[-1] for options in pieces)

    # The candidate count doubles with every ambiguous run; skip the lookup
    # for pathological tokens instead of enumerating thousands of spellings.
    max_ambiguous_runs = 10
    if ambiguous > max_ambiguous_runs:
        return fallback

    for candidate in product(*pieces):
        spelling = ''.join(candidate)
        if spelling in half_commens:
            return spelling
    return fallback


# hazm components shared by the final pipeline: POS tagger (model loaded from
# ../resources), lemmatizer and normalizer.
tagger = POSTagger(model='../resources/postagger.model')
lemmatizer = Lemmatizer()
normalizer = Normalizer()


def lemmatize(word, _type):
    """Lemmatize ``word`` unless its tag is 'V'; verbs are returned unchanged."""
    if _type == 'V':
        return word
    return lemmatizer.lemmatize(word)


def add_not(old, new):
    """Return True when ``old`` starts with 'ن' and ``new`` does not."""
    return old.startswith('ن') and not new.startswith('ن')


def change(word, _type):
    """Tell whether a 'V'-tagged ``word`` containing '#' needs a 'ن' prefix.

    The word is split on '#' and both parts are handed to ``add_not``, so it
    must contain exactly one '#'. Returns False for any other tag or when
    there is no '#'.
    """
    if _type != 'V' or '#' not in word:
        return False
    return add_not(*word.split('#'))

In [5]:
def setp_1(text):
    """Step 1: tokenise ``text`` and clean every token.

    Each '#' inside a token is replaced with '_' and the token is then passed
    through ``normalize_badwords``. The cleaned tokens are returned joined by
    single spaces.
    """
    cleaned = []
    for token in word_tokenize(text):
        # NOTE(review): '#' is presumably replaced because later code treats
        # it as a separator inside lemmas -- confirm.
        cleaned.append(normalize_badwords(token.replace('#', '_')))
    return ' '.join(cleaned)


def setp_3(tags):
    """Step 3: drop the pairs whose tag (second item) is 'P', 'CONJ', 'PRO' or 'DET'.

    Returns the remaining pairs in their original order.
    """
    dropped_tags = {'P', 'CONJ', 'PRO', 'DET'}
    kept = []
    for pair in tags:
        if pair[1] in dropped_tags:
            continue
        kept.append(pair)
    return kept


# Step 1: cleaned, space-joined tokens for every raw comment.
df['step_1'] = df['commentText'].apply(setp_1)
# Step 2: POS-tag every comment, fed to the tagger as a list of tokens.
df['step_2'] = tagger.tag_sents(df['step_1'].apply(str.split))
# Step 3: drop the pairs whose tag setp_3 filters out.
df['step_3'] = df['step_2'].apply(setp_3)

In [6]:
def proccessing(tags):
    """Turn a list of (word, tag) pairs into one normalised string.

    Each word goes through ``lemmatize``; 'ن' is prepended when ``change``
    says so for the lemmatized word and its tag. The resulting words are
    joined with spaces, normalised and stripped.
    """
    words = []
    for word, _type in tags:
        lemma = lemmatize(word, _type)
        # NOTE(review): `lemmatize` returns 'V'-tagged words unchanged and
        # setp_1 replaces '#' in tokens, so `change` looks like it can never
        # be true for data coming from this pipeline -- confirm the intent.
        prefix = 'ن' if change(lemma, _type) else ''
        words.append(prefix + lemma)

    return normalizer.normalize(' '.join(words)).strip()


# Fully preprocessed text built from the filtered (word, tag) pairs.
df['preprocessed'] = df['step_3'].apply(proccessing)
# Normalised-only version of the raw comment (not stripped here).
df['normalized'] = df['commentText'].apply(normalizer.normalize)

In [7]:
# Stack both text variants under a common `text` column, each paired with its
# `feeling` label: all normalised rows first, then all preprocessed rows.
dataset = pd.concat([
    df[[column, 'feeling']].rename(columns={column: 'text'})
    for column in ('normalized', 'preprocessed')
])

import time
# The current Unix timestamp in the file name keeps earlier exports from being overwritten.
dataset.to_csv(f'Pre-processed-comments-{int(time.time())}.csv', index=False)