In [None]:
import numpy as np
import pandas as pd

import gc
import re
import json
import pickle

# Let pandas display up to 99 columns when a frame is shown
pd.set_option("display.max_columns", 99)

# Directory prefix of the data files read and written by the cells below;
# it is concatenated directly with file names, so the trailing "/" matters
FOLDER = "/media/shared_ardalan_evgeny/processed_data/"
# Corpus tag embedded in the input and output file names
CORPUS  = "test"
# Feature name: embedded in the file names and used as a column label
FEATURE = "attrsJSON"

In [3]:
def tryDivide(x, y):
    """ Divide x by y, falling back to 0.0 when the divisor is zero """
    # A zero divisor yields the neutral value 0.0 instead of raising
    return x / y if y != 0.0 else 0.0

def getWordCharCount(w):
    """ Char count for a word

    Returns a tuple (c, rus, eng):
      c   - total number of characters in w
      rus - number of lower-case Cyrillic letters (а-я plus ё)
      eng - number of lower-case Latin letters (a-z)

    Only lower-case letters are matched; upper-case letters contribute
    to c but to neither rus nor eng.
    """
    # "ё" (U+0451) lies outside the а-я code point range (U+0430-U+044F),
    # so it has to be listed explicitly or it is silently not counted
    rus = len(re.findall(r"[а-яё]", w))
    eng = len(re.findall(r"[a-z]", w))
    c = len(w)
    return c, rus, eng

def getTextStatsFeat(text, text_processed = False):
    """ Get stats features for a text

    Parameters
    ----------
    text : str, None or float NaN
        Text to describe. None and NaN are treated as an empty string.
    text_processed : bool
        False - text is raw: ".", "?" and "!" are counted first, then the
        text is lower-cased and every character that is not a lower-case
        Latin letter, a lower-case Cyrillic letter (а-я, ё) or a digit is
        replaced by a space.
        True - text is used as is, and the punctuation / per-sentence
        features are left out of the result.

    Returns
    -------
    dict
        Feature name -> value. Words are whitespace-separated tokens longer
        than one character. "digitsCount" is the number of digit runs
        (matches of [0-9]+), not the number of digit characters. Every ratio
        is 0.0 when its denominator is zero.
    """
    # Missing values (None, or the float NaN pandas uses for them) are
    # handled as empty text instead of crashing inside the regex calls
    if text is None or (isinstance(text, float) and text != text):
        text = ""

    sentenceCount = 0.0
    exclamationMarkCount = 0.0
    questionMarkCount = 0.0

    if not text_processed:
        # Punctuation must be counted before it is stripped below
        sentenceCount = len(re.findall("[.?!]", text))
        exclamationMarkCount = len(re.findall("[!]", text))
        questionMarkCount = len(re.findall("[?]", text))
        # "ё" (U+0451) is outside the а-я range; without listing it here it
        # would be turned into a space and split the word it belongs to
        text = re.sub(u'[^a-zа-яё0-9]', ' ', text.lower())

    digitsCount = len(re.findall("[0-9]+", text))
    wordCount = 0.0
    charCount = 0.0
    rusCharCount = 0.0
    engCharCount = 0.0

    for w in text.split():
        # single-character tokens are not considered words
        if len(w) > 1:
            wordCount += 1
            c, rus, eng = getWordCharCount(w)
            charCount += c
            rusCharCount += rus
            engCharCount += eng

    stats = {
        "wordCount" : wordCount,
        "charCount" : charCount,
        "rusCharCount" : rusCharCount,
        "engCharCount" : engCharCount,
        "digitsCount" : digitsCount,
        # per word
        "charPerWord" : tryDivide(charCount, wordCount),
        "rusCharPerWord" : tryDivide(rusCharCount, wordCount),
        "engCharPerWord" : tryDivide(engCharCount, wordCount),
        "numCharPerWord" : tryDivide(digitsCount, wordCount),
        # ratio
        "rusCharRatio" : tryDivide(rusCharCount, charCount),
        "engCharRatio" : tryDivide(engCharCount, charCount),
        "rusCharVsEngChar" : tryDivide(rusCharCount, engCharCount),
        "engCharVsRusChar" : tryDivide(engCharCount, rusCharCount),
        "numCharVsRusChar" : tryDivide(digitsCount, rusCharCount),
        "numCharVsEngChar" : tryDivide(digitsCount, engCharCount)
    }

    if not text_processed:
        # punctuation and per-sentence features only exist for raw text
        stats.update({
            "sentenceCount" : sentenceCount,
            "exclamationMarkCount" : exclamationMarkCount,
            "questionMarkCount" : questionMarkCount,
            "wordPerSentence" : tryDivide(wordCount, sentenceCount),
            "charPerSentence" : tryDivide(charCount, sentenceCount),
            "rusCharPerSentence" : tryDivide(rusCharCount, sentenceCount),
            "engCharPerSentence" : tryDivide(engCharCount, sentenceCount),
            "numCharPerSentence" : tryDivide(digitsCount, sentenceCount)
        })

    return stats

In [4]:
# Load the pre-processed item table for the configured corpus / feature
ItemInfo = pd.read_hdf(FOLDER+"ItemInfo_"+CORPUS+"_"+FEATURE+"_processed.h")
# Assign the filled column back instead of calling fillna(inplace=True) on
# the column selection: with pandas Copy-on-Write the in-place call works on
# a temporary object and leaves ItemInfo unchanged.
# NOTE(review): only the FEATURE column is filled here; the other columns of
# the table are not touched by this cell.
ItemInfo[FEATURE] = ItemInfo[FEATURE].fillna("")
print(ItemInfo.columns)
print(ItemInfo.shape)

Index(['itemID', 'attrsJSON', 'attrsJSON_titles', 'attrsJSON_description'], dtype='object')
(1315205, 4)


In [5]:
# Column whose text statistics are computed; also part of the output file name
FEATURE = "attrsJSON_description"

# Only this one column is needed, so iterate over it directly rather than
# building a Series per row with iterrows(). Missing values are replaced by
# empty strings because the fillna in the loading cell was applied to a
# different column.
texts = ItemInfo[FEATURE].fillna("")

stats = []
# enumerate() counts rows by position, so the progress messages do not
# depend on what the index labels of the frame happen to be
for index, text in enumerate(texts):
    if index % 100000 == 0:
        print("Processed docs :",index)
    stats.append(getTextStatsFeat(text, text_processed = True))

# Free the source table before the feature frame is built
del texts
del ItemInfo
gc.collect()

print("Converting new features to dataframe")
pd_stats = pd.DataFrame.from_records(stats)

del stats
gc.collect()

print("Storing results")
# The context manager closes (and flushes) the file even if pickling fails
with open(FOLDER+"ItemInfo_"+CORPUS+"_"+FEATURE+"_processed_dumy_features.p", "wb") as fout:
    pickle.dump(pd_stats, fout)
print("DONE")

Processed docs : 0
Processed docs : 100000
Processed docs : 200000
Processed docs : 300000
Processed docs : 400000
Processed docs : 500000
Processed docs : 600000
Processed docs : 700000
Processed docs : 800000
Processed docs : 900000
Processed docs : 1000000
Processed docs : 1100000
Processed docs : 1200000
Processed docs : 1300000
Converting new features to dataframe
Storing results
DONE
