In [1]:
# -*- coding: utf-8 -*-

######################
## Required Modules ##
######################

import numpy as np
import pandas as pd

import re
import json
import nltk.corpus
try:
   import cPickle as pkl
except:
   import pickle as pkl
from csv import DictReader
from datetime import datetime
from nltk import SnowballStemmer
from collections import defaultdict
from gensim import corpora, models
# from ngram import getUnigram, getBigram

pd.set_option("display.max_columns", 99)

FOLDER = "/home/evgeny/kaggle/input/"
FOLDER_MEDIA = "/media/shared_ardalan_evgeny/"

In [2]:
###########
## Setup ##
###########
stopwords= frozenset(word \
                     for word in nltk.corpus.stopwords.words("russian") \
                     if word!="не")
stemmer = SnowballStemmer('russian')
engChars = [ord(char) for char in u"cCyoOBaAKpPeE"]
rusChars = [ord(char) for char in u"сСуоОВаАКрРеЕ"]
eng_rusTranslateTable = dict(zip(engChars, rusChars))
rus_engTranslateTable = dict(zip(rusChars, engChars))

In [3]:
def tryDivide(x, y):
    """ Try to divide two numbers"""
    s = 0.0
    if y != 0.0: s = x / y
    return s

def correctWord (w):
    """ Corrects word by replacing characters with written similarly depending
    on which language the word. 
    Fraudsters use this technique to avoid detection by anti-fraud algorithms.
    """
    if len(re.findall(r"[а-я]",w))>len(re.findall(r"[a-z]",w)):
        return w.translate(eng_rusTranslateTable)
    else:
        return w.translate(rus_engTranslateTable)
    
def getWordCharCount(w):
    """ Char count for a word."""
    rus = len(re.findall(r"[а-я]",w))
    eng = len(re.findall(r"[a-z]",w))
    num = len(re.findall(r"[0-9]",w))
    c = len(w)    
    return c, rus, eng, num

In [13]:
def getTextStatsFeat(text, ndoc, stemmRequired = True,
                     excludeStopwordsRequired = True):
    
    """ Get stats features for raw text.
    These features don't seem to help much.
    """
    #length = len(text)
    sentenceCount = len(re.findall("[.?!]", text))
    exclamationMarkCount = len(re.findall("[!]", text))
    questionMarkCount = len(re.findall("[?]", text))
    digitsCount = len(re.findall("[0-9]+", text))
    text = text.replace(",", " ").replace(".", " ")
    cleanText = re.sub(u'[^a-zа-я0-9]', ' ', text.lower())
    wordCount = 0.0
    charCount = 0.0
    rusCharCount = 0.0
    engCharCount = 0.0
    numCharCount = 0.0
    if excludeStopwordsRequired:
        for w in cleanText.split():
            if len(w)>1 and w not in stopwords:
                if not (not stemmRequired or re.search("[0-9a-z]", w)):
                    w = stemmer.stem(w)
                wordCount += 1
                c, rus, eng, num = getWordCharCount(w)
                charCount += c
                rusCharCount += rus
                engCharCount += eng
                numCharCount += num
    else:
        for w in cleanText.split():
            if len(w)>1:
                if not (not stemmRequired or re.search("[0-9a-z]", w)):
                    w = stemmer.stem(w)
                wordCount += 1
                c, rus, eng, num = getWordCharCount(w)
                charCount += c
                rusCharCount += rus
                engCharCount += eng
                numCharCount += num
    # per sentence
    wordPerSentence = tryDivide(wordCount, sentenceCount)
    charPerSentence = tryDivide(charCount, sentenceCount)
    rusCharPerSentence = tryDivide(rusCharCount, sentenceCount)
    engCharPerSentence = tryDivide(engCharCount, sentenceCount)
    numCharPerSentence = tryDivide(numCharCount, sentenceCount)
    # per word
    charPerWord = tryDivide(charCount, wordCount)
    rusCharPerWord = tryDivide(rusCharCount, wordCount)
    engCharPerWord = tryDivide(engCharCount, wordCount)
    numCharPerWord = tryDivide(numCharCount, wordCount)
    # ratio
    rusCharRatio = tryDivide(rusCharCount, charCount)
    engCharRatio = tryDivide(engCharCount, charCount)
    rusCharVsEngChar = tryDivide(rusCharCount, engCharCount)
    engCharVsRusChar = tryDivide(engCharCount, rusCharCount)
    numCharVsRusChar = tryDivide(numCharCount, rusCharCount)
    numCharVsEngChar = tryDivide(numCharCount, engCharCount)
    
#     stats = [
#         sentenceCount,
#         wordCount,
#         charCount,
#         rusCharCount,
#         engCharCount,
#         digitsCount,      
#         exclamationMarkCount,
#         questionMarkCount,
# #   per sentence
#         wordPerSentence,
#         charPerSentence,
#         rusCharPerSentence,
#         engCharPerSentence,
#         numCharPerSentence,
# #   per word
#         charPerWord,
#         rusCharPerWord,
#         engCharPerWord,
#         numCharPerWord,
# #   ratio
#         rusCharRatio,
#         engCharRatio,
#         rusCharVsEngChar,
#         engCharVsRusChar,
#         numCharVsRusChar,
#         numCharVsEngChar,
#     ]

    stats = {
        "sentenceCount" + ndoc : sentenceCount,
        "wordCount" + ndoc : wordCount,
        "charCount" + ndoc : charCount,
        "rusCharCount" + ndoc : rusCharCount,
        "engCharCount" + ndoc : engCharCount,
        "digitsCount" + ndoc : digitsCount     ,
        "exclamationMarkCount" + ndoc : exclamationMarkCount,
        "questionMarkCount" + ndoc : questionMarkCount,
        "wordPerSentence" + ndoc : wordPerSentence,
        "charPerSentence" + ndoc : charPerSentence,
        "rusCharPerSentence" + ndoc : rusCharPerSentence,
        "engCharPerSentence" + ndoc : engCharPerSentence,
        "numCharPerSentence" + ndoc : numCharPerSentence,
        "charPerWord" + ndoc : charPerWord,
        "rusCharPerWord" + ndoc : rusCharPerWord,
        "engCharPerWord" + ndoc : engCharPerWord,
        "numCharPerWord" + ndoc : numCharPerWord,
        "rusCharRatio" + ndoc : rusCharRatio,
        "engCharRatio" + ndoc : engCharRatio,
        "rusCharVsEngChar" + ndoc : rusCharVsEngChar,
        "engCharVsRusChar" + ndoc : engCharVsRusChar,
        "numCharVsRusChar" + ndoc : numCharVsRusChar,
        "numCharVsEngChar" + ndoc : numCharVsEngChar
    }
#     statsFeat = ""
#     for i,f in enumerate(stats):
#         if f != 0:
#             statsFeat += "%s:%s " % (i+1, f)
#     statsFeat = statsFeat[:-1]    
#     return statsFeat
    return stats

In [5]:
def getCleanSentence(text, stemmRequired = True,
             correctWordRequired = True,
             excludeStopwordsRequired = True):
    """ Splits the text into words, discards stop words and applies stemmer. 
    Parameters
    ----------
    text : str - initial string
    stemmRequired : bool - flag whether stemming required
    correctWordRequired : bool - flag whether correction of words required     
    """
    text = text.replace(",", " ").replace(".", " ")
    cleanText = re.sub(u'[^a-zа-я0-9]', ' ', text.lower())
    if correctWordRequired:
        if excludeStopwordsRequired:
            words = [correctWord(w) \
                    if not stemmRequired or re.search("[0-9a-z]", w) \
                    else stemmer.stem(correctWord(w)) \
                    for w in cleanText.split() \
                    if len(w)>1 and w not in stopwords]
        else:
            words = [correctWord(w) \
                    if not stemmRequired or re.search("[0-9a-z]", w) \
                    else stemmer.stem(correctWord(w)) \
                    for w in cleanText.split() \
                    if len(w)>1]
    else:
        if excludeStopwordsRequired:
            words = [w \
                    if not stemmRequired or re.search("[0-9a-z]", w) \
                    else stemmer.stem(w) \
                    for w in cleanText.split() \
                    if len(w)>1 and w not in stopwords]
        else:
            words = [w \
                    if not stemmRequired or re.search("[0-9a-z]", w) \
                    else stemmer.stem(w) \
                    for w in cleanText.split() \
                    if len(w)>1]
    cleanSentence = ' '.join(words)
    return cleanSentence

In [72]:
print("Loading...")
print("----- LOAD train_merged-part1")
pdtrain1 = pd.read_hdf(FOLDER + "train_merged-part1.h")
print("----- LOAD train_merged-part2")
pdtrain2 = pd.read_hdf(FOLDER + "train_merged-part2.h")
pdtrain = pdtrain1.append(pdtrain2)
# print("----- LOAD test_merged")
# pdtest = pd.read_hdf(FOLDER + "test_merged.h")
# pd_data = pdtrain.append(pdtest)

del pdtrain1
del pdtrain2

print("DONE")

Loading...
----- LOAD train_merged-part1
----- LOAD train_merged-part2
DONE


In [73]:
print("Extract titles")
pd_titles_features = pdtrain[['itemID_1', 'itemID_2','title_1','title_2', 'isDuplicate']]

print("DONE")

Extract titles
DONE


In [82]:
print("Replacing NaN")
pd_titles_features['title_1'].fillna("", inplace=True)
pd_titles_features['title_2'].fillna("", inplace=True)

Replacing NaN


In [74]:
del pdtrain

In [63]:
# columns = [
# 'sentenceCount',
# 'wordCount',
# 'charCount',
# 'rusCharCount',
# 'engCharCount',
# 'digitsCount',
# 'exclamationMarkCount',
# 'questionMarkCount',
# 'wordPerSentence',
# 'charPerSentence',
# 'rusCharPerSentence',
# 'engCharPerSentence',
# 'numCharPerSentence',
# 'charPerWord',
# 'rusCharPerWord',
# 'engCharPerWord',
# 'numCharPerWord',
# 'rusCharRatio',
# 'engCharRatio',
# 'rusCharVsEngChar',
# 'engCharVsRusChar',
# 'numCharVsRusChar',
# 'numCharVsEngChar'
# ]
# test = pd.DataFrame(0.0, index=pd_titles_features.head().index, columns=columns)
# test


In [83]:
print("Processing title_1")
pd_titles_features = pd_titles_features.merge(pd_titles_features.title_1.apply(lambda x: pd.Series(getTextStatsFeat(x, '_1'))), 
           left_index=True, right_index=True)

print("Processing title_2")
pd_titles_features = pd_titles_features.merge(pd_titles_features.title_2.apply(lambda x: pd.Series(getTextStatsFeat(x, '_2'))), 
           left_index=True, right_index=True)

print("Done")

Processing title_1
Processing title_2
Done


In [86]:
pd_titles_features.to_hdf("titles_dumy_features.h", 'w')

In [95]:
del pd_titles_features

In [6]:
print("Loading...")
print("----- LOAD train_merged-part1")
pdtrain1 = pd.read_hdf(FOLDER + "train_merged-part1.h")
print("----- LOAD train_merged-part2")
pdtrain2 = pd.read_hdf(FOLDER + "train_merged-part2.h")
pdtrain = pdtrain1.append(pdtrain2)
# print("----- LOAD test_merged")
# pdtest = pd.read_hdf(FOLDER + "test_merged.h")
# pd_data = pdtrain.append(pdtest)

del pdtrain1
del pdtrain2

print("DONE")

Loading...
----- LOAD train_merged-part1
----- LOAD train_merged-part2
DONE


In [7]:
print("Extract titles")
pd_description_features = pdtrain[['itemID_1', 'itemID_2','description_1','description_2', 'isDuplicate']]

print("DONE")

Extract titles
DONE


In [8]:
del pdtrain

In [10]:
pd_description_features.head()

Unnamed: 0,itemID_1,itemID_2,description_1,description_2,isDuplicate
0,1,4112648,Продам Камаз 6520 20 тонн,Продам Камаз 6520 20 тонн,1
1,523245,739258,Продам в хорошем состоянии 2 пары ботинок для ...,Продам в хорошем состоянии сноуборд размер 150см,0
2,739258,2558827,Продам в хорошем состоянии сноуборд размер 150см,Продам новые крепления \nЦена 2500 руб,0
3,280620,970311,Продам или обмен мерседес бенц в хорошем состо...,"Продам в хорошем состоянии,рестайлинг,двигател...",1
4,970311,4402682,"Продам в хорошем состоянии,рестайлинг,двигател...","Продам ,автомат не пинает,масло от замены до ...",1


In [11]:
print("Replacing NaN")
pd_description_features['description_1'].fillna("", inplace=True)
pd_description_features['description_2'].fillna("", inplace=True)

Replacing NaN


In [14]:
print("Processing descriprion_1")
pd_description_features = pd_description_features.merge(pd_description_features.description_1.apply(lambda x: pd.Series(getTextStatsFeat(getCleanSentence(x), '_1'))), 
           left_index=True, right_index=True)

print("Processing descriprion_2")
pd_description_features = pd_description_features.merge(pd_description_features.description_1.apply(lambda x: pd.Series(getTextStatsFeat(getCleanSentence(x), '_2'))), 
           left_index=True, right_index=True)

print("Done")

Processing descriprion_1
Processing descriprion_2
Done


In [15]:
pd_titles_features.to_hdf("description_dumy_features.h", 'w')
print("DONE")

NameError: name 'pd_titles_features' is not defined