In [1]:
import pandas as pd
import numpy as np

from textblob import TextBlob, Sentence
from nltk.corpus import stopwords
import collections 

In [2]:
from nltk.tokenize import TweetTokenizer, sent_tokenize

In [3]:
# English stop words from the NLTK corpus, held in a set so membership checks are fast.
eng_stopwords = set(stopwords.words("english"))

In [19]:
# Load the raw train/test frames; missing values in every column are replaced
# by a single space, so a missing comment becomes a one-character string.
train = pd.read_csv('data/train.csv').fillna(' ')
test = pd.read_csv('data/test.csv').fillna(' ')

In [48]:
# Default cut-off: a word must occur MORE than this many times to count as "repeated".
repeated_threshold = 15


def count_repeated(text, threshold=None):
    """Count the occurrences of heavily repeated words in ``text``.

    The text is split on whitespace; for every distinct token that occurs
    more than ``threshold`` times, all of its occurrences are added up.

    Parameters
    ----------
    text : str
        Text to analyse.
    threshold : int, optional
        Occurrence count a token must exceed. Defaults to the module-level
        ``repeated_threshold``.

    Returns
    -------
    int
        Total occurrences of all tokens above the threshold (0 if none).
    """
    if threshold is None:
        # Looked up at call time so reassigning the module-level constant
        # in a later cell still takes effect.
        threshold = repeated_threshold
    word_counts = collections.Counter(text.split())
    # The sum does not depend on order, so the counts are not sorted first.
    return sum(count for count in word_counts.values() if count > threshold)
# Rebuilds the same stop-word set as the earlier `eng_stopwords = ...` cell (identical expression).
eng_stopwords = set(stopwords.words("english"))

def purity(sentences):
    """Measure how one-sided the sentiment of a list of sentences is.

    Computes ``sum(polarity) / sum(|polarity|)`` over the TextBlob polarity
    of each sentence. The result lies in [-1, 1]: +1 when every non-neutral
    sentence is positive, -1 when every one is negative, and values near 0
    when positive and negative sentences cancel out.

    Parameters
    ----------
    sentences : iterable of str
        Sentences of a single comment.

    Returns
    -------
    float
        The purity score, or NaN when it is undefined (no sentences, or all
        sentences have polarity 0).
    """
    polarities = np.array([TextBlob(x).sentiment.polarity for x in sentences])
    total_magnitude = np.abs(polarities).sum()
    if total_magnitude == 0:
        # 0/0 is undefined: return NaN explicitly instead of relying on a
        # NumPy division that emits a RuntimeWarning for every such comment.
        return np.nan
    return polarities.sum() / total_magnitude

In [49]:
def generate_text_features(df):
    """Derive surface-level and sentiment features for every comment in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``id`` column and a ``comment_text`` column of strings.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df``, with these columns in order: ``id``,
        ``comment_text``, ``total_length`` (characters), then for each of
        ``capitals``, ``stopwords``, ``exclamation_marks``, ``unique_words``
        and ``repeated_words`` the raw count followed by its ``*_vs_length``
        ratio (count divided by ``total_length``), then ``sentences`` (list
        produced by ``sent_tokenize``), ``polarity_1st_sent``,
        ``subjectivity_1st_sent``, ``polarity_last_sent``, ``polarity``,
        ``subjectivity`` and ``purity``.

    Notes
    -----
    * Stop words are counted as whole, lower-cased, whitespace-separated
      tokens (the same tokenisation used for ``unique_words`` and
      ``repeated_words``). Counting substrings instead would match stop
      words such as "a" or "i" inside other words.
    * A comment for which ``sent_tokenize`` returns no sentences gets
      neutral (0.0) first/last-sentence scores instead of raising
      ``IndexError``.
    * A zero-length comment yields NaN in the ``*_vs_length`` columns.
    """
    def _sentence_sentiment(sentences, position):
        """Return (polarity, subjectivity) of ``sentences[position]``; (0.0, 0.0) if there are none."""
        if not sentences:
            return 0.0, 0.0
        sentiment = TextBlob(sentences[position]).sentiment
        return sentiment.polarity, sentiment.subjectivity

    tdf = pd.DataFrame()
    tdf['id'] = df['id']
    tdf['text'] = df['comment_text']

    tdf['total_length'] = tdf['text'].apply(len)

    tdf['capitals'] = tdf['text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    tdf['capitals_vs_length'] = tdf['capitals'] / tdf['total_length']

    # Whole-token membership test; one pass over the comment instead of one
    # substring scan per stop word.
    tdf['stopwords'] = tdf['text'].apply(
        lambda comment: sum(1 for token in comment.lower().split() if token in eng_stopwords))
    tdf['stopwords_vs_length'] = tdf['stopwords'] / tdf['total_length']

    tdf['exclamation_marks'] = tdf['text'].apply(lambda comment: comment.count('!'))
    tdf['exclamation_marks_vs_length'] = tdf['exclamation_marks'] / tdf['total_length']

    tdf['unique_words'] = tdf['text'].apply(lambda comment: len(set(comment.split())))
    tdf['unique_words_vs_length'] = tdf['unique_words'] / tdf['total_length']

    tdf['repeated_words'] = tdf['text'].apply(count_repeated)
    tdf['repeated_words_vs_length'] = tdf['repeated_words'] / tdf['total_length']

    tdf['sentences'] = tdf['text'].apply(sent_tokenize)

    # Analyse the first sentence once and reuse the result for both scores.
    first_sentiments = [_sentence_sentiment(s, 0) for s in tdf['sentences']]
    tdf['polarity_1st_sent'] = [polarity for polarity, _ in first_sentiments]
    tdf['subjectivity_1st_sent'] = [subjectivity for _, subjectivity in first_sentiments]

    tdf['polarity_last_sent'] = [_sentence_sentiment(s, -1)[0] for s in tdf['sentences']]

    # Same for the whole comment: one TextBlob analysis feeds both columns.
    text_sentiments = [TextBlob(comment).sentiment for comment in tdf['text']]
    tdf['polarity'] = [sentiment.polarity for sentiment in text_sentiments]
    tdf['subjectivity'] = [sentiment.subjectivity for sentiment in text_sentiments]

    # purity() is NaN when undefined; assign the filled Series back, because an
    # in-place fillna on a selected column is not guaranteed to update the frame.
    tdf['purity'] = tdf['sentences'].apply(purity).fillna(0)

    return tdf.rename(columns={'text': 'comment_text'})
    

In [35]:
# Columns exported to the *_text_features.csv files: the identifier, the raw
# comment text, then the derived ratio and sentiment features. The raw count
# columns produced by generate_text_features are not listed here.
features = [
    'id',
    'comment_text',
    'capitals_vs_length',
    'stopwords_vs_length',
    'exclamation_marks_vs_length',
    'unique_words_vs_length',
    'repeated_words_vs_length',
    'sentences',
    'polarity_1st_sent',
    'subjectivity_1st_sent',
    'polarity_last_sent',
    'polarity',
    'subjectivity',
    'purity',
]

In [44]:
# Compute the feature frame for the training comments.
train_rez = generate_text_features(train)

In [45]:
# Save only the columns listed in `features`; the row index is not written.
train_rez[features].to_csv('train_text_features.csv', index=False)

In [57]:
# Show the test rows that have no sentences.
# NOTE(review): no cell visible here creates the `sentences_len` column, so this
# relies on kernel state from elsewhere - confirm it survives Restart & Run All.
test[test['sentences_len'] == 0]

Unnamed: 0,id,comment_text,sentences,sentences_len
55142,5bbabc3b14cc1f7f,,[],0


In [60]:
# Shape of `test` with the comment id 5bbabc3b14cc1f7f filtered out.
test[test['id'] != '5bbabc3b14cc1f7f'].shape

(153163, 4)

In [8]:
# Peek at one randomly chosen test comment (no seed is set, so the output varies between runs).
test['comment_text'].sample(1).iloc[0]

"This guy is a nothing.  This article shouldn't even exist."

In [9]:
# Reload previously saved test features.
# NOTE(review): this reads from the working directory, while the final cell
# writes to 'data/test_text_features.csv' - confirm which path is intended.
test_rez = pd.read_csv('test_text_features.csv')

In [10]:
# Number of rows in the loaded test frame.
len(test)

153164

In [11]:
# Number of rows in the reloaded feature file, for comparison with len(test).
len(test_rez)

153163

In [20]:
# Attach the saved features to `test` by id. The column list is derived from
# `features` (minus 'comment_text', which `test` already holds) so the merge and
# the export cannot drift apart. A left join keeps every test row, including
# any row for which no features were saved.
merge_columns = [column for column in features if column != 'comment_text']
test = test.merge(test_rez[merge_columns], on='id', how='left')

In [34]:
# Row count of `test` after the merge (expected to match the pre-merge count).
len(test)

153164

In [23]:
# Replace every NaN in `test` with 0, in place; this covers feature columns of
# rows that had no match in the left merge.
test.fillna(0, inplace=True)

In [29]:
# Give the comment that has no sentences an explicit empty list. `.at` writes
# directly into `test`; a chained `.loc[...].iloc[0] = ...` would only modify a
# temporary Series and leave `test` unchanged.
for row_label in test.index[test['id'] == '5bbabc3b14cc1f7f']:
    test.at[row_label, 'sentences'] = []

In [61]:
# Build features for all test comments except id 5bbabc3b14cc1f7f (the row
# shown earlier with an empty `sentences` list).
test_rez = generate_text_features(test[test['id'] != '5bbabc3b14cc1f7f'])

In [37]:
# Write the `features` columns of the merged `test` frame; the row index is not written.
test[features].to_csv('data/test_text_features.csv', index=False)