In [None]:
# Pandas - Toxic Comments EDA

# basics
import pandas as pd 
import numpy as np

# Load data
# NOTE(review): train and test are read from different directories
# ('train.csv' vs '../data/Toxis comments/test.csv') - confirm both paths.
train = pd.read_csv('train.csv')
test = pd.read_csv('../data/Toxis comments/test.csv')

# First look. The bare expressions below are notebook-style inspection
# steps; they produce no output when the file is run as a plain script.
train.head(20)
test.head(20)

print(train.shape)
print(test.shape)

train.info()
test.info()

train.describe()
train['toxic'].sample(10)
train['toxic'].value_counts()
train['toxic'].value_counts(normalize=True)

train.isnull().head()

print("Check for missing values in Train dataset")
null_check = train.isnull().sum()
print(null_check)
print("filling NA with \"unknown\"")
# Actually perform the fill announced by the message above; previously the
# message was printed but no value was ever replaced.
train["comment_text"] = train["comment_text"].fillna("unknown")
test["comment_text"] = test["comment_text"].fillna("unknown")

# Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('fivethirtyeight')

# Bar chart: number of comments carrying each label.
# `x` used to be read here before anything had been assigned to it.
# NOTE(review): assumes columns 2-7 of `train` are the six binary label
# columns (the same positional slice the spammer summary uses later in this
# file) - confirm against the CSV header.
x = train.iloc[:, 2:8].sum()

plt.figure(figsize=(10,5))
# Pass the data as keywords: recent seaborn releases no longer accept
# positional x/y for barplot.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8)
plt.title("# per class")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('Type ', fontsize=12)

# Write each bar's count just above the bar, horizontally centred on it.
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.show()


# Multi tagging
# Number of labels set on each comment. `rowsums` was never defined in this
# file, so it is computed here.
# NOTE(review): assumes columns 2-7 of `train` are the six binary label
# columns - confirm against the CSV header.
rowsums = train.iloc[:, 2:8].sum(axis=1)
# How many comments carry 0, 1, 2, ... labels.
x = rowsums.value_counts()

#plot
plt.figure(figsize=(8,4))
color = sns.color_palette()
# Keyword x/y: recent seaborn releases no longer accept positional data.
ax = sns.barplot(x=x.index, y=x.values, alpha=0.8, color=color[0])
plt.title("Multiple tags per comment")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('# of tags ', fontsize=12)

# Write each bar's count just above the bar, horizontally centred on it.
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.show()


# Correlation

# `temp_df` used to be read here before its first assignment in this file.
# The code below indexes it by label name ('toxic', 'insult', 'obscene'),
# so it is built from the label columns.
# NOTE(review): assumes columns 2-7 of `train` are the six label columns
# with 'toxic' first - confirm against the CSV header.
temp_df = train.iloc[:, 2:8]

# Pairwise correlation between the label columns, drawn as an annotated heatmap.
corr = temp_df.corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values, annot=True, cmap="YlGnBu")

# Crosstab: co-occurrence counts of the 'insult' and 'obscene' labels.
pd.crosstab(temp_df['insult'], temp_df['obscene'])

# Look at 'toxic' against every other label: one crosstab per label,
# concatenated side by side and keyed by the other label's name.
main_col = "toxic"
corr_mats = []
for other_col in temp_df.columns[1:]:
    confusion_matrix = pd.crosstab(temp_df[main_col], temp_df[other_col])
    corr_mats.append(confusion_matrix)
out = pd.concat(corr_mats, axis=1, keys=temp_df.columns[1:])
out

# Let's read comments
# Show full comment text instead of truncating it. `None` is the supported
# "no limit" value; the former -1 is deprecated and rejected by current pandas.
pd.options.display.max_colwidth = None
train[train.toxic==1].sample(10)

# Print one randomly chosen comment for each of two labels.
print("severe_toxic: \n")
print(train[train.severe_toxic==1]['comment_text'].sample(1).iloc[0])

print("threat: \n")
print(train[train.threat==1]['comment_text'].sample(1).iloc[0])

# Wordcloud
from wordcloud import WordCloud ,STOPWORDS

# Word cloud built from every comment flagged as a threat.
sample = train[train.threat == 1]
text = sample.comment_text.values

wc = WordCloud(
    max_font_size=60,
    background_color="black",
    max_words=2000,
    stopwords=STOPWORDS,
)
# The cloud is generated from all selected comments joined into one string.
wc.generate(" ".join(text))

plt.figure(figsize=(12,6))
plt.axis("off")
# Recolour with a fixed random_state before drawing.
recolored = wc.recolor(colormap='viridis', random_state=17)
plt.imshow(recolored, interpolation="bilinear")
plt.show()

# Feature engineering
import re
import string

# Every feature is derived from the stringified comment and (mostly) from its
# whitespace-separated tokens, so both are computed once up front.
comments = train["comment_text"].apply(str)
tokens = comments.apply(str.split)

# "Sentence" count: number of newline characters plus one.
train['count_sent'] = comments.apply(lambda body: len(re.findall("\n", body)) + 1)
train[:2]

# Word count
train['count_word'] = tokens.apply(len)
# Unique word count
train['count_unique_word'] = tokens.apply(lambda words: len(set(words)))
# Character count
train['count_letters'] = comments.apply(len)
# Punctuation count (characters found in string.punctuation)
train["count_punctuations"] = comments.apply(
    lambda body: sum(1 for ch in body if ch in string.punctuation))
# Count of fully upper-case words
train["count_words_upper"] = tokens.apply(
    lambda words: sum(1 for word in words if word.isupper()))
# Count of title-case words
train["count_words_title"] = tokens.apply(
    lambda words: sum(1 for word in words if word.istitle()))
# Count of stopwords (tokens of the lower-cased comment found in STOPWORDS)
train["count_stopwords"] = comments.apply(
    lambda body: sum(1 for word in body.lower().split() if word in STOPWORDS))
# Mean token length; np.mean of an empty list yields NaN for comments
# that contain no tokens.
train["mean_word_len"] = tokens.apply(
    lambda words: np.mean([len(word) for word in words]))

STOPWORDS
train.sample(2)

# Unique words as a percentage of all words in each comment.
unique_share = train['count_unique_word'] * 100 / train['count_word']
train['word_unique_percent'] = unique_share
# Punctuation characters per 100 words in each comment.
punct_share = train['count_punctuations'] * 100 / train['count_word']
train['punct_percent'] = punct_share

# are longer comments more toxic
# The plots below (and later sections) group by a 'clean' column that this
# file never creates. Derive it here when missing: a comment is clean when
# none of its labels is set.
# NOTE(review): assumes columns 2-7 of `train` are the six binary label
# columns - confirm against the CSV header.
if 'clean' not in train.columns:
    train['clean'] = train.iloc[:, 2:8].sum(axis=1) == 0

plt.figure(figsize=(12,6))
## sentences
plt.subplot(121)
plt.suptitle("Are longer comments more toxic?",fontsize=20)
sns.violinplot(y='count_sent',x='clean', data=train, split=True)
plt.xlabel('Clean?', fontsize=12)
plt.ylabel('# of sentences', fontsize=12)
plt.title("Number of sentences in each comment", fontsize=15)
# words
plt.subplot(122)
sns.violinplot(y='count_word',x='clean', data=train, split=True, inner="quart")
plt.xlabel('Clean?', fontsize=12)
plt.ylabel('# of words', fontsize=12)
plt.title("Number of words in each comment", fontsize=15)

plt.show()


# Spammers
# Spammers: comments in which fewer than 30% of the words are unique.
is_spam = train['word_unique_percent'] < 30
spammers = train[is_spam]

print("Clean Spam example:")
clean_spam = spammers[spammers.clean == 1]
print(clean_spam.comment_text.iloc[1])

print("Toxic Spam example:")
toxic_spam = spammers[spammers.toxic == 1]
print(toxic_spam.comment_text.iloc[2])

# The grouped plots that follow need long-format data: one row per
# (comment, feature) pair, keyed by the 'clean' flag.
temp_df = pd.melt(
    train,
    id_vars='clean',
    value_vars=['count_word', 'count_unique_word'],
)

temp_df.head()

# what's so unique
import matplotlib.gridspec as gridspec 

# 2x2 figure: two panels on the top row, one full-width panel below.
# (The former stand-alone GridSpec(2,2) call was discarded immediately and
# has been dropped; subplot2grid lays the panels out by itself.)
plt.figure(figsize=(16,12))
plt.suptitle("What's so unique ?",fontsize=20)

# Top-left: word count vs unique-word count, split by the 'clean' flag.
plt.subplot2grid((2,2),(0,0))
sns.violinplot(x='variable', y='value', hue='clean', data=temp_df, split=True,inner='quartile')
plt.title("Absolute wordcount and unique words count")
plt.xlabel('Feature', fontsize=12)
plt.ylabel('Count', fontsize=12)

# Top-right: density of the unique-word percentage for non-clean vs clean comments.
plt.subplot2grid((2,2),(0,1))
plt.title("Percentage of unique words of total words in comment")
# `fill=True` replaces the deprecated `shade=True`.
ax=sns.kdeplot(train[train.clean == 0].word_unique_percent, label="Bad",fill=True,color='r')
ax=sns.kdeplot(train[train.clean == 1].word_unique_percent, label="Clean")
plt.legend()
plt.ylabel('Number of occurances', fontsize=12)
plt.xlabel('Percent unique words', fontsize=12)

# Bottom: per-column totals of columns 2-7 among the low-uniqueness comments.
# NOTE(review): presumably these are the six label columns - confirm.
x=spammers.iloc[:,2:8].sum()
plt.subplot2grid((2,2),(1,0),colspan=2)
plt.title("Count of comments with low(<30%) unique words",fontsize=15)
ax=sns.barplot(x=x.index, y=x.values,color=color[3])

# Write each bar's count just above the bar, horizontally centred on it.
rects = ax.patches
labels = x.values
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')

plt.xlabel('Threat class', fontsize=12)
plt.ylabel('# of comments', fontsize=12)
plt.show()

# Conclusion: spammers write more "toxic" comments. This will be a good feature for ML.



In [None]:
import numpy as np
import pandas as pd
import re

from string import punctuation
from wordcloud import STOPWORDS
from collections import Counter

from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Target columns; one binary classifier is trained per entry further down.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Load both splits, replacing missing values with a single space.
# (The train call was previously split as `pd.read_` / `csv(...)` across
# lines, which fails with an AttributeError at runtime.)
train = pd.read_csv('../../data/Toxis comments/train.csv').fillna(' ')
test = pd.read_csv('../../data/Toxis comments/test.csv').fillna(' ')

train.shape

# The comment text of both splits is concatenated into one series, which is
# used below to fit the vectorizer and to count words.
train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

# To count the most common words in the comments, max_features is capped at
# 1000 to avoid a MemoryError.
word_vectorizer = TfidfVectorizer(analyzer='word',
                                  max_features=1000,
                                 )

# Fit on train + test text together, then transform each split separately.
all_word_features = word_vectorizer.fit_transform(all_text)
x_train = word_vectorizer.transform(train_text)
x_test = word_vectorizer.transform(test_text)

# Total TF-IDF weight per term. Summing the sparse matrix directly avoids
# building a dense (documents x terms) array first.
sums = np.asarray(all_word_features.sum(axis=0)).ravel()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on versions that predate it.
if hasattr(word_vectorizer, 'get_feature_names_out'):
    feature_names = word_vectorizer.get_feature_names_out()
else:
    feature_names = word_vectorizer.get_feature_names()

# (term, total weight) pairs, heaviest first.
d = list(zip(feature_names, sums.tolist()))
words_sorted = sorted(d, key=lambda x: x[1], reverse=True)

words_sorted[:5]

def get_words(text):
    """Lower-case *text* and return its runs of letters a-z, in order.

    Any character outside a-z (digits, punctuation, whitespace, ...) acts
    as a separator and is dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'[a-z]+', lowered)]

# Tokenise every comment (train + test) into lower-case a-z words.
words_lists = all_text.apply(get_words)

# Corpus-wide word frequencies. Counter replaces the hand-written
# "if word in dict ... else ..." counting loop; it is a dict subclass, so
# `all_words` can still be used like the plain dict it was before.
all_words = Counter()
for item in words_lists:
    all_words.update(item)

# Five most frequent words via an explicit sort ...
sorted(all_words.items(), key=lambda x: x[1], reverse=True)[:5]

# ... and the same query through Counter.most_common.
result = Counter(all_words)
result.most_common(5)

# Logistic regression
# We will use logistic regression (LogisticRegression) for classification.

# We will train one classifier per class.

# To validate the quality of the model we will use the cross_val_score function.

# Names of the hand-crafted feature columns, in the order they are created.
# Selecting them by name replaces the former positional `columns[-6:]`
# slice, which silently picks the wrong columns if the frame's column
# layout ever changes.
TEXT_FEATURE_COLUMNS = [
    'count_word',
    'count_unique_word',
    'count_punctuations',
    'count_words_upper',
    'count_stopwords',
    'mean_word_len',
]


def _add_text_features(df):
    """Add six text statistics computed from ``df['comment_text']`` in place.

    Columns written: word count, unique word count, punctuation character
    count, upper-case word count, stopword count and mean word length.
    Words are whitespace-separated tokens of the stringified comment.

    Returns the same frame to allow chaining.
    """
    comments = df["comment_text"].apply(str)
    tokens = comments.apply(str.split)

    df['count_word'] = tokens.apply(len)
    # Unique word count
    df['count_unique_word'] = tokens.apply(lambda words: len(set(words)))
    # Punctuation count
    df["count_punctuations"] = comments.apply(
        lambda body: len([c for c in body if c in punctuation]))
    # Upper-case word count
    df["count_words_upper"] = tokens.apply(
        lambda words: len([w for w in words if w.isupper()]))
    # Number of stopwords (matched against the lower-cased comment)
    df["count_stopwords"] = comments.apply(
        lambda body: len([w for w in body.lower().split() if w in STOPWORDS]))
    # Average word length; NaN when a comment has no tokens (np.mean of an
    # empty list) - replaced by 0 when the feature matrix is built below.
    df["mean_word_len"] = tokens.apply(
        lambda words: np.mean([len(w) for w in words]))
    return df


# The same features are computed for both splits by one shared helper
# instead of two copy-pasted blocks.
# TRAIN
_add_text_features(train)

# TEST
_add_text_features(test)

# Sparse matrices of the hand-crafted features (missing values become 0).
features_train = csr_matrix(train[TEXT_FEATURE_COLUMNS].fillna(0))
features_test = csr_matrix(test[TEXT_FEATURE_COLUMNS].fillna(0))

# Append the hand-crafted features to the TF-IDF word features. CSR output
# is requested explicitly: hstack defaults to COO, which does not support
# row indexing.
x_train = hstack([x_train, features_train], format='csr')
x_test = hstack([x_test, features_test], format='csr')

# For every class: cross-validate a logistic regression at each candidate C
# and remember the (C, mean ROC-AUC) pair with the highest score.
scores = []  # one (best C, its mean CV score) tuple per class, in class_names order
c_values = [0.1, 0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 7.5, 10.0]

for class_name in class_names:
    print('CLASS NAME: {}'.format(class_name))

    # The target column is the same for every C tried on this class.
    y_train = train[class_name]
    c_list = []

    for c in c_values:
        classifier = LogisticRegression(C=c, random_state=32)
        fold_scores = cross_val_score(classifier, x_train, y_train, scoring='roc_auc')
        cv_score = np.mean(fold_scores)

        print('CV score for c = {} is {}'.format(c, cv_score))
        c_list.append((c, cv_score))

    # Keep the candidate with the highest mean score for this class.
    best_pair = max(c_list, key=lambda pair: pair[1])
    scores.append(best_pair)

    print('-' * 20)

# Overall score: mean of the per-class best scores.
sc = [pair[1] for pair in scores]
print('Total score is {}'.format(np.mean(sc)))

scores

# Submission frame starts with the test ids; one probability column per
# class is added below.
submission = pd.DataFrame({'id': test['id']})

# From here on `c_values` holds the selected C of each class (first element
# of every tuple in `scores`), replacing the search grid.
c_values = [best[0] for best in scores]

for class_name, c in zip(class_names, c_values):
    y_train = train[class_name]

    # Refit on the full training matrix with the C chosen for this class.
    classifier = LogisticRegression(C=c, random_state=32)
    classifier.fit(x_train, y_train)

    # Column 1 of predict_proba - presumably the positive (label 1) class;
    # verify against classifier.classes_.
    probabilities = classifier.predict_proba(x_test)
    submission[class_name] = probabilities[:, 1]

submission.head()

submission.to_csv('submission.csv', index=False)