In [1]:
# Necessary imports
import nltk
import glob
import scipy
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Collect the per-shard CSV files. glob returns paths in arbitrary, OS-dependent
# order, so sort them to make shard indices (df[0], df[23], ...) reproducible.
pos_files = sorted(glob.glob("data/clean_positive_train_*.csv"))
neg_files = sorted(glob.glob("data/clean_negative_train_*.csv"))

# Hand pandas the path rather than an already-opened handle: pandas then opens
# the file itself with the requested UTF-8 encoding and closes it when done.
# (A handle from open(fp, 'r') is decoded with the platform default encoding
# and was never closed.)
df_pos_list = [pd.read_csv(fp, encoding='utf-8', engine='c') for fp in pos_files]
df_neg_list = [pd.read_csv(fp, encoding='utf-8', engine='c') for fp in neg_files]

In [3]:
# Turn a raw score into a binary class label
def posneg(number):
    """Return 1 when ``number`` is strictly greater than zero, otherwise 0.

    Zero and negative values both map to 0.
    """
    return 1 if number > 0 else 0

Now we build the features for each dataset that we have. Be warned: calculating all of these features takes a very long time...

In [None]:
# Feature-extraction dependencies (kept in this cell, where the notebook imports them)
from profanity_check import predict as prof_predict
from profanity_check import predict_prob
from textblob import TextBlob

# Fixed seed so the row shuffle below gives the same order after a kernel restart
SHUFFLE_SEED = 42

# One combined frame per (positive, negative) file pair.
# Note: zip stops at the shorter of the two lists.
df = [pd.concat([df_pos, df_neg]) for (df_pos, df_neg) in zip(df_pos_list, df_neg_list)]

# Walk every shard that was actually loaded instead of assuming there are exactly 200
for index in range(len(df)):
    # Shuffle the rows so positive and negative samples are interleaved
    shard = df[index].sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

    # Collapse the raw scores to binary labels (1 if > 0, else 0)
    shard['score'] = shard['score'].apply(posneg)
    shard['parent_score'] = shard['parent_score'].apply(posneg)

    texts = shard['text'].astype(str).tolist()
    shard['sentiment'] = [TextBlob(text).sentiment.polarity for text in texts]

    if texts:
        # Score the whole shard with one call per model instead of one call per
        # row: far faster, and it yields flat numeric columns rather than
        # columns of 1-element arrays.
        shard['profanity'] = np.asarray(prof_predict(texts)).astype(int)
        shard['profanity_prob'] = np.asarray(predict_prob(texts)).astype(float)
    else:
        # Empty shard: skip the classifier, which is not expected to accept
        # zero samples (TODO confirm), and just create the empty columns.
        shard['profanity'] = pd.Series(dtype=int)
        shard['profanity_prob'] = pd.Series(dtype=float)

    df[index] = shard


# Each entry of df is now a shuffled dataframe (20000 entries each, per the original
# note) with binary scores plus sentiment and profanity features
df[0].head()

In [None]:
# Summary statistics of the numeric columns in the first shard
df[0].describe()

In [None]:
# Same summary for the shard at index 23 (presumably an arbitrary spot check; needs at least 24 shards)
df[23].describe()

In [None]:
# Preview the feature matrix of the first shard: one row per sample,
# one column each for sentiment and profanity
sentiment = df[0]['sentiment'].values.reshape(-1, 1)
profanity = df[0]['profanity'].values.reshape(-1, 1)
features = np.concatenate([sentiment, profanity], axis=1)
print(features.shape)

In [None]:
# Per-shard feature matrices and label vectors
text_data = []
text_score = []

# Per-shard train/test splits; entry n of each list belongs to shard n
X_train = []
X_test = []
y_train = []
y_test = []

# Columns stacked, in this order, into each shard's feature matrix
FEATURE_COLUMNS = ['sentiment', 'profanity', 'profanity_prob']

# Iterate over the shards that actually exist instead of a hard-coded range(200),
# which raised IndexError with fewer shards and silently ignored any extra ones
for shard in df:
    shard_features = np.hstack([shard[column].values[:, None] for column in FEATURE_COLUMNS])
    # posneg is idempotent on 0/1 labels, so this is safe whether or not the
    # score column has already been binarised
    shard_labels = shard['score'].apply(posneg)

    text_data.append(shard_features)
    text_score.append(shard_labels)

    # 80/20 split with a fixed seed so the partition is reproducible
    X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = train_test_split(
        shard_features, shard_labels, test_size=0.20, random_state=42)
    X_train.append(X_train_tmp)
    X_test.append(X_test_tmp)
    y_train.append(y_train_tmp)
    y_test.append(y_test_tmp)

## Finally we can do some neural networks!

In [None]:
# Small fully connected binary classifier over the three hand-made features
# (sentiment, profanity, profanity_prob).
#
# The first layer used to be Embedding(3, 16). An embedding looks up integer
# token ids below 3, but the inputs here are continuous floats, and it makes
# the network emit one prediction per feature column instead of one per sample.
# The features are therefore fed straight into the first Dense layer.
model = tf.keras.models.Sequential()
model.add(keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(3,)))
model.add(keras.layers.Dense(24, activation=tf.nn.relu))
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
# Single sigmoid unit: probability that the score is positive
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

In [None]:
# Configure training: Adam optimiser, binary cross-entropy loss for the
# two-class target, and accuracy reported under the 'acc' key
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [None]:
# Train on the first shard only, using that shard's held-out split for validation
train_data, test_data = X_train[0], X_test[0]

history = model.fit(
    x=train_data,
    y=y_train[0],
    validation_data=(test_data, y_test[0]),
    batch_size=512,
    epochs=40,
    verbose=1,
)