In [4]:
import pandas as pd
import os, string
import re
from keras.preprocessing.text import Tokenizer
from gensim.models.fasttext import FastText
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer
import spacy
from spacy.tokenizer import Tokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction import text

import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

#### Read data

In [5]:
df = pd.read_json('reddit_jokes.json')

In [6]:
df = df[df.score > 0].reset_index()

#### Pre-process

Create 'joke' column and remove unnecessary columns:

In [7]:
df

Unnamed: 0,index,body,id,score,title
0,0,"Now I have to say ""Leroy can you please paint ...",5tz52q,1,I hate how you cant even say black paint anymore
1,3,A Sunday school teacher is concerned that his ...,5tz2wj,1,"Brian raises his hand and says, “He’s in Heaven.”"
2,7,"Apparently ""Whatever's low in cholesterol"" was...",5tz04j,1,I walked into a PETA adoption center and the r...
3,10,"I always thought they were gunna hit me, not t...",5tyzxh,15,Remember when you were a kid and when you crie...
4,13,"I said, ""I'm not sure; it's hard to keep track.""",5tyytx,3,"My boss said to me, ""you're the worst train dr..."
...,...,...,...,...,...
132987,194547,"Just this morning she said, ""Daddy, is that th...",1a8a5r,123,My daughter has reached that age where she's a...
132988,194548,Gives me something to read while i'm in the sh...,1a89ts,5,I like a girl with words tattooed on her back.
132989,194549,I mean dyslexia fcuk!!! >_<,1a87we,12,I have sexdaily...
132990,194550,A hockey player showers after three periods.,1a7xnd,44,What's the difference between a hippie chick a...


In [8]:
df['joke'] = df['title'] +'. '+ df['body']

In [9]:
df = df.loc[:,['joke','score']]

Pre-process text:
- Remove punctuation
- Replace escape characters
- Remove extra spaces
- Remove single characters
- Remove prefixed 'b'
- Lowercase all characters
- Lemmatization

In [10]:
def replace_non_eng_punct(txt):
    return re.sub(r'/[^a-zA-Z0-9\s,.?!]/','*',txt).strip()

In [11]:
# def remove_url(txt):
#     return re.sub(r'https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,}','',txt)

In [12]:
def replace_escape(txt):
    updated_txt = re.sub(r'\n|\t|&amp;',' ',txt)
    return updated_txt.strip()

In [13]:
def remove_multi_spaces(txt):
    return re.sub(' +', ' ',txt)

In [14]:
import re
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def preprocess_text(document):
#         # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

#         # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

#         # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word) > 3]

        preprocessed_text = ' '.join(tokens)

        return preprocessed_text

In [15]:
df['joke'] = df['joke'].apply(replace_non_eng_punct).apply(remove_multi_spaces)

In [16]:
df['joke_preprocessed'] = df['joke'].apply(preprocess_text)

Sample before/after preprocessing:

In [17]:
df['joke'][0]

'I hate how you cant even say black paint anymore. Now I have to say "Leroy can you please paint the fence?"'

In [18]:
df['joke_preprocessed'][0]

'hate cant even black paint anymore leroy please paint fence'

Create joke tokens list:

In [19]:
def split_punct(txt):
    return re.split(r'(\W)',txt)

In [20]:
df['joke_tokens'] = df['joke_preprocessed'].apply(split_punct)

In [21]:
def remove_spaces(lst):
    return [x for x in lst if (x != ' ') and (x != '')]

In [22]:
df['joke_tokens'] = df['joke_tokens'].apply(remove_spaces)

#### Feature engineering

Train/test split:

In [31]:
target_col = 'score'
tokens_col = 'joke_preprocessed'
X = df[tokens_col].to_numpy()
y = df[target_col].to_numpy()

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, \
                                                    random_state=42)

Feature scaling:

In [33]:
scaler = MinMaxScaler()
scaler.fit(y_train.reshape(-1,1))
y_train = scaler.transform(y_train.reshape(-1,1)).reshape(-1,)
y_test = scaler.transform(y_test.reshape(-1,1)).reshape(-1,)

Generate vocabulary:

In [35]:
cnt = Counter()
for s in df['joke_tokens']:
    for w in s:
        cnt[w] += 1

In [36]:
vocab2 = cnt.most_common(1000)
vocab2 = [i[0] for i in vocab2]
# vocab_id = defaultdict(int)
# for ind,w in enumerate(vocab):
#     vocab_id[w[0]] = ind+2

Generate TFIDF/CountVectorizer features:

In [37]:
concat_filter_vec2 = np.vectorize(lambda x: ' '.join([i for i in x.split(' ') if i in vocab2]))

In [38]:
X_train = concat_filter_vec2(X_train)
X_test = concat_filter_vec2(X_test)

In [39]:
X_train[6]

'scientist came inside fuck dude hell away'

In [40]:
vectorizer_tf = TfidfVectorizer()
X_train_tf = vectorizer_tf.fit_transform(X_train).toarray()
X_test_tf = vectorizer_tf.transform(X_test).toarray()

In [41]:
c = CountVectorizer()
X_train_count = c.fit_transform(X_train).toarray()
X_test_count = c.fit_transform(X_test).toarray()

In [42]:
X_train_tf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.33728993, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [43]:
X_train_count.shape

(106393, 1000)

In [44]:
from keras.models import Model, Sequential
from keras.layers import Activation, Dense, LSTM, Embedding, TimeDistributed, recurrent
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from keras import regularizers

#### Modeling

Train Neural network

In [45]:
# Neural network
model = Sequential()
model.add(Dense(128, input_dim=1000, activation='relu',kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01)))
model.add(Dense(128, activation='relu',kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01)))
model.add(Dense(128, activation='relu',kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01)))
model.add(Dense(1, activation='relu',kernel_regularizer=regularizers.l2(0.01),
                activity_regularizer=regularizers.l1(0.01)))






In [46]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               128128    
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_3 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 161,281
Trainable params: 161,281
Non-trainable params: 0
_________________________________________________________________


In [47]:
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['mse'])




In [48]:
history = model.fit(X_train_tf, y_train, epochs=20, batch_size=200,validation_split = 0.2)




Train on 85114 samples, validate on 21279 samples
Epoch 1/20





Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [49]:
y_pred = model.predict(X_test_tf)

In [54]:
y_pred.sum()

0.0

In [52]:
y_test

array([1.89592993e-03, 0.00000000e+00, 2.30808861e-03, ...,
       0.00000000e+00, 2.26687275e-04, 2.06079341e-05])