# <center> **NLP analysis of jokes dataset**

## Table of Contents

1. **[Functions](#1)**
2. **[Preprocessing](#3)**
	* [Cleaning](#sub-heading2)

# **Imports**

In [None]:
!pip install unidecode
!pip install emoji
!pip install word2number
!pip install contractions
!pip3 install spacy
!python3 -m spacy download en_core_web_sm
!python3 -m nltk.downloader stopwords
!pip install pyspellchecker
!pip install iplot

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 20.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
style.use('seaborn')
%matplotlib inline
#graphs in svg look clearer
%config InlineBackend.figure_format = 'svg' 
import warnings
warnings.simplefilter('ignore')

from scipy.stats import norm

from google.colab import drive
import os
import string
import emoji

import plotly as py
import numpy as np
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import Contours, Histogram2dContour, Marker, Scatter

import unidecode
from word2number import w2n
import contractions
from bs4 import BeautifulSoup
import spacy

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from spellchecker import SpellChecker
from wordcloud import WordCloud

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc='PROGRESS>>>')

from google_drive_downloader import GoogleDriveDownloader as gdd
from IPython.core.display import display, HTML

from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV, StratifiedKFold, KFold, cross_val_score, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import label_binarize, StandardScaler

from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec

import gensim.downloader as gensim_api
from torch.nn.utils.rnn import pad_sequence

from functools import partial
import torch      
import torch.nn as nn
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader, random_split, Dataset
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision
import torch.optim as optim

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
import plotly.graph_objs as go

In [None]:
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data = pd.read_csv('/content/drive/My Drive/NLP_humor/data/jokes_dataset.csv', 
                 sep=',', 
                 encoding='utf-8').set_index('id')
# work with copy
df = data.copy(deep=True)

# **Functions**

In [None]:
def draw_histogram(data):
    """Plot the distribution of ``data`` and mark its mean.

    Draws a 10-bin histogram with a KDE curve, rug marks and a fitted normal
    curve (``scipy.stats.norm``), adds a dashed red vertical line at the mean
    and shows the figure.

    Parameters
    ----------
    data : object accepted by ``sns.distplot`` that also provides ``.mean()``
        (for example a pandas Series).
    """
    _, axis = plt.subplots(1)
    distplot_options = dict(hist=True, kde=True, rug=True, bins=10, fit=norm)
    sns.distplot(data, ax=axis, **distplot_options)

    # The axes created above is the current one, so label it directly.
    axis.set_xlabel("values")
    axis.set_title("Distribution")

    average = data.mean()
    axis.axvline(average, color='r', linestyle='--', label=f"Mean={average:.3f}")
    axis.legend()

    plt.show()

**Activate Plotly**

In [None]:
def configure_plotly_browser_state():
  """Display an HTML snippet that loads require.js and registers plotly's CDN path.

  The snippet is emitted into the output of the cell that calls this function.
  NOTE(review): presumably needed so Plotly figures render in Colab output
  frames -- confirm against the plotting cells that call it.
  """
  import IPython
  # `display` comes from the notebook's top-level IPython import.
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

**Word clouds**

In [None]:
def draw_word_clouds(data):
  """Show a word cloud built from all texts in ``data['joke']``.

  The texts are joined with single spaces and rendered on a white
  600x400 cloud inside a 26x8 figure titled 'Joke'.
  """
  _, axis = plt.subplots(1, 1, figsize=[26, 8])

  all_jokes = " ".join(data['joke'])
  cloud = WordCloud(background_color='white', width=600, height=400)
  cloud = cloud.generate(all_jokes)

  axis.imshow(cloud)
  axis.axis('off')
  axis.set_title('Joke', fontsize=40)

**Top words**

In [None]:
def draw_top_words(data, top_size, title):
  """Draw a bar chart of the most frequent words in ``data``.

  Parameters
  ----------
  data : passed unchanged to ``count_words`` (which reads ``.values`` and
      splits each entry on whitespace).
  top_size : number of words requested from ``count_words``.
  title : prefix for the chart title; ``'Top'`` / ``'top'`` selects the
      plain "Top N words" heading.

  Note that the first row of the counting table (the single most frequent
  word) is dropped before plotting, so the chart shows ``top_size - 1`` bars.
  """
  word_counts = pd.DataFrame(count_words(data, top_size))
  word_counts = word_counts.drop([0])
  word_counts.columns = ['word', 'counting']

  plt.figure(figsize=(17, 10))
  axis = sns.barplot(data=word_counts, x='word', y='counting',
                     facecolor=(1, 1, 1, 0), edgecolor='black')

  heading = (f'Top {top_size} words' if title in ('Top', 'top')
             else f'{title} words: Top {top_size}')
  axis.set_title(heading.title(), fontsize=20)

  axis.set_ylabel('Word counting', fontsize=15)
  axis.set_xlabel(f'Top {top_size} words', fontsize=15)
  plt.xticks(rotation=45)
  plt.show()

**Removal of html tags**

In [None]:
def strip_html_tags(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` and the text
    fragments are joined with a single space.
    """
    return BeautifulSoup(text, "html.parser").get_text(separator=" ")

**Removal of whitespaces**

In [None]:
def remove_whitespace(text):
    """Collapse every run of whitespace in ``text`` to one space and trim both ends."""
    tokens = text.strip().split()
    return " ".join(tokens)

**Removal of accented characters**

In [None]:
def remove_accented_chars(text):
    """Transliterate ``text`` with ``unidecode`` (e.g. accented letters such as in café)."""
    return unidecode.unidecode(text)

**Expansion of contractions (shortened words)**

In [None]:
def expand_contractions(text):
    """Expand contracted forms in ``text`` via ``contractions.fix`` (e.g. don't -> do not)."""
    return contractions.fix(text)

**Removal of urls**

In [None]:
def find_urls(text):
    """Return a list of all substrings of ``text`` that look like URLs.

    A URL is anything starting with ``http://``, ``https://`` or ``www.``
    and running up to the next whitespace character.
    """
    return re.findall(r'https?://\S+|www\.\S+', text)

In [None]:
def remove_urls(text):
    """Delete every URL-like substring (``http(s)://...`` or ``www....``) from ``text``.

    Surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

**Removal of Frequent words**

In [None]:
def count_words(text, top=10):
    """Count whitespace-separated words over all entries of ``text``.

    Parameters
    ----------
    text : object exposing ``.values`` that yields strings (e.g. a pandas Series).
    top : number of most common words to return.

    Returns
    -------
    list of ``(word, count)`` tuples, most frequent first.
    """
    tally = Counter()
    for document in text.values:
        tally.update(document.split())

    return tally.most_common(top)

In [None]:
# FREQWORDS = set([w for (w, wc) in count_words(data['joke'], 10)])
# FREQWORDS

# n_rare_words = 10
# RAREWORDS = set([w for (w, wc) in count_words(data['joke'][:-n_rare_words-1:-1])])
# RAREWORDS

In [None]:
def remove_freqwords(text, freqwords=None):
    """Remove the frequent words from ``text``.

    Parameters
    ----------
    text : value to clean; it is converted with ``str()`` and split on whitespace.
    freqwords : optional container of words to drop. When omitted, the
        module-level ``FREQWORDS`` is used.

    Returns
    -------
    str with the remaining words joined by single spaces.

    Raises
    ------
    NameError if ``freqwords`` is omitted and ``FREQWORDS`` has not been
    defined (its defining cell in this notebook is commented out).
    """
    if freqwords is None:
        freqwords = FREQWORDS
    return " ".join(word for word in str(text).split() if word not in freqwords)

**Removal of Rare words**

In [None]:
def remove_rarewords(text, rarewords=None):
    """Remove the rare words from ``text``.

    Parameters
    ----------
    text : value to clean; it is converted with ``str()`` and split on whitespace.
    rarewords : optional container of words to drop. When omitted, the
        module-level ``RAREWORDS`` is used.

    Returns
    -------
    str with the remaining words joined by single spaces.

    Raises
    ------
    NameError if ``rarewords`` is omitted and ``RAREWORDS`` has not been
    defined (its defining cell in this notebook is commented out).
    """
    if rarewords is None:
        rarewords = RAREWORDS
    return " ".join(word for word in str(text).split() if word not in rarewords)

**Removal of stopwords**


In [None]:
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated word of ``text`` that is in STOPWORDS.

    The comparison is exact (case-sensitive); ``text`` is converted with ``str()``.
    """
    kept_words = filter(lambda word: word not in STOPWORDS, str(text).split())
    return " ".join(kept_words)

**Convert chat words**

In [None]:
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [None]:
# Lazily filled cache of the mapping parsed from `chat_words_str`.
_CHAT_WORDS_CACHE = {}


def _parse_chat_words(raw):
    """Parse ``ABBR=Expansion`` lines into an ``{ABBR: expansion}`` dict."""
    mapping = {}
    for line in raw.splitlines():
        abbreviation, separator, expansion = line.partition("=")
        if separator:  # skip blank lines and lines without '='
            mapping[abbreviation.strip()] = expansion.strip()
    return mapping


def chat_words_conversion(text, chat_words_map=None):
    """Replace chat abbreviations in ``text`` with their expansions.

    Parameters
    ----------
    text : str, split on whitespace; each word is looked up upper-cased.
    chat_words_map : optional ``{ABBREVIATION: expansion}`` dict with
        upper-case keys. When omitted, the mapping is parsed once from the
        module-level ``chat_words_str``.

    Returns
    -------
    str with the (possibly replaced) words joined by single spaces.
    """
    if chat_words_map is None:
        if not _CHAT_WORDS_CACHE:
            _CHAT_WORDS_CACHE.update(_parse_chat_words(chat_words_str))
        chat_words_map = _CHAT_WORDS_CACHE
    converted = [chat_words_map.get(word.upper(), word) for word in text.split()]
    return " ".join(converted)

**Conversion of emoticons**

In [None]:
EMOTICONS = {
    u":‑\)":"Happy face or smiley",
    u":\)":"Happy face or smiley",
    u":-\]":"Happy face or smiley",
    u":\]":"Happy face or smiley",
    u":-3":"Happy face smiley",
    u":3":"Happy face smiley",
    u":->":"Happy face smiley",
    u":>":"Happy face smiley",
    u"8-\)":"Happy face smiley",
    u":o\)":"Happy face smiley",
    u":-\}":"Happy face smiley",
    u":\}":"Happy face smiley",
    u":-\)":"Happy face smiley",
    u":c\)":"Happy face smiley",
    u":\^\)":"Happy face smiley",
    u"=\]":"Happy face smiley",
    u"=\)":"Happy face smiley",
    u":‑D":"Laughing, big grin or laugh with glasses",
    u":D":"Laughing, big grin or laugh with glasses",
    u"8‑D":"Laughing, big grin or laugh with glasses",
    u"8D":"Laughing, big grin or laugh with glasses",
    u"X‑D":"Laughing, big grin or laugh with glasses",
    u"XD":"Laughing, big grin or laugh with glasses",
    u"=D":"Laughing, big grin or laugh with glasses",
    u"=3":"Laughing, big grin or laugh with glasses",
    u"B\^D":"Laughing, big grin or laugh with glasses",
    u":-\)\)":"Very happy",
    u":‑\(":"Frown, sad, andry or pouting",
    u":-\(":"Frown, sad, andry or pouting",
    u":\(":"Frown, sad, andry or pouting",
    u":‑c":"Frown, sad, andry or pouting",
    u":c":"Frown, sad, andry or pouting",
    u":‑<":"Frown, sad, andry or pouting",
    u":<":"Frown, sad, andry or pouting",
    u":‑\[":"Frown, sad, andry or pouting",
    u":\[":"Frown, sad, andry or pouting",
    u":-\|\|":"Frown, sad, andry or pouting",
    u">:\[":"Frown, sad, andry or pouting",
    u":\{":"Frown, sad, andry or pouting",
    u":@":"Frown, sad, andry or pouting",
    u">:\(":"Frown, sad, andry or pouting",
    u":'‑\(":"Crying",
    u":'\(":"Crying",
    u":'‑\)":"Tears of happiness",
    u":'\)":"Tears of happiness",
    u"D‑':":"Horror",
    u"D:<":"Disgust",
    u"D:":"Sadness",
    u"D8":"Great dismay",
    u"D;":"Great dismay",
    u"D=":"Great dismay",
    u"DX":"Great dismay",
    u":‑O":"Surprise",
    u":O":"Surprise",
    u":‑o":"Surprise",
    u":o":"Surprise",
    u":-0":"Shock",
    u"8‑0":"Yawn",
    u">:O":"Yawn",
    u":-\*":"Kiss",
    u":\*":"Kiss",
    u":X":"Kiss",
    u";‑\)":"Wink or smirk",
    u";\)":"Wink or smirk",
    u"\*-\)":"Wink or smirk",
    u"\*\)":"Wink or smirk",
    u";‑\]":"Wink or smirk",
    u";\]":"Wink or smirk",
    u";\^\)":"Wink or smirk",
    u":‑,":"Wink or smirk",
    u";D":"Wink or smirk",
    u":‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"X‑P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"XP":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":Þ":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"d:":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"=p":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u">:P":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u":‑/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":-[.]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u">:/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=/":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=[(\\\)]":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u"=L":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":S":"Skeptical, annoyed, undecided, uneasy or hesitant",
    u":‑\|":"Straight face",
    u":\|":"Straight face",
    u":$":"Embarrassed or blushing",
    u":‑x":"Sealed lips or wearing braces or tongue-tied",
    u":x":"Sealed lips or wearing braces or tongue-tied",
    u":‑#":"Sealed lips or wearing braces or tongue-tied",
    u":#":"Sealed lips or wearing braces or tongue-tied",
    u":‑&":"Sealed lips or wearing braces or tongue-tied",
    u":&":"Sealed lips or wearing braces or tongue-tied",
    u"O:‑\)":"Angel, saint or innocent",
    u"O:\)":"Angel, saint or innocent",
    u"0:‑3":"Angel, saint or innocent",
    u"0:3":"Angel, saint or innocent",
    u"0:‑\)":"Angel, saint or innocent",
    u"0:\)":"Angel, saint or innocent",
    u":‑b":"Tongue sticking out, cheeky, playful or blowing a raspberry",
    u"0;\^\)":"Angel, saint or innocent",
    u">:‑\)":"Evil or devilish",
    u">:\)":"Evil or devilish",
    u"\}:‑\)":"Evil or devilish",
    u"\}:\)":"Evil or devilish",
    u"3:‑\)":"Evil or devilish",
    u"3:\)":"Evil or devilish",
    u">;\)":"Evil or devilish",
    u"\|;‑\)":"Cool",
    u"\|‑O":"Bored",
    u":‑J":"Tongue-in-cheek",
    u"#‑\)":"Party all night",
    u"%‑\)":"Drunk or confused",
    u"%\)":"Drunk or confused",
    u":-###..":"Being sick",
    u":###..":"Being sick",
    u"<:‑\|":"Dump",
    u"\(>_<\)":"Troubled",
    u"\(>_<\)>":"Troubled",
    u"\(';'\)":"Baby",
    u"\(\^\^>``":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(\^_\^;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(~_~;\) \(・\.・;\)":"Nervous or Embarrassed or Troubled or Shy or Sweat drop",
    u"\(-_-\)zzz":"Sleeping",
    u"\(\^_-\)":"Wink",
    u"\(\(\+_\+\)\)":"Confused",
    u"\(\+o\+\)":"Confused",
    u"\(o\|o\)":"Ultraman",
    u"\^_\^":"Joyful",
    u"\(\^_\^\)/":"Joyful",
    u"\(\^O\^\)／":"Joyful",
    u"\(\^o\^\)／":"Joyful",
    u"\(__\)":"Kowtow as a sign of respect, or dogeza for apology",
    u"_\(\._\.\)_":"Kowtow as a sign of respect, or dogeza for apology",
    u"<\(_ _\)>":"Kowtow as a sign of respect, or dogeza for apology",
    u"<m\(__\)m>":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(__\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"m\(_ _\)m":"Kowtow as a sign of respect, or dogeza for apology",
    u"\('_'\)":"Sad or Crying",
    u"\(/_;\)":"Sad or Crying",
    u"\(T_T\) \(;_;\)":"Sad or Crying",
    u"\(;_;":"Sad of Crying",
    u"\(;_:\)":"Sad or Crying",
    u"\(;O;\)":"Sad or Crying",
    u"\(:_;\)":"Sad or Crying",
    u"\(ToT\)":"Sad or Crying",
    u";_;":"Sad or Crying",
    u";-;":"Sad or Crying",
    u";n;":"Sad or Crying",
    u";;":"Sad or Crying",
    u"Q\.Q":"Sad or Crying",
    u"T\.T":"Sad or Crying",
    u"QQ":"Sad or Crying",
    u"Q_Q":"Sad or Crying",
    u"\(-\.-\)":"Shame",
    u"\(-_-\)":"Shame",
    u"\(一一\)":"Shame",
    u"\(；一_一\)":"Shame",
    u"\(=_=\)":"Tired",
    u"\(=\^\·\^=\)":"cat",
    u"\(=\^\·\·\^=\)":"cat",
    u"=_\^=	":"cat",
    u"\(\.\.\)":"Looking down",
    u"\(\._\.\)":"Looking down",
    u"\^m\^":"Giggling with hand covering mouth",
    u"\(\・\・?":"Confusion",
    u"\(?_?\)":"Confusion",
    u">\^_\^<":"Normal Laugh",
    u"<\^!\^>":"Normal Laugh",
    u"\^/\^":"Normal Laugh",
    u"\（\*\^_\^\*）" :"Normal Laugh",
    u"\(\^<\^\) \(\^\.\^\)":"Normal Laugh",
    u"\(^\^\)":"Normal Laugh",
    u"\(\^\.\^\)":"Normal Laugh",
    u"\(\^_\^\.\)":"Normal Laugh",
    u"\(\^_\^\)":"Normal Laugh",
    u"\(\^\^\)":"Normal Laugh",
    u"\(\^J\^\)":"Normal Laugh",
    u"\(\*\^\.\^\*\)":"Normal Laugh",
    u"\(\^—\^\）":"Normal Laugh",
    u"\(#\^\.\^#\)":"Normal Laugh",
    u"\（\^—\^\）":"Waving",
    u"\(;_;\)/~~~":"Waving",
    u"\(\^\.\^\)/~~~":"Waving",
    u"\(-_-\)/~~~ \($\·\·\)/~~~":"Waving",
    u"\(T_T\)/~~~":"Waving",
    u"\(ToT\)/~~~":"Waving",
    u"\(\*\^0\^\*\)":"Excited",
    u"\(\*_\*\)":"Amazed",
    u"\(\*_\*;":"Amazed",
    u"\(\+_\+\) \(@_@\)":"Amazed",
    u"\(\*\^\^\)v":"Laughing,Cheerful",
    u"\(\^_\^\)v":"Laughing,Cheerful",
    u"\(\(d[-_-]b\)\)":"Headphones,Listening to music",
    u'\(-"-\)':"Worried",
    u"\(ーー;\)":"Worried",
    u"\(\^0_0\^\)":"Eyeglasses",
    u"\(\＾ｖ\＾\)":"Happy",
    u"\(\＾ｕ\＾\)":"Happy",
    u"\(\^\)o\(\^\)":"Happy",
    u"\(\^O\^\)":"Happy",
    u"\(\^o\^\)":"Happy",
    u"\)\^o\^\(":"Happy",
    u":O o_O":"Surprised",
    u"o_0":"Surprised",
    u"o\.O":"Surpised",
    u"\(o\.o\)":"Surprised",
    u"oO":"Surprised",
    u"\(\*￣m￣\)":"Dissatisfied",
    u"\(‘A`\)":"Snubbed or Deflated"
}

In [None]:
def convert_emoticons(text):
    """Replace emoticons in ``text`` with an underscore-joined description.

    Every key of EMOTICONS is applied in dictionary order as a regular
    expression; its description has commas removed and words joined by ``_``.
    Keys are used as regex source verbatim, so a key containing unescaped
    metacharacters (e.g. ``:$``) matches more than the literal emoticon.
    """
    for pattern, meaning in EMOTICONS.items():
        label = "_".join(meaning.replace(",", "").split())
        text = re.sub(u'(' + pattern + ')', label, text)
    return text

**Removal of Punctuations**

In [None]:
PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    """Delete every character listed in PUNCT_TO_REMOVE from ``text``."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(PUNCT_TO_REMOVE))
    return text.translate(deletion_table)

**Removal of numbers**

In [None]:
def remove_numbers(input):
  """Return ``input`` with every run of digits removed (other characters untouched)."""
  digit_runs = re.compile(r'\d+')
  return digit_runs.sub("", input)

**Find emoji**

In [None]:
def extract_emojis(s):
  """Return the characters of ``s`` that are known emoji, concatenated in order.

  Works with both generations of the ``emoji`` package: the legacy
  ``emoji.UNICODE_EMOJI['en']`` table is used when it exists (unchanged
  behaviour), otherwise ``emoji.EMOJI_DATA`` (emoji >= 2.0, where
  ``UNICODE_EMOJI`` was removed). The check is per character, so only
  single-code-point emoji are picked up.
  """
  legacy_table = getattr(emoji, "UNICODE_EMOJI", None)
  known_emoji = legacy_table['en'] if legacy_table is not None else emoji.EMOJI_DATA
  return ''.join(c for c in s if c in known_emoji)

**Stemming**

In [None]:
stemmer = PorterStemmer()

def stem_words(text):
    """Stem every word of ``text`` with the Porter stemmer.

    Parameters
    ----------
    text : str or iterable of str. A string is split on whitespace first
        (iterating it directly would stem single characters); any other
        iterable is treated as already tokenised, as before.

    Returns
    -------
    str with the stemmed words joined by single spaces.
    """
    words = text.split() if isinstance(text, str) else text
    return " ".join(stemmer.stem(word) for word in words)

**Word importance**

In [None]:
# def get_word_importance(model,tfidf, top=15):

#     important_tokens = pd.DataFrame(
#         data=model.coef_[0],
#         index=tfidf.get_feature_names(),
#         columns=['Coefs']
#     )

#     important_tokens_pos = important_tokens.sort_values(by='Coefs', ascending=False)[:top]
#     important_tokens_neg = important_tokens.sort_values(by='Coefs', ascending=False)[-top:]

#     word_imp = important_tokens_pos.copy(deep=True)
#     word_imp = word_imp.append(important_tokens_neg)

#     plt.figure(figsize=(15,10))
#     sns.barplot(x=word_imp.index, y='Coefs', data=word_imp)
#     plt.title('Word importance')
#     plt.xlabel('POS             vs.               NEG')
#     plt.ylabel('word weight')
#     plt.xticks(rotation=55)
#     plt.show()

# **Preprocessing**

In [None]:
df.shape

(194616, 3)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 194616 entries, 5tz52q to 1a801u
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   body    190507 non-null  object 
 1   score   194553 non-null  float64
 2   title   194553 non-null  object 
dtypes: float64(1), object(2)
memory usage: 5.9+ MB


## **Check missing data**

In [None]:
df.isnull().sum()

body     4109
score      63
title      63
dtype: int64

In [None]:
df[df['body'].isna() == True]

Unnamed: 0_level_0,body,score,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5tr0co,,4.0,Obese people are a very large portion of the p...
4zcsif,,2.0,"A priest, a bishop, and a pontiff are all in a..."
4toft6,,0.0,Plagiarism is cheating. If mrs trump cheated o...
4rlhug,,0.0,Marriage is like childhood...
4rlf8f,,0.0,The worst excuse I have ever came up with.
...,...,...,...
1avalc,,0.0,a dyslexic man walks into a bra
1autf7,,22.0,I tried searching on Google for 'Lost Medieval...
1ajoog,,0.0,591
1ady2k,,0.0,I one the sandwich.


There are only a few NaNs (63 rows) in the `score` and `title` columns, so we can simply drop those rows.

In [None]:
df.dropna(subset=['title', 'score'], inplace=True)

So, now we have full jokes in body, not in title.

In [None]:
df.isna().sum()

body     4109
score       0
title       0
dtype: int64

Now we need to handle the jokes that have no body: there I replace the NaNs with an empty string.

In [None]:
df['body'] = df['body'].fillna('')

In [None]:
df.isna().sum()

body     0
score    0
title    0
dtype: int64

In [None]:
df['score'] = df['score'].astype(np.int32)

### **Make new df with full joke text**

In [None]:
df["joke"] = df["title"] + " " + df["body"]

In [None]:
df.drop(['title', 'body'], axis=1, inplace=True)

In [None]:
df.head()

Unnamed: 0_level_0,score,joke
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5tz52q,1,I hate how you cant even say black paint anymo...
5tz4dd,0,What's the difference between a Jew in Nazi Ge...
5tz319,0,I recently went to America.... ...and being th...
5tz2wj,1,"Brian raises his hand and says, “He’s in Heave..."
5tz1pc,0,You hear about the University book store worke...


### **Dropping duplicates**

In [None]:
df.shape

(194553, 2)

In [None]:
df.drop_duplicates(inplace=True)
df.shape

(193906, 2)

## **Cleaning**

In [None]:
text_col = 'joke'

**Lower Casing**  
Lower casing is a common text preprocessing technique. The idea is to convert the input text into same casing format so that 'text', 'Text' and 'TEXT' are treated the same way.  

This is more helpful for text featurization techniques like frequency, tfidf as it helps to combine the same words together thereby reducing the duplication and get correct counts / tfidf values.  

**Some other parts like:** *html removal, extra whitespaces removal are also important.*

These may not be helpful when we do tasks like Part of Speech tagging (where proper casing gives some information about Nouns and so on) and Sentiment Analysis (where upper casing refers to anger and so on)  

In [None]:
def text_preprocessing(df, col):
    """Run the basic cleaning steps on ``df[col]`` in place, printing a preview after each.

    Steps, in order: strip HTML tags, collapse extra whitespace, remove
    accented characters, expand contractions.

    Parameters
    ----------
    df : DataFrame whose column ``col`` holds the texts; the column is overwritten.
    col : name of the text column.

    Returns
    -------
    None -- ``df`` is modified in place.
    """
    steps = (
        ('>>> REMOVE HTML TAGS <<<', strip_html_tags),
        ('>>> REMOVE EXTRA WHITESPACES <<<', remove_whitespace),
        ('>>> REMOVE ACCENTED CHARS <<<', remove_accented_chars),
        ('>>> EXPAND CONTRACTIONS <<<', expand_contractions),
    )
    for banner, cleaner in steps:
        df[col] = df[col].apply(cleaner)
        print(banner)
        # Always preview the frame being cleaned, never the global raw `data`.
        print(df.head(1), '\n')


In [None]:
text_preprocessing(df, col='joke')

>>> REMOVE HTML TAGS <<<
        score                                               joke
id                                                              
5tz52q      1  I hate how you cant even say black paint anymo... 

>>> REMOVE EXTRA WHITESPACES <<<
                                                     body  score  \
id                                                                 
5tz52q  Now I have to say "Leroy can you please paint ...    1.0   

                                                   title  
id                                                        
5tz52q  I hate how you cant even say black paint anymore   

>>> REMOVE ACCENTED CHARS <<<
        score                                               joke
id                                                              
5tz52q      1  I hate how you cant even say black paint anymo... 

>>> EXPAND CONTRACTIONS <<<
                                                     body  score  \
id                                    

**Removal of URLs**  
Next preprocessing step is to remove any URLs present in the data. Probably we might need to remove them for our further analysis.


1st of all, I check whether there are any URLs in dataset:

In [None]:
# Print the list of URLs for every joke that contains at least one.
for i in df['joke']:
    url = find_urls(i)
    if url != []:
        print(url)
      

['www.....That']
['http://www.youtube.com/watch?v=xaFZrxlPwWs),']
['http://m.imgur.com/x29gyvN', 'http://m.imgur.com/Q1iyUoT', 'http://m.imgur.com/hKRnGrT)', 'http://m.imgur.com/fqufGbK)']
['http://www.afterfeed.com/)?"']
['https://www.reddit.com/r/Punny/comments/4zob6u/tonight_the_world_egg_throwing_federation_host_a/?st=ISBQLINS&sh=8da1321d']
['www..."']
['http://www.merriam-webster.com/dictionary/hypocorism)."']
['https://techcrunch.com/2016/08/11/reddit-is-currently-experiencing-a-major-outage/']
['https://m.vk.com/wall-55955185_3834']
['https://en.m.wikipedia.org/wiki/Umami']
['http://www.afterfeed.com/story/detail/13350/14-epic-jokes-by-chandler-bing-from-friends-that-will-make-hole-in-your-belly)']
['http://www.dictionary.com/browse/evacuate)']
['http://instantrimshot.com/)']
['http://www.dictionary.com/browse/elope?s=t)']
['http://you-tricks.blogspot.com/2016/07/asian-doctor.html)']
['www..']
['https://pbs.twimg.com/media/CjVBbALUoAAb6r_.jpg)']
['www.curing-conjunctivitis.com']

In [None]:
df['joke'] = df['joke'].apply(lambda text: remove_urls(text))
df.head(1)

Unnamed: 0_level_0,score,joke
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5tz52q,1,I hate how you cannot even say black paint any...


**Removal of numbers**  
* Here I delete only the numbers written as digits; numbers written out as words are left for later.

In [None]:
df[text_col] = df[text_col].apply(lambda text: remove_numbers(text))
df.head(1)

Unnamed: 0_level_0,score,joke
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5tz52q,1,I hate how you cannot even say black paint any...


In [None]:
print('The number of duplicated data is:',sum(df.duplicated()))

The number of duplicated data is: 213


## **Same jokes have different scores:**   
Now I take the biggest score from a joke and set this value to all duplicates, then I delete duplicates.

In [None]:
print('The number of duplicated data is:',sum(df['joke'].duplicated()))

The number of duplicated data is: 3764


In [None]:
dupls = df[df['joke'].duplicated() == True]['joke'].unique()
dupls

array(['What did the leper say to the prostitute? Keep the tip.',
       '!false It is funny because it is true.',
       "What is Mexico's National Animal? The drug mule.", ...,
       'What is the difference between ignorance and apathy? Do not know, do not care.',
       'How do you know if your roommate is gay? His dick tastes like shit.',
       'Why do they call it PMS? Because mad cow disease was already taken'],
      dtype=object)

In [None]:
# Texts that occur more than once (kept: a later cell inspects dupls[0]).
dupls = df.loc[df['joke'].duplicated(), 'joke'].unique()

# Give every copy of a joke the highest score any of its copies received.
# A single grouped transform replaces the per-joke loop: it avoids the chained
# assignment df['score'][idx] = ... (unreliable, and its warning is silenced
# at the top of the notebook) and one full scan of the frame per duplicate.
# Jokes that occur once keep their own score, since max of one value is itself.
df['score'] = df.groupby('joke')['score'].transform('max')
  

In [None]:
df[df['joke'] == dupls[0]]

Unnamed: 0_level_0,score,joke
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5t6548,252,What did the leper say to the prostitute? Keep...
5swmk6,252,What did the leper say to the prostitute? Keep...
525zmr,252,What did the leper say to the prostitute? Keep...
4zzal9,252,What did the leper say to the prostitute? Keep...
4yf0xg,252,What did the leper say to the prostitute? Keep...
4tvbms,252,What did the leper say to the prostitute? Keep...
4t2ajn,252,What did the leper say to the prostitute? Keep...
48hd0j,252,What did the leper say to the prostitute? Keep...
46fpv4,252,What did the leper say to the prostitute? Keep...
43luzn,252,What did the leper say to the prostitute? Keep...


Removal of duplicates

In [None]:
df.shape

(193906, 2)

In [None]:
df.drop_duplicates(inplace=True)
df.shape

(190142, 2)

In [None]:
# df.to_csv('/content/drive/My Drive/NLP_humor/data/jokes_dataset_PREPROC_withOrigScores.csv',
#             sep=',',
#             header=True, 
#             index=True,
#             encoding='utf-8')

### **Split 'score' at 4 quantile cut points (make 5 classes):**

In [None]:
df['score'].quantile([0.2, 0.4, 0.6, 0.8])

0.2     0.0
0.4     1.0
0.6     5.0
0.8    26.0
Name: score, dtype: float64

In [None]:
df.groupby(pd.cut(df['score'], np.percentile(df['score'], [20, 40, 60, 80]), include_lowest=True)).mean()

Unnamed: 0_level_0,score
score,Unnamed: 1_level_1
"(-0.001, 1.0]",0.239339
"(1.0, 5.0]",3.151722
"(5.0, 26.0]",12.500093


In [None]:
def add_rank(data):
  """Map a score to its rank class (0-4).

  0 -> 0, (0, 1] -> 1, (1, 5] -> 2, (5, 26] -> 3, above 26 -> 4.
  Values matching none of the ranges (e.g. negatives) yield None.
  """
  if data == 0:
    return 0
  if 0 < data <= 1:
    return 1
  if 1 < data <= 5:
    return 2
  if 5 < data <= 26:
    return 3
  if data > 26:
    return 4
  return None

In [None]:
df['rank'] = df['score'].apply(add_rank)

In [None]:
df['rank'].value_counts()

0    60182
4    37757
3    37445
2    35822
1    18936
Name: rank, dtype: int64

**Drop jokes consisting of a single word:**

In [None]:
df['text_word_count'] = df['joke'].apply(lambda x: len(str(x).split()))
df.head(1)

Unnamed: 0_level_0,score,joke,rank,text_word_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5tz52q,1,I hate how you cannot even say black paint any...,1,22


In [None]:
df[df['text_word_count'] == 1]

Unnamed: 0_level_0,score,joke,rank,text_word_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4nfz1o,0,Gandalf,0,1
4kq9zf,0,Bernie,0,1
441t69,0,Lol,0,1
3xunbs,0,-chan,0,1
3xo5ma,0,You.,0,1
...,...,...,...,...
2a6jyn,0,PSm,0,1
1p5dzb,0,Feminisim,0,1
1nju7e,941,Congress,4,1
1j5l9y,0,.,0,1


In [None]:
# Keep only jokes with at least two words. `!= 1` would also keep texts with
# zero words (e.g. a digits-only title whose digits were removed earlier).
# The helper columns are dropped without `inplace`, so no filtered slice is
# mutated.
df = df.loc[df['text_word_count'] > 1].drop(columns=['score', 'text_word_count'])

In [None]:
df.head()

Unnamed: 0_level_0,joke,rank
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5tz52q,I hate how you cannot even say black paint any...,1
5tz4dd,What is the difference between a Jew in Nazi G...,0
5tz319,I recently went to America.... ...and being th...,0
5tz2wj,"Brian raises his hand and says, ""He is in Heav...",1
5tz1pc,You hear about the University book store worke...,0


In [None]:
df.shape

(190056, 2)

# **Split**

Train/Test/Val

In [None]:
# Hold out 10% of the rows for testing, then 10% of the remainder for validation.
data_train, data_test = train_test_split(df, test_size=0.1, random_state=42)
data_train, data_val = train_test_split(data_train, test_size=0.1, random_state=42)

# Renumber train/validation rows from 0; the test split keeps its original ids.
data_train = data_train.reset_index(drop=True)
data_val = data_val.reset_index(drop=True)

print("Train size = ", len(data_train))
print("Validation size = ", len(data_val))
print("Test size = ", len(data_test))

Train size =  153945
Validation size =  17105
Test size =  19006


# **Augmentation**

Just for the train and val datasets. The test dataset is kept clean.

**Concatenate dfs**

In [None]:
train_val_df = pd.concat([data_train.assign(indic="train"), data_val.assign(indic="val")])

### **nlpaug**

In [None]:
!pip install nlpaug

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.sentence as nas
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.word.context_word_embs as nawcwe
import nlpaug.augmenter.word.word_embs as nawwe
import nlpaug.augmenter.word.spelling as naws

In [None]:
# txt = 'I jump at school.'

In [None]:
# aug_syn = naw.SynonymAug(aug_src='wordnet', aug_min=1)
# print("Original:")
# print(txt)
# print("Augmented Synonym Text:")

# augmented_text = aug_syn.augment(txt)
# print(augmented_text)

Original:
I jump at school.
Augmented Synonym Text:
Iodine jump at school day.


## **SynonymAug**

In [None]:
# !pip install py-thesaurus

In [None]:
# import random
# from py_thesaurus import Thesaurus
# import nltk 
# from nltk.corpus import wordnet 
# import en_core_web_sm
# import re

In [None]:
# def synalter_Verb(word,a1,POS):
#     max_temp = -1
#     flag = 0
#     for i in a1:
#         try:
#             w1 = wordnet.synset(word+'.'+POS+'.01') 
#             w2 = wordnet.synset(i+'.'+POS+'.01') # n denotes noun 
#             if(max_temp<w1.wup_similarity(w2)):
#                 max_temp=w1.wup_similarity(w2)
#                 temp_name = i
#                 flag =1
#         except:
#             f = 0
            
#     if flag == 0:
#         max1 = -1.
#         nlp = en_core_web_sm.load()
#         for i in a1:
#             j=i.replace(' ', '')
#             tokens = nlp(u''+j)
#             token_main = nlp(u''+ word)
#             for token1 in token_main:
#                 if max1<float(token1.similarity(tokens)):
#                     max1 = token1.similarity(tokens)
#                     value = i
#         max1 = -1.
#         return value 
#     else:
#         return temp_name

In [None]:
# def add_syn_verbs(text_col):
#     synonyms = [] 

#     percent = 50

#     output_text = text_col
#     words = text_col.split()
#     counts = {}
#     for word in words:
#         if word not in counts:
#             counts[word] = 0
#         counts[word] += 1
#     one_word = []
#     for key, value in counts.items():
#         if value == 1 and key.isalpha() and len(key)>2:
#             one_word.append(key)
#     verb = []
#     nlp = spacy.load('en_core_web_sm')
#     doc = nlp(u''+' '.join(one_word))
#     for token in doc:
#         if  token.pos_ == 'VERB':
#             verb.append(token.text)
        
#     all_main = verb
#     len_all = len(verb)
#     # print(len_all)######
#     final_value = int(len_all * percent /100)
#     # print(final_value)##########
#     random.seed(4)
#     temp = random.sample(range(0, len_all), final_value)
#     # print(temp)###########
#     for i in temp:
#         print(i)############
#         try:
#             word_str = all_main[i]
#             print(word_str)############
#             w = Thesaurus(word_str)
#             print(w)##########
#             a1 = list(w.get_synonym())
#             print(a1)##########
#             if i<len(verb):
#                 change_word=synalter_Verb(word_str,a1,'v')
#                 try:
#                     search_word = re.search(r'\b('+word_str+r')\b', output_text)
#                     Loc = search_word.start()
#                     output_text = output_text[:int(Loc)] + change_word + output_text[int(Loc) + len(word_str):]
#                 except:
#                     f=0

#             else:
#                 change_word=synalter_Verb(word_str,a1,'n')
#                 try:
#                     search_word = re.search(r'\b('+word_str+r')\b', output_text)
#                     Loc = search_word.start()
#                     output_text = output_text[:int(Loc)] + change_word + output_text[int(Loc) + len(word_str):]
#                 except:
#                     f=0

#         except:
#             f=0

#     return output_text


In [None]:
# txt = 'I jump at school.'

In [None]:
# new_txt = add_syn_verbs(txt)
# new_txt

0
run
<py_thesaurus.base_class.Thesaurus object at 0x7f812b463e10>
[]


'I run  and jump at school.'

In [None]:
# from nltk.corpus import wordnet

# synonyms = []

# for syn in wordnet.synsets("jump", pos=wordnet.VERB):
#     for i in syn.lemmas():
#         synonyms.append(i.name())

# print(set(synonyms))

{'skip', 'climb_up', 'bound', 'leap', 'pass_over', 'jump_out', 'alternate', 'spring', 'jump-start', 'parachute', 'derail', 'jump', 'chute', 'startle', 'skip_over', 'start', 'stand_out', 'stick_out', 'jump_off', 'rise', 'jumpstart', 'leap_out'}


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 13.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 40.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    F

In [None]:
import transformers

In [None]:
# BERT augmenter demo: insert contextually predicted words into a sample text.
TOPK = 20        # passed to the augmenter as `top_k`
ACT = 'insert'   # passed to the augmenter as `action`

# Defined here because the earlier cells that set `txt` are commented out;
# without it this cell raises NameError on a fresh kernel.
txt = 'I jump at school.'

aug_bert = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased',
    #device='cuda',
    action=ACT, top_k=TOPK)
print("Original:")
print(txt)
print("Augmented Text:")

augmented_text = aug_bert.augment(txt)
print(augmented_text)

Original:
I jump at school.
Augmented Text:
i jump successfully at dance school.


In [None]:
def generate_aug_samples(aug, row) -> dict:
    """
    Build one augmented copy of a dataset row.

    Only the joke text is changed; the label and the split indicator are
    copied over unchanged.

    Parameters
    ----------
    aug : augmenter object exposing ``augment(text)``
        (e.g. an nlpaug ``ContextualWordEmbsAug`` instance).
    row : positional sequence ``(joke, rank, indic)``,
        e.g. one row of ``DataFrame.to_numpy()``.

    Returns
    -------
    dict : ``{'joke': augmented_text, 'rank': row[1], 'indic': row[2]}``.
        The keys are the column names, so a list of these dicts can be passed
        to ``pd.DataFrame(..., columns=['joke', 'rank', 'indic'])``.
    """
    original_text = row[0]

    # Use the augmenter that was passed in rather than a notebook-level global.
    augmented = aug.augment(original_text)

    # Depending on the nlpaug version, `augment` returns either a string or a
    # list of strings; normalise to a single string and fall back to the
    # original text if nothing was produced.
    if isinstance(augmented, (list, tuple)):
        augmented = augmented[0] if augmented else original_text

    return {
        'joke': augmented,
        'rank': row[1],
        'indic': row[2],
    }


def set_augmentation(X: pd.DataFrame, aug=None) -> pd.DataFrame:
    """
    Append one augmented copy of every row to the dataset.

    Parameters
    ----------
    X (DataFrame): Original dataset; each row is read positionally as
        (joke, rank, indic) by ``generate_aug_samples``.
    aug : optional augmenter exposing ``augment(text)``. When omitted, a
        DistilBERT ``ContextualWordEmbsAug`` with action 'insert' and
        top_k=20 is created.

    Returns
    -------
    DataFrame: the rows of ``X`` followed by their augmented copies, with a
        fresh 0..n-1 index (twice as many rows as ``X``).
    """
    if aug is None:
        #BERT Augmentator
        TOPK = 20
        ACT = 'insert'

        aug = naw.ContextualWordEmbsAug(
            model_path='distilbert-base-uncased',
            #device='cuda',
            action=ACT, top_k=TOPK)

    # Iterate over a NumPy view of the rows but keep `X` itself a DataFrame:
    # rebinding `X` to the array made the final append fail with
    # AttributeError, because ndarrays have no `.append`.
    new_samples = []
    for row in X.to_numpy():
        sample = generate_aug_samples(aug, row)
        # Accept either a mapping or a plain sequence of values so the frame
        # below is built positionally, independent of the sample's key names.
        values = list(sample.values()) if isinstance(sample, dict) else list(sample)
        new_samples.append(values)

    aug_df = pd.DataFrame(new_samples, columns=list(X.columns))

    # `DataFrame.append` is deprecated (removed in pandas 2.0); `pd.concat`
    # is the supported equivalent.
    new_df = pd.concat([X, aug_df], ignore_index=True, sort=False)
    return new_df

In [None]:
# (rows, columns) of the combined train+val frame before augmentation.
train_val_df.shape

(171050, 3)

In [None]:
# Run the augmentation over the combined train+val frame.
# NOTE(review): the augmenter is called once per row, so this is presumably slow on the full data — confirm.
aug_df = set_augmentation(train_val_df)

In [None]:
# (rows, columns) of the frame returned by set_augmentation.
aug_df.shape

In [None]:
# Display the frame returned by set_augmentation.
aug_df

### Shuffle

In [None]:
# Shuffle all rows and renumber the index 0..n-1. A fixed seed (the same
# value used for the train/test split) keeps the shuffle reproducible across
# kernel restarts; the previous np.random.permutation call was unseeded.
shuf_df = aug_df.sample(frac=1, random_state=42).reset_index(drop=True)

### Split train/val

In [None]:
# Separate the shuffled frame back into train and validation rows using the
# `indic` tag; the `indic` column itself is kept in both results.
data_train = shuf_df.loc[shuf_df["indic"] == "train"]
data_val = shuf_df.loc[shuf_df["indic"] == "val"]

# **Save preproc data:**

In [None]:
# Write the training split to Drive as a comma-separated UTF-8 CSV,
# including the header row and the index column.
data_train.to_csv(
    '/content/drive/My Drive/NLP_humor/data/splits/jokes_TRAIN.csv',
    encoding='utf-8', sep=',',
    index=True, header=True,
)

In [None]:
# Write the validation split to Drive as a comma-separated UTF-8 CSV,
# including the header row and the index column.
data_val.to_csv(
    '/content/drive/My Drive/NLP_humor/data/splits/jokes_VAL.csv',
    encoding='utf-8', sep=',',
    index=True, header=True,
)

In [None]:
# Write the test split to Drive as a comma-separated UTF-8 CSV,
# including the header row and the index column.
data_test.to_csv(
    '/content/drive/My Drive/NLP_humor/data/splits/jokes_TEST.csv',
    encoding='utf-8', sep=',',
    index=True, header=True,
)

### Load & Read data: 

In [None]:
# Load the saved training split. The first CSV column is the index written by
# `to_csv(index=True)`; read it positionally and label it `id`.
# `.set_index('id')` raised KeyError whenever that column had been written
# from an unnamed index (it is then read back as 'Unnamed: 0').
train = pd.read_csv('/content/drive/My Drive/NLP_humor/data/splits/jokes_TRAIN.csv',
                 sep=',',
                 encoding='utf-8',
                 index_col=0).rename_axis('id')

In [None]:
# Preview the first rows of the reloaded training split.
train.head()

Unnamed: 0_level_0,joke,Log1pRank
id,Unnamed: 1_level_1,Unnamed: 2_level_1
5tz52q,I hate how you cannot even say black paint any...,0.693147
5tz4dd,What is the difference between a Jew in Nazi G...,0.0
5tz319,I recently went to America..... ...and being t...,0.0
5tz2wj,"Brian raises his hand and says, ""He is in Heav...",0.693147
5tz1pc,You hear about the University book store worke...,0.0


In [None]:
# Count missing values per column.
train.isna().sum()

joke         0
Log1pRank    0
dtype: int64

In [None]:
train.dropna(inplace=True)