# Humour Classification

In [1]:
!pip install gensim
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('wordnet')
from collections import defaultdict

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import gensim
from gensim import corpora
from gensim import utils
import numpy as np
import pickle

from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
import tensorflow as tf

## 1. Load dataset

In [None]:
from google.colab import files
uploads = files.upload()

Saving jokes_and_sentences.csv to jokes_and_sentences.csv


In [4]:
def load_jokes(filename):
  '''Function to load,shuffle and clean data. Incorporates clean_data() function.

  Returns:
  clean(dict): shuffled and formatted data; {'Text': ['example','text'],'Target': [0,1]}
  '''

  df = pd.read_csv('jokes_and_sentences.csv')
  #df = pd.read_csv(filename)

  #shuffles rows of df
  df = df.sample(frac = 1)

  print(df.head(20))

  all_lines = df.to_dict()
  text = list(all_lines['Text'].values())
  labels = list(all_lines['Target'].values())

  clean = defaultdict(list)
  clean['Text'] = text
  clean['Target'] = labels

  for i in range(len(clean['Text'])):
    words = clean['Text'][i]
    cleaned = clean_data(words)
    clean['Text'][i] = cleaned

  return clean



In [5]:
def clean_data(data):
  '''Function to clean text
  Performs lemmatization, removes non alphanumeric characters and removes stop words

  Args:
  data(string): piece of text to be cleaned

  Returns:
  clean_text(string): cleaned text
  '''
  # convert to lower case
  clean_text = data.lower()
  #remove hyphens and sub with space
  clean_text = clean_text.replace("-"," ")
  clean_text = clean_text.replace(":"," ")
  clean_text = clean_text.replace("..."," ")

  # tokenize the data
  clean_text = clean_text.split(" ")

  # remove non alphanumeric chars
  clean_text = " ".join([re.sub(r'[^a-zA-Z0-9]', '', word) for word in clean_text])

  # remove stopwords
  stopword = nltk.corpus.stopwords.words('english')
  review_cleaned = " ".join([word for word in re.split('\W+', clean_text) if word not in stopword])

  # perform lemmatizing
  wn = nltk.WordNetLemmatizer()
  review_cleaned = " ".join([wn.lemmatize(word,'v') for word in re.split('\W+', review_cleaned)])

  return review_cleaned

In [6]:
data = load_jokes('jokes_and_sentences.csv')

                                                   Text  Target
7040  It is a distance race that demands a great dea...       0
5308  Some animals, like people and most vertebrates...       0
657   Wife told me to take the spider out instead of...       1
6563  Both come about because this part of the air g...       0
7064  The open world environment allows players to c...       0
2699  A guy stood over his tee shot for what seemed ...       1
81          Why do cows not have toes? They lactose! \t       1
598   I tried taking some high resolution photos of ...       1
4871  A vaccine often contains something like a germ...       0
6289  And not all long sentences are run-on sentence...       0
1082  A man went to a restaurant and ordered a steak...       1
7852  Percentage is far below consensus. Please try ...       0
2882  It was graduation day and Mom was trying to ta...       1
3839  A feisty 70 year old woman had to call a furna...       1
3707  HOBBIES, TECHNICAL:\n\n\nHOBBIES, 

In [7]:
data['Text'][:10]

['distance race demand great deal endurance well speed strategies also play key role combination make attractive many',
 'animals like people vertebrates two ears animals hear ears spiders small hairs legs hear',
 'wife tell take spider instead kill drink cool guy want web developer ',
 'come part air get colder go cold air become thicker fall warm air become thinner go turn earth move air well air move north south middle earth generally get power sun warmer north south point',
 'open world environment allow players choose want play storyline missions progress players also take part many events come across explore world',
 'guy stand tee shoot seem eternity look look measure distance figure wind direction speed drive partner nutsfinally exasperate partner say hell take long hit goddamn ballthe guy answer wife watch clubhouse want make perfect shotwell hell man dont stand snowball chance hell hit',
 'cow toe lactose ',
 'try take high resolution photos local farmland turn bite grainy ',

## 2. Text Preprocessing (Incorporated into the Load jokes function for ease of use)

In [8]:
for i in range(len(data['Text'])):
    words = data['Text'][i]
    cleaned = clean_data(words)
    data['Text'][i] = cleaned

In [9]:
data['Text'][:10]

['distance race demand great deal endurance well speed strategies also play key role combination make attractive many',
 'animals like people vertebrates two ears animals hear ears spiders small hairs legs hear',
 'wife tell take spider instead kill drink cool guy want web developer ',
 'come part air get colder go cold air become thicker fall warm air become thinner go turn earth move air well air move north south middle earth generally get power sun warmer north south point',
 'open world environment allow players choose want play storyline missions progress players also take part many events come across explore world',
 'guy stand tee shoot seem eternity look look measure distance figure wind direction speed drive partner nutsfinally exasperate partner say hell take long hit goddamn ballthe guy answer wife watch clubhouse want make perfect shotwell hell man dont stand snowball chance hell hit',
 'cow toe lactose ',
 'try take high resolution photos local farmland turn bite grainy ',

#### 2.1 Write cleaned data to a file

In [None]:
with open('cleaned_data.pkl', 'wb') as f:
  pickle.dump(data['Text'], f)

In [None]:
with open('data_labels.pkl', 'wb') as f:
  pickle.dump(data['Target'], f)

## 3. Word Embeddings

Need to create document vector from the word embeddings. We should list following link in references for report:
https://www.kaggle.com/code/kstathou/word-embeddings-logistic-regression

In [10]:
def create_doc_vec(embeddings, doc, emb_size, agg=True):
  '''Function to take dict of word embeddings and creates document vector as an output
  Args:
    embeddings(dict): embeddings dictionary where keys correspond to words in the vocab
    doc(string): a single document
    emb_size(int): the length of an embedding vector
  Returns:
    doc_vec(array): a 2d array containing the word embeddings
  '''
  doc_ = doc.split()
  doc_vect = []
  for word in doc_:
    try:
      embed = embeddings[word]
      doc_vect.append(embed)
    except:
      pass
      #embed = [0 for i in range(emb_size)]

  doc_vect = np.array(doc_vect)
  if agg:
    doc_vect = np.mean(doc_vect, axis=0)

  return doc_vect


### 3.1 TF-IDF vectorizer

In [None]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the data
X_tfidf = vectorizer.fit_transform(data['Text'])

In [None]:
X_tfidf.shape

(7880, 29369)

In [None]:
# Saving the TF_IDF to a file
with open('embeddings_tfidf.pkl', 'wb') as f:
  pickle.dump(X_tfidf, f)

### 3.2 Training Word2Vec model

In [11]:
dictionary = corpora.Dictionary(line.split() for line in data['Text'])
dictionary

<gensim.corpora.dictionary.Dictionary at 0x78b6dd48f040>

In [12]:
class MyCorpus:
    def __iter__(self):
        for line in data['Text']:
          # assume there's one document per line, tokens separated by whitespace
          yield utils.simple_preprocess(line)

In [13]:
corpus = MyCorpus()
print(corpus)

<__main__.MyCorpus object at 0x78b6dd48fd00>


In [14]:
for vector in corpus:
    continue  # load one vector into memory at a time
    print(vector)
    break


In [15]:
model = gensim.models.Word2Vec(sentences=corpus)

In [16]:
vocab = set()
for string in data['Text']:
  words = string.split()
  vocab.update(words)

len(vocab)

29382

In [17]:
embeddings_trained_model_list = []
embeddings_trained_model = {}
for word in vocab:
  try:
    vec_c = model.wv[word]
    embeddings_trained_model_list.append(vec_c)
    embeddings_trained_model[word] = vec_c
  except KeyError:
    continue

In [18]:
#need to check length of each vector
emb_len  = len(embeddings_trained_model)
print(emb_len)
le = []
for i in range(emb_len):
  le.append(len(embeddings_trained_model_list[i]))
le = np.array(le)
emb_vec_size = np.unique(le)

emb_vec_size

5707


array([100])

In [None]:
vec_test = model.wv['king']
vec_test

array([-0.14696124,  0.36991766,  0.09362533, -0.1108419 ,  0.08769319,
       -0.5812112 ,  0.16687095,  0.7445814 , -0.22547688, -0.2111091 ,
       -0.13856097, -0.5161872 , -0.04633697, -0.01273117,  0.02413861,
       -0.33954874, -0.09285438, -0.38825518,  0.07410315, -0.6231759 ,
        0.19963343,  0.221795  ,  0.4367928 , -0.14297172, -0.1284446 ,
        0.11873754, -0.23265192, -0.19718258, -0.16379066,  0.06883798,
        0.4861388 ,  0.12306323,  0.09142508, -0.15280724,  0.14072526,
        0.3024679 , -0.08148299, -0.35445035, -0.2318539 , -0.60254806,
        0.10525856, -0.22507106, -0.28386873,  0.07236446,  0.17296818,
       -0.36758518, -0.3272282 , -0.11837509,  0.16549821,  0.23980342,
        0.09453598, -0.4486291 , -0.08290415, -0.104697  , -0.35850808,
        0.30226302,  0.2999139 , -0.1016961 , -0.38924235,  0.20495047,
        0.11201318, -0.05786983, -0.13116279, -0.01444412, -0.47610587,
        0.21180329,  0.12212019,  0.2593602 , -0.4683321 ,  0.46

#### 3.2.1 Creating document vector for Logistic Regression

In [19]:
#create document vector for logistic regression
total_vect = []
for doc in data['Text']:
  vec = create_doc_vec(embeddings_trained_model, doc, int(emb_vec_size))
  total_vect.append(vec)

In [20]:
#check shape of document vector
total_vect = np.array(total_vect)
total_vect.shape

(7880, 100)

In [None]:
# Save document vector for logistic regression
with open('embeddings_trained_lr.pkl', 'wb') as f:
  pickle.dump(total_vect, f)

#### 3.2.2 Creating Document Vector for LSTM

In [None]:
# Creating document vectors for model trained on our data for LSTM
embeddings_implemented_doc_vec = []

for doc in data['Text']:
  vec = create_doc_vec(embeddings_trained_model, doc, int(emb_vec_size), False)
  embeddings_implemented_doc_vec.append(vec)

In [None]:
# Save the document embeddings obtained from model trained on our data
with open('embeddings_trained_lstm.pkl', 'wb') as f:
  pickle.dump(embeddings_implemented_doc_vec, f)


--------------------

### 3.3 Pre-trained Word2Vec

We will fetch the Word2Vec model trained on part of the Google News dataset covering approximately 3 million words and phrases. Such a model can take hours to train, but since it’s already available, downloading and loading it will be much faster.

In [None]:
# Pretrained Word2Vec model
from gensim.models import Word2Vec
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')



In [None]:
vocab = set()
for string in data['Text']:
  words = string.split()
  vocab.update(words)

len(vocab)

29382

In [None]:
embeddings_pretrained_model = {}
embeddings_pretrained_model_list = []
for word in vocab:
  try:
    vec_c = wv[word]
    embeddings_pretrained_model_list.append(vec_c)
    embeddings_pretrained_model[word] = vec_c
  except KeyError:
    continue

In [None]:
len(embeddings_pretrained_model)

16613

In [None]:
#need to check length of each vector
emb_len = len(embeddings_pretrained_model)
print(emb_len)
le = []

for i in range(emb_len):
  le.append(len(embeddings_pretrained_model_list[i]))

le = np.array(le)
emb_vec_size = np.unique(le)
print(emb_vec_size)

16613
[300]


#### 3.3.1 Creating Document Vector for Logistic Regression

In [None]:
#create document vector for logistic regression
total_pre_vect = []
for doc in data['Text']:
  vec = create_doc_vec(embeddings_pretrained_model, doc, int(emb_vec_size))
  total_pre_vect.append(vec)

In [None]:
#check shape of document vector
total_pre_vect = np.array(total_pre_vect)
total_pre_vect.shape

(7880, 300)

In [None]:
# Save document vector for LR in a file
with open('embeddings_pretrained_lr.pkl', 'wb') as f:
  pickle.dump(total_pre_vect, f)


##### Create Document vectors for LSTM

In [None]:
# Creating document vectors for model pretrained for LSTM
embeddings_pretrained_doc_vec = []

for doc in data['Text']:
  vec = create_doc_vec(embeddings_pretrained_model, doc, int(emb_vec_size), False)
  embeddings_pretrained_doc_vec.append(vec)

In [None]:
len(embeddings_pretrained_doc_vec)

7880

In [None]:
# Save the document embeddings obtained from pretrained model
with open('embeddings_pretrained_lstm.pkl', 'wb') as f:
  pickle.dump(embeddings_pretrained_doc_vec, f)


Note: We averaged the embeddings of the words to construct a document vector of a consisent shape to pass through logistic regression. However, when doing this unfortunately the relationships between each word can get lost. However, we end up with a new/different representation of the words than TFIDF to pass through logistic regression

Due to these limitations, Doc2Vec might be more suitable for logistic regression since it does paragraph embeddings
https://stats.stackexchange.com/questions/299446/word-embeddings-with-logistic-regression

Using the pre-trained model, we can get word embeddings for 16k words as other words are unknown to the pretrained model.

Pre-trained model gives us embeddings for more words compared to the model we trained.

-----------------------------