# Text Mining and Search

UniMiB 2022/23

**IMDB Reviews**

In [None]:
# initial imports

import io
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import joblib

Merge data - Train

In [None]:
# Collect every raw training review into one DataFrame together with its label.
# Rows are gathered in a plain list and the frame is built once at the end:
# growing a DataFrame with `.loc[len(df)] = ...` re-allocates on every insert,
# which becomes quadratic over the review files.
found = 0  # not read anywhere in this cell
rows = []
for label, folder in (('NEG', '../data/raw/train/neg/'),
                      ('POS', '../data/raw/train/pos/')):
  # os.listdir returns names in arbitrary order; sorting keeps the CSV reproducible
  for file in tqdm(sorted(os.listdir(folder))):
    with io.open(folder + file, mode="r", encoding="utf-8") as f:
      rows.append([f.read(), label])

train_dataset = pd.DataFrame(rows, columns=['text', 'sentiment'])
            

In [None]:
# Persist the merged training reviews. The index is written as well (to_csv default),
# which is why the reading cell selects the 'text' and 'sentiment' columns explicitly.
train_dataset.to_csv('../data/train_dataset.csv')

Merge data - Test

In [None]:
# Collect every raw test review into one DataFrame together with its label.
# Rows are gathered in a plain list and the frame is built once at the end:
# growing a DataFrame with `.loc[len(df)] = ...` re-allocates on every insert,
# which becomes quadratic over the review files.
found = 0  # not read anywhere in this cell
rows = []
for label, folder in (('NEG', '../data/raw/test/neg/'),
                      ('POS', '../data/raw/test/pos/')):
  # os.listdir returns names in arbitrary order; sorting keeps the CSV reproducible
  for file in tqdm(sorted(os.listdir(folder))):
    with io.open(folder + file, mode="r", encoding="utf-8") as f:
      rows.append([f.read(), label])

test_dataset = pd.DataFrame(rows, columns=['text', 'sentiment'])


In [None]:
# Persist the merged test reviews. The index is written as well (to_csv default),
# which is why the reading cell selects the 'text' and 'sentiment' columns explicitly.
test_dataset.to_csv('../data/test_dataset.csv')

Read data

In [None]:
# Load the merged training reviews and keep only the two data columns
# (this drops the index column stored in the CSV).
train = pd.read_csv('../data/train_dataset.csv')[['text', 'sentiment']]
train.head()

In [None]:
# Load the merged test reviews and keep only the two data columns
# (this drops the index column stored in the CSV).
test = pd.read_csv('../data/test_dataset.csv')[['text', 'sentiment']]
test.head()

### Text Pre-Processing

In [None]:
import string
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Fetch every NLTK resource the preprocessing helpers rely on, not just the
# stopword lists: WordNetLemmatizer reads the WordNet corpus, word_tokenize
# needs the Punkt models and nltk.pos_tag needs the perceptron tagger.
# NOTE(review): recent NLTK releases name the last two 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' - add those if lookups still fail.
for resource in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

from preprocess import *

In [None]:
# Shared WordNet lemmatizer instance, used by lemmatizer_function below.
lemmatizer = WordNetLemmatizer()

# function to strip every run of digits from the text
def remove_numbers(text_to_preprocess):
    """Return *text_to_preprocess* with each run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text_to_preprocess)

# function to remove all the punctuation marks from the text
def remove_punctuation(text):
    """Return the text with every character of ``string.punctuation`` removed.

    ``text`` is normally the string to clean. For backward compatibility it
    may also be a sequence whose first element is that string (preprocess_text
    hands the text over wrapped in a one-element tuple).
    """
    if not isinstance(text, str):
        # wrapped form: the actual string is the first element
        text = text[0]
    return text.translate(str.maketrans('', '', string.punctuation))

# function to remove all the stopwords from the text
def remove_stopwords(text, language=None):
    """Return the whitespace-separated tokens of *text* that are not stopwords.

    Each kept token is prefixed with a single space, so a non-empty result
    starts with a space (same output format as before).

    language: optional NLTK stopword list name (e.g. 'english'). When omitted,
        ``stopwords.words()`` is called without arguments, exactly as before.
        NOTE(review): with no language NLTK returns the lists of every
        language it ships - confirm this is intended for English reviews.
    """
    # Build the lookup once per call; the original re-read the corpus and
    # scanned the whole list for every single token.
    if language is None:
        stop_set = set(stopwords.words())
    else:
        stop_set = set(stopwords.words(language))
    return ''.join(' ' + item for item in text.split() if item not in stop_set)

# function to attach a Part-Of-Speech tag to each word
def postagger(token_words):
    """Return the result of ``nltk.pos_tag`` for the given list of tokens."""
    tagged = nltk.pos_tag(token_words)
    return tagged

# function to collapse whitespace runs and trim both ends
def remove_extra_whitespace(text):
    """Return *text* with its whitespace-separated pieces joined by single spaces."""
    pieces = text.split()
    return " ".join(pieces)

# function to split the text into word tokens
def tokenizer(text):
  """Return the tokens produced by NLTK's ``word_tokenize`` for *text*."""
  tokens = word_tokenize(text)
  return tokens

# function to lemmatize the tokenized words
def lemmatizer_function(tokenized_text):
  """Lemmatize each token and return them as one string.

  Every lemma is preceded by a single space, so a non-empty result starts
  with a space.
  """
  return ''.join(' ' + lemmatizer.lemmatize(word) for word in tokenized_text)


# function to preprocess the text by lowercasing, removing numbers, punctuation, stopwords, extra whitespaces and lemmatizing
def preprocess_text(text):
    """Run the full cleaning pipeline on one review and return the cleaned string.

    text: a single review as a ``str`` (the function is applied element by
        element through ``Series.progress_apply``).

    Steps, in order: lowercase, drop digits, drop punctuation, drop stopwords,
    collapse whitespace, tokenize, lemmatize. The result is the string built
    by lemmatizer_function.
    """
    # a plain str has no `.str` accessor; lowercase it directly
    text = text.lower()
    no_nums = remove_numbers(text)
    # remove_punctuation reads element 0 of a sequence argument, so the string
    # is handed over wrapped in a one-element tuple (previously this happened
    # by accident through a trailing comma).
    no_punct = remove_punctuation((no_nums,))
    no_stopw = remove_stopwords(no_punct)
    no_whtspace = remove_extra_whitespace(no_stopw)
    tokenized = tokenizer(no_whtspace)
    lemmatized = lemmatizer_function(tokenized)
    return lemmatized

In [None]:
import numpy as np
from multiprocessing import Pool
from preprocess import preprocess_loader
if __name__ == '__main__':
  # Split the frame into 10 chunks and preprocess them in 10 worker processes.
  # NOTE(review): `train_ds` is not defined in the visible part of this
  # notebook (the loaded frame is called `train`) - confirm the intended name.
  df_split = np.array_split(train_ds, 10)
  # The context manager releases the worker processes even when map() raises;
  # map() itself blocks until every chunk has been processed.
  with Pool(10) as pool:
    df = pd.concat(pool.map(preprocess_loader, df_split))

In [None]:
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Run preprocess_text on each review, one element at a time, and store the
# result in a new column next to the raw text.
train['preprocessed_text'] = train['text'].progress_apply(preprocess_text)
train.head()

In [None]:
from bs4 import BeautifulSoup

# strip HTML markup, keeping only the text content
def strip_html(text):
    """Parse *text* with BeautifulSoup's "html.parser" and return its plain text."""
    return BeautifulSoup(text, "html.parser").get_text()

# removing the square brackets
def remove_between_square_brackets(text):
    """Return *text* with every ``[...]`` span (brackets included) deleted."""
    # raw string: '\[' in a normal string literal is an invalid escape sequence
    return re.sub(r'\[[^]]*\]', '', text)

### Data Exploration

In [None]:
# Load the preprocessed training reviews, keeping only the three data columns.
train_pp = pd.read_csv('../data/preprocessed_train.csv')[['text', 'sentiment', 'preprocessed_text']]
train_pp.head()

In [None]:
# Load the preprocessed test reviews, keeping only the three data columns.
test_pp = pd.read_csv('../data/preprocessed_test.csv')[['text', 'sentiment', 'preprocessed_text']]
test_pp.head()

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# get the text and join all the reviews in training set
# creating the text variable
text = " ".join(cat for cat in train_pp.preprocessed_text)

# limit the word count and set the stopwords
# (kept under its own name so the NLTK `stopwords` module used by
# remove_stopwords is not overwritten by a plain set)
wordcount = 500
cloud_stopwords = set(STOPWORDS)
cloud_stopwords.add("english")

# setup, generate and save the word cloud image to a file
wc = WordCloud(scale=5,
               background_color="grey",
               max_words=wordcount,
               stopwords=cloud_stopwords)
wc.generate(text)
wc.to_file("../figures/WordCloud_train.png")

# show the wordcloud as output
# the figure is created before drawing; creating it after imshow left a
# second, empty figure in the output
plt.figure()
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# get the text and join all the reviews in test set
# creating the text variable
text = " ".join(cat for cat in test_pp.preprocessed_text)

# limit the word count and set the stopwords
# (kept under its own name so the NLTK `stopwords` module used by
# remove_stopwords is not overwritten by a plain set)
wordcount = 500
cloud_stopwords = set(STOPWORDS)
cloud_stopwords.add("english")

# setup, generate and save the word cloud image to a file
wc = WordCloud(scale=5,
               background_color="grey",
               max_words=wordcount,
               stopwords=cloud_stopwords)
wc.generate(text)
wc.to_file("../figures/WordCloud_test.png")

# show the wordcloud as output
# the figure is created before drawing; creating it after imshow left a
# second, empty figure in the output
plt.figure()
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# split every preprocessed review into its whitespace-separated tokens once
tokens_per_review = [review.split() for review in train_pp.preprocessed_text]

# print number of unique words
# (the previous np.unique over the column counted distinct reviews, not words)
print("Number of words: ")
print(len({word for tokens in tokens_per_review for word in tokens}))
print ()

# print the average review length, measured in words
# (len() of the raw string measured characters)
print("Average review length:")
result = [len(tokens) for tokens in tokens_per_review]
print("Mean %.2f words (%f)" % (np.mean(result), np.std(result)))

In [None]:
# split every preprocessed review into its whitespace-separated tokens once
tokens_per_review = [review.split() for review in test_pp.preprocessed_text]

# print number of unique words
# (the previous np.unique over the column counted distinct reviews, not words)
print("Number of words: ")
print(len({word for tokens in tokens_per_review for word in tokens}))
print ()

# print the average review length, measured in words
# (len() of the raw string measured characters)
print("Average review length:")
result = [len(tokens) for tokens in tokens_per_review]
print("Mean %.2f words (%f)" % (np.mean(result), np.std(result)))

### Text Representation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer      #-- Bag of Words
from sklearn.feature_extraction.text import TfidfVectorizer      #-- Tf-Idf

import joblib

In [None]:
# Preprocessed review text of each split, used as input to the vectorizers below.
text_preprocessed_train = train_pp['preprocessed_text']
text_preprocessed_test = test_pp['preprocessed_text']

Bag-of-Words (BoW)

In [None]:
# Count-based bag-of-words representation.
vectorizer = CountVectorizer(ngram_range=(1,2),                                 #-- Uni-grams and Bi-grams
                             max_features = 25000)                              #-- Most 25000 frequent grams across the text

# The vocabulary is fitted on the training text only and then reused for the test text.
# .toarray() turns the vectorizer output into a dense array.
X_text_bow =  vectorizer.fit_transform(text_preprocessed_train).toarray()
# NOTE: despite the `y_` prefix this is the test-set feature matrix, not labels.
y_text_bow =  vectorizer.transform(text_preprocessed_test).toarray()

In [None]:
print(X_text_bow.shape)
print(y_text_bow.shape)

In [None]:
joblib.dump(X_text_bow, 'processed_train_bow.save')

In [None]:
joblib.dump(y_text_bow, 'processed_test_bow.save')

Binary

In [None]:
'''
text_preprocessed = train_pp['preprocessed_text']

vectorizer = CountVectorizer(binary = True, max_features = 25000)
X_text_binary =  vectorizer.fit_transform(text_preprocessed)
'''

In [None]:
'''
print(X_text_bow.shape)
'''

In [None]:
'''
import joblib
joblib.dump(X_text_bow, 'processed_train_binary_bow.save')
'''

TF-IDF

In [None]:
# TF-IDF representation over uni-grams and bi-grams, capped at 25000 features.
# This rebinds `vectorizer`, replacing the bag-of-words vectorizer.
vectorizer = TfidfVectorizer(max_features=25000, ngram_range=(1, 2))

# Fitted on the training text only and then reused for the test text.
X_text_tfidf =  vectorizer.fit_transform(text_preprocessed_train).toarray()
# NOTE: despite the `y_` prefix this is the test-set feature matrix, not labels.
y_text_tfidf =  vectorizer.transform(text_preprocessed_test).toarray()

In [None]:
print(X_text_tfidf.shape)
print(y_text_tfidf.shape)

In [None]:
joblib.dump(X_text_tfidf, 'processed_train_tfidf.save')

In [None]:
joblib.dump(y_text_tfidf, 'processed_test_tfidf.save')

### Text Classification

In [None]:
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import joblib
from sklearn.preprocessing import LabelEncoder

In [None]:
labels_train = train_pp['sentiment']

# Learn the label -> integer mapping from the training labels and encode
# them in a single call.
encoder = LabelEncoder()
encoded_labels_train = encoder.fit_transform(labels_train)

In [None]:
labels_test = test_pp['sentiment']

# Encode with the encoder that was fitted on the training labels instead of
# fitting a second one on the test labels: the integer codes are then
# guaranteed to mean the same thing in both splits, and a label that never
# occurred in training raises an error instead of silently shifting the codes.
encoded_labels_test = encoder.transform(labels_test)

In [None]:
# Reload the saved training feature matrices.
# NOTE(review): the classifier cells below are fitted on X_text_bow / X_text_tfidf,
# not on these reloaded copies - confirm which one is intended.
X_train_bow = joblib.load('processed_train_bow.save')
X_train_tfidf = joblib.load('processed_train_tfidf.save')

In [None]:
# Reload the saved test feature matrices (features, despite the `y_` prefix).
# NOTE(review): the prediction cells below use y_text_bow / y_text_tfidf,
# not these reloaded copies - confirm which one is intended.
y_test_bow = joblib.load('processed_test_bow.save')
y_test_tfidf = joblib.load('processed_test_tfidf.save')

Support Vector Machines (SVM)

In [None]:
from sklearn.svm import LinearSVC

In [None]:
clf_bow = LinearSVC(C=0.001)

In [None]:
clf_bow.fit(X_text_bow, encoded_labels_train)

In [None]:
# Predict on the bag-of-words test features and report per-class metrics.
preds_bow = clf_bow.predict(y_text_bow)
print(classification_report(encoded_labels_test, preds_bow))

In [None]:
clf_tfidf = LinearSVC(C=0.001)

In [None]:
clf_tfidf.fit(X_text_tfidf, encoded_labels_train)

In [None]:
# Predict on the TF-IDF test features and report per-class metrics.
preds_tfidf = clf_tfidf.predict(y_text_tfidf)
print(classification_report(encoded_labels_test, preds_tfidf))

## Multi-layer perceptron

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import scipy

TF-IDF

In [None]:
# One input unit per TF-IDF feature column. `shape` has to be a tuple:
# `(25000)` without a trailing comma is just the integer 25000. Reading the
# width from the feature matrix keeps the model in step with the vectorizer.
inputs = tf.keras.Input(shape=(X_text_tfidf.shape[1],))

# Three ReLU hidden layers followed by a single sigmoid unit for the binary label.
x = layers.Dense(128, activation="relu")(inputs)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dense(32, activation="relu")(x)
prediction = layers.Dense(1, activation="sigmoid")(x)

# Stop once the validation loss has not improved for 3 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# Keep on disk only the weights of the epoch with the highest validation accuracy.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./checkpoint/best_model.h5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)
model = tf.keras.Model(inputs, prediction)
model.compile(loss="binary_crossentropy",
              optimizer="adam", metrics=["accuracy"], )

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the TF-IDF training data for validation.
# No random_state is passed, so the split differs between runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_text_tfidf, encoded_labels_train, test_size=0.2)

In [None]:
# Train for at most 100 epochs, validating on the held-out split; the
# checkpoint and early-stopping callbacks are the ones built with the model.
history = model.fit(X_train, Y_train, epochs=100,
          callbacks=[model_checkpoint_callback, callback], validation_data=(X_val, Y_val))

In [None]:
# Save only the per-epoch metrics dict. The History callback object itself
# keeps a reference to the model, which makes it a poor candidate for
# pickling; `history.history` is a plain dict of lists.
joblib.dump(history.history, './mlp_train_history.save')

In [None]:
model.save('./mlp_tfidf_save')


In [None]:
preds = model.predict(y_text_tfidf)

In [None]:
preds = np.round(preds)

In [None]:
# Flatten the rounded model outputs into a plain list of ints
# (the first entry of each output row).
predictions = [int(row[0]) for row in preds]

In [None]:
# Report on the flattened integer predictions built in the previous cell;
# `preds` is still the rounded float output of model.predict.
print(classification_report(encoded_labels_test, predictions))

BoW

In [None]:
# One input unit per bag-of-words feature column. `shape` has to be a tuple:
# `(25000)` without a trailing comma is just the integer 25000. Reading the
# width from the feature matrix keeps the model in step with the vectorizer.
inputs = tf.keras.Input(shape=(X_text_bow.shape[1],))

# Three ReLU hidden layers followed by a single sigmoid unit for the binary label.
x = layers.Dense(128, activation="relu")(inputs)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dense(32, activation="relu")(x)
prediction = layers.Dense(1, activation="sigmoid")(x)

# Stop once the validation loss has not improved for 3 epochs.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# Keep on disk only the weights of the epoch with the highest validation accuracy.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='./checkpoint/best_model_bow.h5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)
model = tf.keras.Model(inputs, prediction)
model.compile(loss="binary_crossentropy",
              optimizer="adam", metrics=["accuracy"], )


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the bag-of-words training data for validation.
# No random_state is passed, so the split differs between runs.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_text_bow, encoded_labels_train, test_size=0.2)

In [None]:
# Train for at most 100 epochs, validating on the held-out split; the
# checkpoint and early-stopping callbacks are the ones built with the model.
history = model.fit(X_train, Y_train, epochs=100,
                    callbacks=[model_checkpoint_callback, callback], validation_data=(X_val, Y_val))

In [None]:
# Written to its own file: './mlp_train_history.save' is already used for the
# TF-IDF run and would be overwritten here. Only the per-epoch metrics dict is
# saved; the History callback object itself keeps a reference to the model.
joblib.dump(history.history, './mlp_bow_train_history.save')

In [None]:
model.save('./mlp_bow_save')

In [None]:
# Reload the saved bag-of-words MLP and score it on the test features.
model = tf.keras.models.load_model('./mlp_bow_save')
preds = model.predict(y_text_bow)
preds = np.round(preds)
# flatten the rounded outputs into a plain list of ints
predictions = [int(item[0]) for item in preds]

# report on the flattened integer predictions (they were built but never used before)
print(classification_report(encoded_labels_test, predictions))

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression,SGDClassifier

In [None]:
# training the model
lr = LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)

# fitting the model for Bag of words
lr_bow = lr.fit(X_text_bow, encoded_labels_train)
print(lr_bow)

# fitting the model for tfidf features
lr_tfidf = lr.fit(X_text_tfidf, encoded_labels_train)
print(lr_tfidf)

In [None]:
# predicting the model for bag of words
lr_bow_predict = lr.predict(y_text_bow)
print(lr_bow_predict)

# predicting the model for tfidf features
lr_tfidf_predict = lr.predict(y_text_tfidf)
print(lr_tfidf_predict)

In [None]:
# accuracy score for bag of words
# (fraction of test reviews whose predicted label equals the encoded true label)
lr_bow_score = accuracy_score(encoded_labels_test, lr_bow_predict)
print("lr_bow_score :",lr_bow_score)

# accuracy score for tfidf features
lr_tfidf_score = accuracy_score(encoded_labels_test, lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

In [None]:
# target_names are matched to the labels in sorted order. LabelEncoder sorts
# its classes, so 'NEG' is encoded as 0 and 'POS' as 1: the negative class
# name has to come first, otherwise the two rows of the report are mislabelled.

# classification report for bag of words
lr_bow_report = classification_report(encoded_labels_test, lr_bow_predict, target_names=['Negative','Positive'])
print(lr_bow_report)

# classification report for tfidf features
lr_tfidf_report = classification_report(encoded_labels_test, lr_tfidf_predict, target_names=['Negative','Positive'])
print(lr_tfidf_report)

### Text Clustering

#### Density Clustering

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Reload both preprocessed splits for the clustering section.
train_pp = pd.read_csv('../data/preprocessed_train.csv')
train_pp = train_pp[['text', 'sentiment', 'preprocessed_text']]

test_pp = pd.read_csv('../data/preprocessed_test.csv')
test_pp = test_pp[['text', 'sentiment', 'preprocessed_text']]

# Here X_train / X_test are the preprocessed text columns; this rebinds the
# X_train name used for the MLP validation split above.
X_train = train_pp['preprocessed_text']
X_test = test_pp['preprocessed_text']

In [None]:
# NOTE: from here on train_pp / test_pp are the 'preprocessed_text' Series,
# no longer DataFrames - cells that index them by column must not be re-run
# after this one.
train_pp = pd.read_csv('../data/preprocessed_train.csv')
test_pp = pd.read_csv('../data/preprocessed_test.csv')
train_pp = train_pp['preprocessed_text']
test_pp = test_pp['preprocessed_text']

# All reviews in one Series: training rows first, then test rows.
full_dataframe = pd.concat((train_pp, test_pp), axis=0)

In [None]:
full_dataframe.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  # -- Tf-Idf

# TF-IDF over uni-grams and bi-grams, capped at 10000 features for clustering.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2),)

# Fitted on the training text, then applied to the test text. No .toarray()
# here, so the vectorizer output is kept as returned.
# This rebinds X_train_tfidf, which earlier held the reloaded dense matrix.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF vectors to 200 components.
svd = TruncatedSVD(n_components=200, n_iter=100)
data1 = svd.fit_transform(X_train_tfidf)
# Project the test vectors with the components learned on the training set.
# Calling fit_transform again would refit the SVD on the test data and put the
# two halves into different coordinate systems, which makes no sense once they
# are stacked into a single dataset for clustering.
data2 = svd.transform(X_test_tfidf)

In [None]:
data1.shape

In [None]:
# Persist the reduced matrices: data1 (from the training TF-IDF) and data2 (from the test TF-IDF).
joblib.dump(data1,'./svd_train.save')
joblib.dump(data2,'./svd_test.save')

In [None]:
# Reload in the same order they were saved: data1 = train, data2 = test.
# The two files were swapped, so the stacked matrix built from (data1, data2)
# was ordered test-then-train while `full_dataframe` is train-then-test, and
# cluster labels ended up attached to the wrong reviews.
data1 = joblib.load("./svd_train.save")
data2 = joblib.load("./svd_test.save")

In [None]:
full_dataset_tfidf = np.concatenate((data1, data2))
full_dataset_tfidf.shape

DBSCAN (eps = 0.25)

In [None]:
from sklearn.metrics import silhouette_score

def get_silhouette(labels, data):
  """Print the silhouette score of *data* under the cluster assignment *labels*."""
  silhouette_avg = silhouette_score(X=data, labels=labels)
  print(f"Silhouette is equal to {silhouette_avg}")

In [None]:
from tqdm import tqdm
# Density clustering of the reduced vectors; `clustering` ends up holding the
# label of every row (fit_predict), with -1 marking noise points.
clustering = DBSCAN(min_samples=3, eps=.25)
clustering = clustering.fit_predict(data1)

# Drop the noise points with one boolean mask instead of a Python-level loop
# over every index.
is_clustered = clustering != -1
filtered_data = data1[is_clustered]
filtered_labels = clustering[is_clustered]

get_silhouette(filtered_labels, filtered_data)

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Replace the DBSCAN labels with previously saved ones.
# NOTE(review): nothing in the visible notebook writes this file - presumably
# it comes from an agglomerative-clustering run done elsewhere; confirm.
clustering = joblib.load('./agglomerative_clustering.save')

In [None]:
def plot_word_cloud_data(data, clusters):
  """Show one word cloud per cluster.

  data: iterable of documents; each item is converted with str().
  clusters: cluster label of each document, in the same order as *data*.
      Label -1 is skipped.
  """
  new_data = pd.DataFrame({
      'text': [str(x) for x in data],
      'cluster': clusters,
  })
  for cluster_num in np.unique(clusters):
    if cluster_num == -1:
      continue
    # all documents of this cluster, joined into one string for the cloud
    cluster_texts = new_data.loc[new_data['cluster'] == cluster_num, 'text']
    fullstring = ' '.join(cluster_texts.values)
    wordcloud = WordCloud(max_font_size=50, max_words=100,
                          background_color="white").generate(fullstring)
    plt.figure()
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

In [None]:
plot_word_cloud_data(full_dataframe.values, clustering)

In [None]:
get_silhouette(filtered_labels, filtered_data)

In [None]:
from sklearn.decomposition import PCA
import seaborn as sns

def plot_clustering(points, clusters):
  """Scatter-plot *points* reduced to two PCA components, coloured by *clusters*.

  Also prints the shape of the reduced array.
  """
  projected = PCA(2).fit_transform(points)
  print(projected.shape)
  frame = pd.DataFrame(projected, columns=['x','y'])
  frame['cluster'] = clusters
  sns.scatterplot(data=frame, x="x", y="y", hue="cluster")

In [None]:
plot_clustering(filtered_data, filtered_labels)

k-means

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Partition the stacked SVD vectors into 4 clusters and report the silhouette.
# No random_state is passed, so the assignment can differ between runs.
kMeans = KMeans(n_clusters=4)
clusters = kMeans.fit_predict(full_dataset_tfidf)
get_silhouette(clusters, full_dataset_tfidf)

In [None]:
joblib.dump(clusters, './clustering_kmeans.save')

In [None]:
plot_word_cloud_data(full_dataframe, clusters)

In [None]:
plot_clustering(full_dataset_tfidf, clusters)