In [6]:
##################################################
# Introduction to Text Mining and Natural Language Processing
##################################################
# Data pre-processing
# Data visualization
# Modelling

In [7]:
##################################################
# Sentiment Analysis and Sentiment Modeling for Amazon Reviews
##################################################

In [8]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from textblob import Word, TextBlob
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.sentiment import SentimentIntensityAnalyzer
from warnings import filterwarnings

In [9]:
# Silence library warnings, then apply the pandas display options used by every
# table rendered below: show all columns, two-decimal floats, 200-char width.
filterwarnings('ignore')
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.float_format', lambda number: '%.2f' % number),
    ('display.width', 200),
):
    pd.set_option(option_name, option_value)

In [10]:
###############################
# TEXT PRE-PROCESSING
###############################

# Load the review subset from the relative input path.
df = pd.read_csv("../input/amazonreviews-sub/df_sub.csv", sep=",")
# info() prints its summary itself, whereas head() is only rendered by Jupyter
# when it is the last expression of the cell, so head() goes last.
df.info()
df.head()

In [11]:
###############################
# Normalizing Case Folding
###############################
# Replace the review column with its lower-cased version.
df = df.assign(reviewText=df['reviewText'].str.lower())

In [12]:
###############################
# Punctuations
###############################
# Drop every character that is neither a word character nor whitespace.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the text unchanged.
df['reviewText'] = df['reviewText'].str.replace(r'[^\w\s]', '', regex=True)

In [13]:
###############################
# Numbers
###############################
# Remove all digits. regex=True is required: since pandas 2.0 str.replace
# treats the pattern as a literal string by default, so '\d' would never match.
df['reviewText'] = df['reviewText'].str.replace(r'\d', '', regex=True)

In [14]:
###############################
# Stopwords
###############################
# nltk.download('stopwords')
sw = stopwords.words('english')
# `sw` is kept as the original list; the frozenset is only a fast lookup so
# each token is checked in O(1) instead of scanning the whole list.
sw_lookup = frozenset(sw)
# str() coerces any non-string value (e.g. a missing review) before splitting.
# NOTE(review): punctuation was stripped in an earlier cell, so apostrophe forms
# in the stopword list no longer match tokens such as "dont" - confirm intended.
df['reviewText'] = df['reviewText'].apply(
    lambda text: " ".join(token for token in str(text).split() if token not in sw_lookup)
)

In [15]:
###############################
# Rarewords / Custom Words
###############################

# sil = pd.Series(' '.join(df['reviewText']).split()).value_counts()[-1000:]
# df['reviewText'] = df['reviewText'].apply(lambda x: " ".join(x for x in x.split() if x not in sil))

In [16]:
###############################
# Tokenization
###############################

# nltk.download("punkt")
# Preview only: take the first five reviews *before* tokenizing so the whole
# corpus is not tokenized just to display five rows. The column is not modified.
df["reviewText"].head().apply(lambda text: TextBlob(text).words)

In [17]:
###############################
# Lemmatization
###############################
# nltk.download('wordnet')
def _lemmatize_review(text):
    """Return `text` with each whitespace-separated token replaced by its lemma."""
    lemmas = [Word(token).lemmatize() for token in text.split()]
    return " ".join(lemmas)


df['reviewText'] = df['reviewText'].map(_lemmatize_review)

df['reviewText'].head(10)

In [18]:
###############################
# Term Frequency Calculations
###############################

# Corpus-wide count of every token. The text is split into one token per row
# (explode) and counted once, instead of building a per-review value_counts
# frame (documents x vocabulary) and summing it; this also avoids the
# deprecated top-level pd.value_counts. Rows come out ordered by descending
# frequency, and counts are integers.
tf = (
    df["reviewText"]
    .str.split()
    .explode()
    .value_counts()
    .reset_index()
)
tf.columns = ["words", "tf"]
tf.head()
tf.shape
tf["words"].nunique()
tf["tf"].describe([0.05, 0.10, 0.25, 0.50, 0.75, 0.80, 0.90, 0.95, 0.99]).T

In [19]:
###############################
# Barplot
###############################

# Plot only the terms whose corpus frequency exceeds 500.
frequent_terms = tf.loc[tf["tf"] > 500]
frequent_terms.plot(kind="bar", x="words", y="tf")
plt.show()

In [20]:
###############################
# Wordcloud
###############################

# One long string containing every review, separated by single spaces.
text = " ".join(df["reviewText"])

# First cloud: library defaults.
wordcloud = WordCloud().generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# Second cloud: capped font size and word count on a white background,
# drawn on a fresh figure.
cloud_options = dict(max_font_size=50, max_words=100, background_color="white")
wordcloud = WordCloud(**cloud_options).generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

# wordcloud.to_file("WordCloud.png")

In [28]:
# nltk.download('vader_lexicon')
# Build the analyzer once; the cells below reuse `sia`.
sia = SentimentIntensityAnalyzer()
sample_sentence = "I liked this music but it is not good as the other one"
sia.polarity_scores(sample_sentence)

In [29]:
# Full polarity-score dict for each of the first ten reviews.
df["reviewText"].iloc[:10].map(sia.polarity_scores)

In [30]:
# Only the "compound" score for each of the first ten reviews.
df["reviewText"].iloc[:10].map(lambda review: sia.polarity_scores(review)["compound"])

In [31]:
# "pos" when the compound score is strictly positive, otherwise "neg" (first ten reviews).
(
    df["reviewText"]
    .iloc[:10]
    .map(lambda review: sia.polarity_scores(review)["compound"])
    .gt(0)
    .map({True: "pos", False: "neg"})
)

In [32]:
# Label every review: "pos" when its compound score is strictly positive, else "neg".
compound_scores = df["reviewText"].map(lambda review: sia.polarity_scores(review)["compound"])
df["sentiment_label"] = np.where(compound_scores > 0, "pos", "neg")

In [34]:
# Mean of the "overall" column within each sentiment label.
df["overall"].groupby(df["sentiment_label"]).mean()

In [35]:
# ngram
# Two-line sample text; the last expression lists its word 3-grams.
a = """I wouldn't set a longer text to be understood that way.
N-grams show and produce combinations of uses for use together"""

sample_blob = TextBlob(a)
sample_blob.ngrams(n=3)


In [36]:
##################
# Count Vectors
##################

from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus used by the vectorizer demonstrations.
corpus = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?']

# word frequency
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0).
vectorizer.get_feature_names_out()
X.toarray()


# n-gram frequency (word bigrams only)
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X2 = vectorizer2.fit_transform(corpus)
vectorizer2.get_feature_names_out()
X2.toarray()

In [37]:
##################
# TF-IDF
##################

# Normalized numerical representations of word/sentence representations (words, characters, n-grams).

# TF-IDF = TF(t) * IDF(t)
# TF(t) = (Frequency of a t term observed in the relevant document) / (Total number of terms in the document)
# IDF(t) = log_e(Total number of documents / Number of documents with t terms)
# NOTE: the formulas above are the textbook definition. With its default
# settings TfidfVectorizer uses a smoothed idf, ln((1 + n) / (1 + df(t))) + 1,
# and L2-normalizes each row, so the values below differ from a hand calculation.

# word tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='word')
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available since 1.0).
vectorizer.get_feature_names_out()
X.toarray()

# n-gram tf-idf (word bigrams and trigrams)
vectorizer = TfidfVectorizer(ngram_range=(2, 3))
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()
X.toarray()

In [38]:

##################
# Word Embeddings
##################

# Word Embeddings (Word2Vec, GloVe, BERT)

In [39]:
###############################
# FEATURE ENGINEERING
###############################

# Test-Train
train_x, test_x, train_y, test_y = train_test_split(df["reviewText"],
                                                    df["sentiment_label"],
                                                    random_state=17)

train_x[0:5]
train_y[0:5]

# Learn the label -> integer mapping from the training labels only and reuse
# it for the test labels. Refitting on the test split (fit_transform) could
# produce a different mapping whenever the two splits do not contain the same
# set of classes.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.transform(test_y)

# Count Vectors
vectorizer = CountVectorizer()
vectorizer.fit(train_x)
x_train_count = vectorizer.transform(train_x)
x_test_count = vectorizer.transform(test_x)
# get_feature_names() was removed in scikit-learn 1.2.
vectorizer.get_feature_names_out()[0:10]
# Inspect the shape instead of calling toarray(): the dense copy was discarded
# anyway and would hold documents x vocabulary cells in memory.
x_train_count.shape

# Every vectorizer below is fitted on the training text only and then applied
# to both splits, so train and test share one feature space.

# TF-IDF Word Level
tf_idf_word_vectorizer = TfidfVectorizer().fit(train_x)
x_train_tf_idf_word = tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word = tf_idf_word_vectorizer.transform(test_x)

# TF-IDF N-Gram Level
tf_idf_ngram_vectorizer = TfidfVectorizer(ngram_range=(2, 3)).fit(train_x)
x_train_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(train_x)
x_test_tf_idf_ngram = tf_idf_ngram_vectorizer.transform(test_x)

# TF-IDF Characters Level
tf_idf_chars_vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2, 3)).fit(train_x)
x_train_tf_idf_chars = tf_idf_chars_vectorizer.transform(train_x)
x_test_tf_idf_chars = tf_idf_chars_vectorizer.transform(test_x)

In [41]:
###############################
# MODELING (SENTIMENT MODELING)
###############################

# TF-IDF Word-Level Logistic Regression
log_model = LogisticRegression().fit(x_train_tf_idf_word, train_y)
y_pred = log_model.predict(x_test_tf_idf_word)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(test_y, y_pred))

# NOTE(review): cross_val_score refits the estimator on folds of the *test*
# split, so this number does not measure the model fitted above - confirm
# that this is the intended evaluation.
cross_val_score(log_model, x_test_tf_idf_word, test_y, cv=5).mean()

# Hand-written comments to classify. Previously each assignment overwrote the
# one before it, so only the last comment was ever scored.
new_comment = pd.Series(["this film is great",
                         "look at that shit very bad",
                         "it was good but I am sure that it fits me"])

# The model was trained on TF-IDF word features, so new text must go through
# the same fitted vectorizer. A freshly fitted CountVectorizer yields raw
# counts, which are not the features the model was trained on.
new_comment = tf_idf_word_vectorizer.transform(new_comment)

log_model.predict(new_comment)

# Score one randomly drawn review from the dataset.
random_review = pd.Series(df["reviewText"].sample(1).values)
new_comment = tf_idf_word_vectorizer.transform(random_review)
log_model.predict(new_comment)

In [42]:
# RandomForestClassifier
# TF-IDF Word-Level
# A fixed seed (same value as the train/test split) makes the forest, and
# therefore the score, reproducible across runs.
rf_model = RandomForestClassifier(random_state=17).fit(x_train_tf_idf_word, train_y)
# NOTE(review): cross_val_score refits the estimator on folds of the test
# split; the fit above is not what gets scored - confirm intended.
cross_val_score(rf_model, x_test_tf_idf_word, test_y, cv=5, n_jobs=-1).mean()

In [43]:
# TF-IDF N-GRAM
# A fixed seed (same value as the train/test split) makes the forest, and
# therefore the score, reproducible across runs.
rf_model = RandomForestClassifier(random_state=17).fit(x_train_tf_idf_ngram, train_y)
# NOTE(review): cross_val_score refits the estimator on folds of the test
# split; the fit above is not what gets scored - confirm intended.
cross_val_score(rf_model, x_test_tf_idf_ngram, test_y, cv=5, n_jobs=-1).mean()

In [44]:
# TF-IDF CHARLEVEL
# A fixed seed (same value as the train/test split) makes the forest, and
# therefore the score, reproducible across runs.
rf_model = RandomForestClassifier(random_state=17).fit(x_train_tf_idf_chars, train_y)
# NOTE(review): cross_val_score refits the estimator on folds of the test
# split; the fit above is not what gets scored - confirm intended.
cross_val_score(rf_model, x_test_tf_idf_chars, test_y, cv=5, n_jobs=-1).mean()

In [45]:
# Count Vectors
# A fixed seed (same value as the train/test split) makes the forest, and
# therefore the score, reproducible across runs.
rf_model = RandomForestClassifier(random_state=17).fit(x_train_count, train_y)
# n_jobs=-1 matches the other random-forest cells; it only parallelizes the folds.
# NOTE(review): cross_val_score refits the estimator on folds of the test
# split; the fit above is not what gets scored - confirm intended.
cross_val_score(rf_model, x_test_count, test_y, cv=5, n_jobs=-1).mean()