<a href="https://www.kaggle.com/code/aleksandrmorozov123/natural-language-processing?scriptVersionId=99551937" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# walk the whole /kaggle/input tree and print the full path of every file in it
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Natural Language Processing - step by step**

In [None]:
# import required libraries
import numpy as np
import json
import pandas as pd

In [None]:
# read the tweets CSV into a DataFrame; the following cells work on its 'tweet' column
df = pd.read_csv ('../input/twitter-sentiment-analysis-hatred-speech/test.csv')
# preview the first 5 rows
df.head (5)

**Lowercase**

In [None]:
# lower-case every token; split () + join also collapses each run of whitespace into one space
# (the 'tweet' column is overwritten in place)
df['tweet'] = df['tweet'].apply (lambda x: " ".join (x.lower () for x in x.split ()))
df['tweet']

**Remove punctuation**

In [None]:
# delete every run of characters that is neither a word character (\w) nor whitespace (\s)
df ['tweet'] = df['tweet'].str.replace (r"""[^\w\s]+""","", regex = True)

df['tweet']

**Removing stop words**

In [None]:
import nltk
from nltk.corpus import stopwords

# remove stop words
# a set gives constant-time membership tests in the per-token filter below
stop = set (stopwords.words ('english'))
df['tweet'] = df['tweet'].apply (lambda x: " ".join (x for x in x.split () if x not in stop))
df['tweet']

**Tokenizing step**

In [None]:
from textblob import TextBlob
# split the tweet stored under index label 3 into word tokens
TextBlob (df['tweet'][3]).words

**Stemming**

In [None]:
from nltk.stem import PorterStemmer
st = PorterStemmer ()
# stem every word of the first five tweets; the result is only displayed, df is not modified
df['tweet'][:5].apply (lambda x: " ".join ([st.stem(word) for word in x.split ()]))

**Lemmatizing**

In [None]:
from textblob import Word

# replace each word by its TextBlob lemma and write the result back into the 'tweet' column
df['tweet'] = df['tweet'].apply (lambda x: " ".join ([Word(word).
                                                     lemmatize () for word in x.split ()]))

df['tweet']

**Exploring text data**

In [None]:
# import required libraries

import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string

# compute the frequency of all words
# FreqDist counts the items it is given, so pass individual words rather than whole tweets
tweet_words = [word for tweet in df['tweet'][:30] for word in tweet.split ()]
frequency_dist = nltk.FreqDist (tweet_words)
frequency_dist

In [None]:
# keys of the distribution ordered from the highest to the lowest count
sorted_frequency_dist = sorted (frequency_dist, key = frequency_dist.__getitem__, reverse = True)
sorted_frequency_dist

**Consider words with length greater than 5 and plot**

In [None]:
# keep only the entries whose key is longer than 5 characters
large_words = dict ([(k, v) for k, v in frequency_dist.items () if len (k) > 5])
# rebuild a FreqDist from the filtered counts (note: this rebinds frequency_dist)
frequency_dist = nltk.FreqDist (large_words)
# plot the counts of at most 50 entries, non-cumulative
frequency_dist.plot (50, cumulative = False)

**Build wordcloud**

In [None]:
from wordcloud import WordCloud
# build the cloud directly from the word -> count mapping
tcloud = WordCloud ().generate_from_frequencies (frequency_dist)

# plotting the wordcloud
import matplotlib.pyplot as plt
plt.imshow (tcloud, interpolation = 'bilinear')
plt.axis ("off")
plt.show ()

**Building a text preprocessing pipeline**

In [None]:
# read the data again so the pipeline starts from the raw, unprocessed tweets
datatweet = pd.read_csv ('../input/twitter-sentiment-analysis-hatred-speech/test.csv')
# keep the 'tweet' column of the first 30 rows
dtweet = datatweet ['tweet'][:30]

In [None]:
# write the function to process the tweets
def processRow (row):
    """Clean a pandas Series of tweets and return them as ONE normalised string.

    Parameters
    ----------
    row : pandas.Series of str
        The raw tweets (despite the name, a whole Series, not a single row).

    Returns
    -------
    str
        All cleaned tweets joined by single spaces: lower-cased, URLs replaced by
        the token URL, user mentions by AT_USER, punctuation and purely numeric
        tokens removed, every remaining token lemmatized and then stemmed.
    """
    from nltk.stem import PorterStemmer
    from textblob import Word

    # missing tweets become empty strings instead of breaking the string operations below
    dtweet = row.fillna ("")

    #lower case (split / join also collapses runs of whitespace)
    dtweet = dtweet.apply (lambda x: " ".join (x.lower () for x in x.split ()))

    #Removes unicode strings like "\u002c" and any non-ASCII character
    # regex = True is required: recent pandas treats the pattern as a literal string by default
    dtweet = dtweet.str.replace (r"(\\u[0-9A-Fa-f]+)", "", regex = True)
    dtweet = dtweet.str.replace (r"[^\x00-\x7f]", "", regex = True)

    # convert any url to URL
    dtweet = dtweet.str.replace (r"(www\.[^\s]+)|(https?://[^\s]+)", 'URL', regex = True)

    # convert any @Username to "AT_USER"
    dtweet = dtweet.str.replace (r"@[^\s]+", 'AT_USER', regex = True)

    # remove :( or :)  (literal match; must happen before punctuation is stripped)
    dtweet = dtweet.str.replace (":)", "", regex = False)
    dtweet = dtweet.str.replace (":(", "", regex = False)

    # replace #word with word
    dtweet = dtweet.str.replace (r"#([^\s]+)", r"\1", regex = True)

    # remove multiple exclamation marks, question marks and full stops
    dtweet = dtweet.str.replace (r"([!?.])\1+", ' ', regex = True)

    # remove not alphanumeric symbols
    dtweet = dtweet.str.replace (r"[^\w]", ' ', regex = True)

    # remove additional white spaces
    dtweet = dtweet.str.replace (r"\s+", ' ', regex = True)

    # merge all tweets into one token stream and remove numbers (purely numeric tokens)
    tokens = [tok for tweet in dtweet for tok in tweet.split () if not tok.isdigit ()]

    # lemma
    tokens = [Word (tok).lemmatize () for tok in tokens]

    # stemmer
    st = PorterStemmer ()
    tokens = [st.stem (tok) for tok in tokens]

    # trim
    return " ".join (tokens).strip ('\'"')
    

In [None]:
# call the function with data
# (the returned value is only displayed; dtweet itself is not reassigned)
processRow (dtweet)

**Converting text to features**
- **One Hot encoding**

In [None]:
text = "eat healthy live healthy everyday fix whats broken in 5 words eat healthy live healthy body mind soul spirit"

# one row per token of the sentence, one indicator column per distinct word
pd.get_dummies (text.split ())

**Count vectorizing**

In [None]:
# import the function
from sklearn.feature_extraction.text import CountVectorizer

text = ["user user user never understand dad left young deep in the feels"]

# create the transform (default settings)
vectorizer = CountVectorizer ()

# tokenizing: learn the vocabulary from the document
vectorizer.fit (text)

# encode document as a vector of term counts
vector = vectorizer.transform (text)

# summarize and generating output: term -> column index mapping, then the count vector
print (vectorizer.vocabulary_)
print (vector.toarray ())

**Generating N-grams using TextBlob**

In [None]:
text = 'haroldfriday have a weekend filled with sunbeams everyone healthy weekend'

# import TextBlob
from textblob import TextBlob

# n = 1: every n-gram is a single word
TextBlob (text).ngrams (1)

In [None]:
# n = 2: every n-gram is a pair of consecutive words
TextBlob (text).ngrams (2)

**Bigram-based features for a document**

In [None]:
# import the function
from sklearn.feature_extraction.text import CountVectorizer

text = ['enjoying the sunshine god is good orlando sunshinestate goodlife bosslady joy']

# create the transform: ngram_range = (2, 2) restricts the vocabulary to bigrams
# (without this line the unigram vectorizer of the previous section would be reused)
vectorizer = CountVectorizer (ngram_range = (2, 2))

# tokenizing
vectorizer.fit (text)

# encode document
vector = vectorizer.transform (text)

# summarize and generating output
print (vectorizer.vocabulary_)
print (vector.toarray ())

**Co-occurrence matrix**

In [None]:
# import required libraries
import numpy as np
import nltk
from nltk import bigrams
import itertools

# create function

def co_occurence_matrix (corpus):
    """Build a bigram co-occurrence matrix for a flat sequence of tokens.

    Parameters
    ----------
    corpus : iterable of hashable tokens (for example a list of str), in reading order.

    Returns
    -------
    (matrix, vocab_to_index)
        matrix is a square np.matrix whose entry [i, j] is the number of times the
        token with index i immediately follows the token with index j.
        vocab_to_index maps every distinct token to its row / column index
        (the numbering follows set iteration order, so it is not sorted).
    """
    from collections import Counter

    tokens = list (corpus)
    vocab = list (set (tokens))
    vocab_to_index = {word: i for i, word in enumerate (vocab)}

    # frequency of every (previous, current) pair of adjacent tokens
    bigram_freq = Counter (zip (tokens, tokens [1:]))

    # Initialise co-occurence matrix
    counts = np.zeros ((len (vocab), len (vocab)))

    # row = current word, column = previous word
    for (previous, current), count in bigram_freq.items ():
        counts [vocab_to_index [current]][vocab_to_index [previous]] = count

    # return the matrix and the index
    return np.matrix (counts), vocab_to_index

In [None]:
sentences = [['user', 'never', 'understand', 'dad', 'left', 'young', 'deep', 'in', 'the', 'feels'],
             ['enjoying', 'the', 'sunshine', 'god', 'is', 'good', 'orlando', 'sunshinestate', 'goodlife', 'bosslady', 'joy'],
             ['haroldfriday', 'have', 'a', 'weekend', 'filled', 'with', 'sunbeams', 'everyone', 'healthy', 'weekend']]

# create one list using many lists
merged = list (itertools.chain.from_iterable (sentences))
# co_occurence_matrix returns a (matrix, vocab_to_index) pair
matrix = co_occurence_matrix (merged)

# generate the matrix
# matrix [0] is the co-occurrence matrix itself; matrix [1] is the token -> index mapping
CoMatrixFinal = pd.DataFrame (matrix [0])
print (CoMatrixFinal)

**Hash vectorizer**

In [None]:
# import required libraries
from sklearn.feature_extraction.text import HashingVectorizer

# list of text documents
text = ['and the forecast looks good for the weather all across bolton']

# transform: hash every token into one of 10 feature columns
vectorizer = HashingVectorizer (n_features = 10)

# create the hashing vector (no fit call is made before transform)
vector = vectorizer.transform (text)

# summarize the vector
print (vector.shape)
print (vector.toarray ())

**Converting text to features using TF-IDF**

In [None]:
text = ['i have been working on my anatomy study guide since 5 pm and i am still not done isuck plspassme']

# import TfidVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# create the transform
vectorizer = TfidfVectorizer ()

# tokenize and build vocab
vectorizer.fit (text)

# summarize: term -> column index mapping, then the learned idf weight of every term
print (vectorizer.vocabulary_)
print (vectorizer.idf_)

**Implementing word embeddings**

In [None]:
sentences = [['user', 'never', 'understand', 'dad', 'left', 'young', 'deep', 'in', 'the', 'feels'],
             ['enjoying', 'the', 'sunshine', 'god', 'is', 'good', 'orlando', 'sunshinestate', 'goodlife', 'bosslady', 'joy'],
             ['haroldfriday', 'have', 'a', 'weekend', 'filled', 'with', 'sunbeams', 'everyone', 'the', 'healthy', 'weekend'],
            ['i', 'have', 'been', 'working', 'on', 'my', 'anatomy', 'study', 'guide', 'since', '5', 'pm', 'and', 'i', 'am', 'still', 'not', 'done', 'isuck', 'the', 'plspassme']]

# import required libraries
import gensim
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot


# training the model (sg = 1 selects the skip-gram architecture)
skipgram = Word2Vec(sentences, window = 3, min_count=1,sg = 1)

print (skipgram)

# visualize: project the normalised word vectors onto their first two principal components
X = skipgram.wv.get_normed_vectors()
pca = PCA (n_components = 2)
result = pca.fit_transform (X)

# create a scatter plot of the projection
pyplot.scatter (result [:, 0], result [:, 1])
# size of the trained vocabulary; not used by the plot in this cell
words = len (skipgram.wv)
pyplot.show ()

**Continuous bag of words (CBOW)**

In [None]:
# import required libraries
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot

# example sentences
sentences = [['user', 'never', 'understand', 'dad', 'left', 'young', 'deep', 'in', 'the', 'feels'],
             ['enjoying', 'the', 'sunshine', 'god', 'is', 'good', 'orlando', 'sunshinestate', 'goodlife', 'bosslady', 'joy'],
             ['haroldfriday', 'have', 'a', 'weekend', 'filled', 'with', 'sunbeams', 'everyone', 'the', 'healthy', 'weekend'],
            ['i', 'have', 'been', 'working', 'on', 'my', 'anatomy', 'study', 'guide', 'since', '5', 'pm', 'and', 'i', 'am', 'still', 'not', 'done', 'isuck', 'the', 'plspassme']]

# training the model
# sg = 0 selects the CBOW architecture (sg = 1 would train a skip-gram model instead)
cbow = Word2Vec (sentences, vector_size = 100, window = 3, min_count = 1, sg = 0)
print (cbow)

# save model (written to the current working directory)
cbow.save ('cbow.bin')

# load model back from the file just written
cbow = Word2Vec.load ('cbow.bin')

# visualize: project the normalised word vectors onto their first two principal components
X = cbow.wv.get_normed_vectors()
pca = PCA (n_components = 2)
result = pca.fit_transform (X)

# create a scatter plot of the projection
pyplot.scatter (result [:, 0], result [:, 1])
# size of the trained vocabulary; not used by the plot in this cell
words = len (cbow.wv)
pyplot.show ()

In [None]:
import gensim
# train a Word2Vec model on the sampled tweets
# Word2Vec expects every sentence as a list of tokens; a raw string would be read character by character
tweet_tokens = [tweet.split () for tweet in dtweet]
# min_count = 1: with only a handful of tweets the default threshold would discard most words
model = gensim.models.Word2Vec (tweet_tokens, min_count = 1)

# checking how similarity works
# only words present in the vocabulary can be queried, otherwise most_similar raises KeyError
positive = [w for w in ['book'] if w in model.wv.key_to_index]
negative = [w for w in ['is'] if w in model.wv.key_to_index]
if positive:
    print (model.wv.most_similar (positive = positive, negative = negative))
else:
    print ("'book' does not occur in the sampled tweets")

**Implementing fastText**

In [None]:
# import FastText
from gensim.models import FastText
from sklearn.decomposition import PCA
from matplotlib import pyplot

# Example sentences
sentences = [['user', 'never', 'understand', 'dad', 'left', 'young', 'deep', 'in', 'the', 'feels'],
             ['enjoying', 'the', 'sunshine', 'god', 'is', 'good', 'orlando', 'sunshinestate', 'goodlife', 'bosslady', 'joy'],
             ['haroldfriday', 'have', 'a', 'weekend', 'filled', 'with', 'sunbeams', 'everyone', 'the', 'healthy', 'weekend'],
            ['i', 'have', 'been', 'working', 'on', 'my', 'anatomy', 'study', 'guide', 'since', '5', 'pm', 'and', 'i', 'am', 'still', 'not', 'done', 'isuck', 'the', 'plspassme']]
# min_n / max_n bound the length of the character n-grams FastText builds sub-word vectors from
fast = FastText (sentences, vector_size = 20, window = 1, min_count = 1, workers = 5, min_n = 1, max_n = 2)

# print the model object (this shows a summary of the model, not the vector of a word)
print (fast)

# visualize: project the normalised word vectors onto their first two principal components
X = fast.wv.get_normed_vectors()
pca = PCA (n_components = 2)
result = pca.fit_transform (X)

# create a scatter plot of the projection
pyplot.scatter (result [:, 0], result [:, 1])
# size of the trained vocabulary; not used by the plot in this cell
words = len (fast.wv)
pyplot.show ()

**Extracting noun phrases**

In [None]:
# import required libraries
import nltk
from textblob import TextBlob

# extract noun
blob = TextBlob ('i have been working on my anatomy study guide since 5 pm and i am still not done isuck plspassme')
# the loop variable must not be called np: that would rebind the numpy alias for every later cell
for noun_phrase in blob.noun_phrases:
    print (noun_phrase)

**Find the similarity**

In [None]:
documents = ('eat healthy live healthy everyday fix whats broken in 5 words eat healthy live healthy body mind soul spirit', 
             'and the forecast looks good for the weather all across bolton',
            '3rd bihday amazing hilarious nephew eli ahmir uncle dave love misses')

# import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# compute tfidf: one row per document, one column per vocabulary term
tfidf_vectorizer = TfidfVectorizer ()
tfidf_matrix = tfidf_vectorizer.fit_transform (documents)
tfidf_matrix.shape

In [None]:
# compute similarity for first sentence with rest of the sentences
# (the slice 0:1 keeps just the first document, still as a two-dimensional matrix)
cosine_similarity (tfidf_matrix [0:1], tfidf_matrix)

**Tagging part of speech**

In [None]:
text = 'one of the worlds greatest spoing events lemans24 team audi'

# import required libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
stop_words = set (stopwords.words ('english'))

# tokenize the text into sentences
tokens = sent_tokenize (text)

# generate tagging for all tokens using loop
for i in tokens:
    # split the sentence into words and drop the stop words
    words = nltk.word_tokenize (i)
    words = [w for w in words if not w in stop_words]
    # POS-tagger
    # NOTE: tags is overwritten on every iteration, so it ends up holding the last sentence only
    tags = nltk.pos_tag (words)
tags

**Using SpaCy**

In [None]:
import spacy
# load the 'en_core_web_sm' pipeline (it has to be installed in the environment)
nlp = spacy.load ('en_core_web_sm')

# create a sentence
doc = nlp(u'one of the worlds greatest sporting events lemans24 team audi')

# print every recognised entity with its character span and label
for ent in doc.ents:
    print (ent.text, ent.start_char, ent.end_char, ent.label_)

**Extracting topics from text**

In [None]:
doc1 = "lipo-light helped shape her, and it can help shape you. learn more@user loseinches burnfat result"
doc2 = "one of the worlds greatest sporting events lemans24 team audi"
doc3 = "and the forecast looks good for the weather all across bolton"

doc_complete = [doc1, doc2, doc3]
doc_complete

In [None]:
# import required libraries
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

# text preprocessing: module-level helpers read by clean () below
stop = set (stopwords.words ('english'))   # English stop words
exclude = set (string.punctuation)         # single punctuation characters
lemma = WordNetLemmatizer ()
def clean (doc):
    """Normalise one document for topic modelling.

    The text is lower-cased and split on whitespace, tokens found in the
    module-level ``stop`` collection are dropped, every character contained in
    the module-level ``exclude`` set is removed, and each remaining word is
    passed through ``lemma.lemmatize``.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The cleaned words joined by single spaces.
    """
    stop_free = " ".join ([i for i in doc.lower ().split () if i not in stop])
    # join on the empty string: joining the characters with " " would split every word into letters
    punc_free = "".join (ch for ch in stop_free if ch not in exclude)
    normalized = " ".join (lemma.lemmatize (word) for word in punc_free.split ())
    return normalized

# one list of cleaned tokens per document
doc_clean = [clean (doc).split () for doc in doc_complete]
doc_clean

**Preparing document term matrix**

In [None]:
# importing gensim
import gensim
from gensim import corpora

# creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary (doc_clean)

# converting a list of documents (corpus) into Document-Term matrix using dictionary prepared above
# (doc2bow turns each document into a bag-of-words list of (term id, count) pairs)
doc_term_matrix = [dictionary.doc2bow (doc) for doc in doc_clean]

doc_term_matrix

**LDA model**

In [None]:
# alias for gensim's LDA model class
Lda = gensim.models.ldamodel.LdaModel

# running and training the LDA model on the document-term matrix for 3 topics
ldamodel = Lda (doc_term_matrix, num_topics = 3, id2word = dictionary, passes = 50)

# results: ask for up to 20 topics with up to 20 words each (the model only has 3 topics)
print (ldamodel.print_topics (num_topics = 20, num_words = 20))

**Classifying text**

In [None]:
# read the Enron spam subset
email = pd.read_csv ('../input/email-spam-dataset/enronSpamSubset.csv') 

# understanding data: list the column names
email.columns

In [None]:
# the message text is referred to as "Email" from here on
email = email.rename (columns = {"Body": "Email"})
email.head (10)

**Text preprocessing and feature engineering**

In [None]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os
from textblob import TextBlob
from nltk.stem import PorterStemmer
from textblob import Word
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm

# preprocessing steps like lower case, stop-word removal, stemming and lemmatization
email ["Email"] = email ["Email"].apply (lambda x: " ".join (x.lower () for x in x.split ()))
# a set gives constant-time membership tests in the per-token filter below
stop = set (stopwords.words ('english'))
email ["Email"] = email ["Email"].apply (lambda x: " ".join (x for x in x.split () if x not in stop))
st = PorterStemmer ()
email ["Email"] = email ["Email"].apply (lambda x: " ".join ([st.stem (word) for word in x.split ()]))
email ["Email"] = email ["Email"].apply (lambda x: " ".join ([Word (word).lemmatize () for word in x.split ()]))
email.head (10)

In [None]:
# splitting data into train and validation
train_x, valid_x, train_y, valid_y = train_test_split (email ["Email"], email ["Label"])

# encode the target labels with ONE mapping shared by both splits
# (re-fitting the encoder on the validation labels could assign different codes)
encoder = preprocessing.LabelEncoder ()
encoder.fit (email ["Label"])
train_y = encoder.transform (train_y)
valid_y = encoder.transform (valid_y)

# TFIDF feature generation for a maximum of 6000 features
# vocabulary and idf weights are learned from the training split only, so nothing leaks from validation
tfidf_vect = TfidfVectorizer (analyzer = 'word', token_pattern = r'\w{1,}', max_features = 6000)
tfidf_vect.fit (train_x)
xtrain_tfidf = tfidf_vect.transform (train_x)
xvalid_tfidf = tfidf_vect.transform (valid_x)

xtrain_tfidf.data

**Model training**

In [None]:
def train_model (classifier, feature_vector_train, label, feature_vector_valid, is_neural_net = False,
                 label_valid = None):
    """Fit a classifier and return its accuracy on the validation features.

    Parameters
    ----------
    classifier : estimator exposing fit (X, y) and predict (X).
    feature_vector_train : training feature matrix.
    label : training labels.
    feature_vector_valid : validation feature matrix.
    is_neural_net : accepted for compatibility with existing calls; not used here.
    label_valid : true validation labels. When omitted, the notebook-level
        variable ``valid_y`` is used, which is what earlier versions always did.

    Returns
    -------
    float
        Accuracy of the predictions on the validation data.
    """
    if label_valid is None:
        # backward-compatible default: fall back to the global validation labels
        label_valid = valid_y
    # fit the training dataset on the classifier
    classifier.fit (feature_vector_train, label)
    # predict the labels on validation dataset
    predictions = classifier.predict (feature_vector_valid)
    return metrics.accuracy_score (label_valid, predictions)

# Naive Bayes training
# (train_model scores the predictions against the notebook-level valid_y)
accuracy = train_model (naive_bayes.MultinomialNB (alpha = 0.2), xtrain_tfidf, train_y, xvalid_tfidf)
print ("Accuracy: ", accuracy)

In [None]:
# Linear classifier on Word level TF IDF Vectors (logistic regression with default settings)
accuracy = train_model (linear_model.LogisticRegression (), xtrain_tfidf, train_y, xvalid_tfidf)
print ("Accuracy: ", accuracy)

**Sentiment analysis**

In [None]:
# read the sample
sentiment1 = "it's unbelievable that in the 21st century we'd need something like this. again. neverump xenopho..."
sentiment2 = "thank you! super love it! zpamdelacruz wedding dolores, capas tarlac"

# import required libraries
from textblob import TextBlob

# TextBlob has a pretrained sentiment prediction model
blob = TextBlob (sentiment1)
# displays the sentiment object TextBlob computed for the first sample
blob.sentiment

In [None]:
# now let's look at sentiment2 (blob is rebound to the second sample)
blob = TextBlob (sentiment2)
blob.sentiment

**Multiclass classification**

In [None]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import sklearn.feature_extraction.text as text
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from io import StringIO
import seaborn as sns

# import data: the Comcast FCC complaints file
data = pd.read_csv ('../input/comcastcomplaints/comcast_fcc_complaints_2015.csv')
data.head (10)

In [None]:
# selecting required columns (complaint text and its status) ...
data = data [['Description', 'Status']]
# ... and only the rows whose Description is not null
data = data [pd.notnull (data ['Description'])]

data.head (10)

In [None]:
# factorizing the Status column: factorize () [0] holds one integer code per row
data ['category_id'] = data['Status'].factorize ()[0]
data.head (10)

In [None]:
# check the distribution of complaints by Status (number of non-null descriptions per value)
data.groupby ('Status').Description.count ()

In [None]:
# visualize it: bar chart of the per-Status counts, y axis starting at 0
fig = plt.figure (figsize = (8, 6))
data.groupby ('Status').Description.count ().plot.bar (ylim = 0)
plt.show ()

**Splitting the data**

In [None]:
# split texts and Status labels into train / validation parts (default split size, no fixed seed)
train_x, valid_x, train_y, valid_y = train_test_split (data ['Description'], data ['Status'])

**Feature engineering using TF-IDF**

In [None]:
# encode the Status labels with ONE mapping shared by both splits; fitting on the full
# label column guarantees that every class seen in either split has a code
encoder = preprocessing.LabelEncoder ()
encoder.fit (data ['Status'])
train_y = encoder.transform (train_y)
valid_y = encoder.transform (valid_y)

# vocabulary and idf weights are learned from the training split only, so nothing leaks from validation
tfidf_vect = TfidfVectorizer (analyzer = 'word', token_pattern = r'\w{1,}', max_features = 5000)
tfidf_vect.fit (train_x)
xtrain_tfidf = tfidf_vect.transform (train_x)
xvalid_tfidf = tfidf_vect.transform (valid_x)

**Model building and evaluation**

In [None]:
model = linear_model.LogisticRegression().fit (xtrain_tfidf, train_y)

# model summary: show the estimator that was actually fitted
print (model)

# checking accuracy
accuracy = metrics.accuracy_score (valid_y, model.predict (xvalid_tfidf))
print ("Accuracy: ", accuracy)

In [None]:
# classification report
# encoder.classes_ is ordered by the integer codes used in valid_y, so names and rows line up;
# passing labels explicitly keeps the report valid even if a class is missing from the validation split
class_ids = list (range (len (encoder.classes_)))
print (metrics.classification_report (valid_y, model.predict (xvalid_tfidf),
                                     labels = class_ids,
                                     target_names = encoder.classes_))

In [None]:
from sklearn.metrics import confusion_matrix

# confusion matrix, with rows / columns in the order of the label encoder's integer codes
class_names = list (encoder.classes_)
class_ids = list (range (len (class_names)))
conf_matrix = confusion_matrix (valid_y, model.predict (xvalid_tfidf), labels = class_ids)

# lookup tables between Status names and the codes the model was trained on
# (the factorize-based category_id column uses a different numbering and must not be used here)
category_id_df = pd.DataFrame ({'Status': class_names, 'category_id': class_ids})
category_to_id = {status: idx for idx, status in enumerate (class_names)}
id_to_category = {idx: status for idx, status in enumerate (class_names)}

# visualizing confusion matrix
fig, ax = plt.subplots (figsize = (8, 6))
sns.heatmap (conf_matrix, annot = True, fmt = 'd', cmap = "BuPu",
            xticklabels = class_names,
            yticklabels = class_names)
plt.ylabel ("Actual")
plt.xlabel ("Predicted")
plt.show ()

In [None]:
# prediction example
text = ['Comcast refuses to help troubleshoot and correct my service.']
text_features = tfidf_vect.transform (text)
predictions = model.predict (text_features)
print (text)
# decode the predicted code with the same LabelEncoder that produced the training targets
print ("  - Predicted as: '{}'".format (encoder.inverse_transform (predictions) [0]))

**Implementing sentiment analysis**

In [None]:
# import necesserary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# read the data (note: this rebinds `data`, which held the Comcast complaints so far)
data = pd.read_csv ('../input/22000-scotch-whisky-reviews/scotch_review.csv')

# Look at the top 10 rows of the data
data.head (10)

In [None]:
# understand the data types of the columns (info () also reports non-null counts)
data.info ()

In [None]:
# looking at the first 10 review descriptions
data.description.head (10)

**Text preprocessing**

In [None]:
# import libraries
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word

# lower casing and removing punctuation
data ['description'] = data ['description'].apply (lambda x: " ".join (x.lower () for x in x.split ()))
# raw string + regex = True: the pattern is a regular expression, not a literal substring
data ['description'] = data ['description'].str.replace (r"[^\w\s]", "", regex = True)
data.description.head (10)

In [None]:
# removal of stop words
# a set gives constant-time membership tests in the per-token filter below
stop = set (stopwords.words ('english'))
data ['description'] = data ['description'].apply (lambda x: " ".join (x for x in x.split () if x not in stop))
data.description.head (10)

In [None]:
# Lemmatization: replace every word of each description by its TextBlob lemma
data ['description'] = data ['description'].apply (lambda x: " ".join ([Word (word).
                                                                       lemmatize () for word in x.split ()]))
data.description.head (10)

**Exploratory data analysis**

In [None]:
# dropping null values (in place: rows with a null in any column are removed from `data`)
data.dropna (inplace = True)

# the histogram reveals this dataset is highly unbalanced toward rating 85-90
data ['review.point'].hist (bins = 5, grid = False)
plt.show ()
# per-score row counts, one count per remaining column
print (data.groupby (data['review.point']).count ())

In [None]:
# to make the data balanced, every score is sampled down to the size of the smallest score group
scores = [83, 85, 87, 90, 93]
reviews_by_score = [data [data ['review.point'] == score] for score in scores]
n_per_score = min (len (group) for group in reviews_by_score)
score_1, score_2, score_3, score_4, score_5 = [group.sample (n = n_per_score)
                                               for group in reviews_by_score]

# here I create a balanced dataset
reviews_sample = pd.concat ([score_1, score_2, score_3, score_4, score_5], axis = 0)
reviews_sample.reset_index (drop = True, inplace = True)

# printing count by 'review.point' to check dataset is now balanced
print (reviews_sample.groupby ('review.point').count ())

In [None]:
# Let's build a word cloud looking at the 'description' text
from wordcloud import WordCloud
from wordcloud import STOPWORDS

# Wordcloud functions input needs to be a single string of text
# here I'm concatenating all descriptions into a single string; the explicit separator
# keeps the last word of one review from fusing with the first word of the next
reviews_str = reviews_sample.description.str.cat (sep = ' ')
wordcloud = WordCloud (background_color = 'white').generate (reviews_str)
plt.figure (figsize = (12, 12))
plt.imshow (wordcloud, interpolation = 'bilinear')
plt.axis ('off')
plt.show ()

In [None]:
# now let's split the data into lower-rated (83 or 85 points) and higher-rated (90 or 93 points) reviews
negative_reviews = reviews_sample [reviews_sample ['review.point'].isin ([83, 85])]
positive_reviews = reviews_sample [reviews_sample ['review.point'].isin ([90, 93])]

# transform to single string; the explicit separator keeps words of adjacent reviews apart
negative_reviews_str = negative_reviews.description.str.cat (sep = ' ')
positive_reviews_str = positive_reviews.description.str.cat (sep = ' ')

# create wordclouds
wordcloud_negative = WordCloud (background_color = 'white').generate (negative_reviews_str)
wordcloud_positive = WordCloud (background_color = 'white').generate (positive_reviews_str)

# Plot the lower-rated cloud (211 = first cell of a 2 x 1 grid)
fig = plt.figure (figsize = (12, 12))
ax1 = fig.add_subplot (211)
ax1.imshow (wordcloud_negative, interpolation = 'bilinear')
ax1.axis ('off')
ax1.set_title ('Reviews with negative scores', fontsize = 20)

In [None]:
# plot the higher-rated cloud on a figure of its own
# NOTE(review): 212 is the second cell of a 2 x 1 grid, so the first cell of this new figure stays empty
fig = plt.figure (figsize = (12, 12))
ax2 = fig.add_subplot (212)
ax2.imshow (wordcloud_positive, interpolation = 'bilinear')
ax2.axis ('off')
ax2.set_title ("Reviews with positive scores", fontsize = 20)

**Summarizing text data**

In [None]:
# import BeautifulSoup and urllib libraries to fetch data from Wikipedia
from bs4 import BeautifulSoup
from urllib.request import urlopen 

# function to get data from Wikipedia
def get_only_text (url):
    """Download a web page and return its title and paragraph text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    (title, text) : tuple of str
        title is the text of the page's <title> element (empty if there is none);
        text is the content of all <p> elements joined by single spaces.
        The text is also printed as a side effect.
    """
    # the context manager closes the HTTP response once the page has been parsed
    with urlopen (url) as page:
        # name the parser explicitly so the result does not depend on which parsers are installed
        soup = BeautifulSoup (page, 'html.parser')
    text = ' '.join (map (lambda p: p.text, soup.find_all ('p')))
    print (text)
    title = soup.title.text if soup.title is not None else ''
    return title, text

# Wikipedia page to summarise
url = "https://en.wikipedia.org/wiki/Natural_language_processing"

# call the function created above; it returns a (page title, article text) pair
title, text = get_only_text (url)

# count the number of characters of the article text
len (text)


In [None]:
# let's see first 500 letters from the text
# get_only_text returns (title, text): if `text` still holds that pair, pick the article text out of it
article_text = text [1] if isinstance (text, tuple) else text
article_text [:500]

**Clustering documents**

In [None]:
# import libraries
import numpy as np
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
from sklearn.metrics.pairwise import cosine_similarity
import os
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS

# Let's use the comcastcomplaints dataset again (this time its consumeraffairs complaints file)
data = pd.read_csv ('../input/comcastcomplaints/comcast_consumeraffairs_complaints.csv')

# selecting required columns and rows: only the 'text' column, without null entries
data = data [['text']]
data = data [pd.notnull (data ['text'])]

# let's do the clustering for just 300 rows. It's easier to interpret
# (random sample without a fixed seed, so the rows differ between runs)
data_sample = data.sample (300)

**Preprocessing and TF-IDF feature engineering**

In [None]:
# remove unwanted symbol
data_sample ['text'] = data_sample ['text'].str.replace ('XXXX', "")

# convert dataframe to list
complaints = data_sample ['text'].tolist ()

# create the rank of documents (1 .. number of complaints) - I will use it later
ranks = list (range (1, len (complaints) + 1))

# stop words
# stored under a name of its own so the imported nltk `stopwords` corpus reader is not overwritten
english_stopwords = nltk.corpus.stopwords.words ('english')

# load 'stemmer'
stemmer = SnowballStemmer ('english')

# functions for sentence tokenizer, to remove tokens and raw # pucntuation
def tokenize_and_stem (text):
    """Return the stems of all tokens of ``text`` that contain at least one ASCII letter.

    The text is split into sentences and then into word tokens with nltk;
    stemming uses the module-level ``stemmer``.
    """
    stems = []
    for sentence in nltk.sent_tokenize (text):
        for word in nltk.word_tokenize (sentence):
            # skip tokens made only of digits / punctuation
            if re.search ('[a-zA-Z]', word):
                stems.append (stemmer.stem (word))
    return stems

def tokenize_only (text):
    """Return the lower-cased tokens of ``text`` that contain at least one ASCII letter.

    Same tokenisation as tokenize_and_stem, but without stemming.
    """
    lowered = (word.lower ()
               for sentence in nltk.sent_tokenize (text)
               for word in nltk.word_tokenize (sentence))
    # skip tokens made only of digits / punctuation
    return [token for token in lowered if re.search ('[a-zA-Z]', token)]

from sklearn.feature_extraction.text import TfidfVectorizer
# terms occurring in more than 80 % or fewer than 20 % of the complaints are ignored;
# uni-, bi- and tri-grams of the stemmed tokens become features
tfidf_vectorizer = TfidfVectorizer (max_df = 0.8, max_features = 200000, min_df = 0.2, stop_words = 'english',
                                   use_idf = True, tokenizer = tokenize_and_stem, ngram_range = (1, 3))
# fit the vectorizer to data
tfidf_matrix = tfidf_vectorizer.fit_transform (complaints)
# newer scikit-learn releases only provide get_feature_names_out; fall back for older ones
if hasattr (tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out ().tolist ()
else:
    terms = tfidf_vectorizer.get_feature_names ()
print (tfidf_matrix.shape)

**Clustering using K-means**

In [None]:
# import Kmeans
from sklearn.cluster import KMeans

# define numbers of clusters
num_clusters = 6

# running clustering algorithm
# NOTE(review): no random_state is passed, so cluster numbering may differ between runs - confirm
km = KMeans (n_clusters = num_clusters)
km.fit (tfidf_matrix)

# final clusters: one cluster id per complaint
clusters = km.labels_.tolist ()
complaints_data = {'rank': ranks, 'complaints': complaints, 'cluster': clusters}
# only the 'rank' and 'cluster' columns are kept; the cluster ids also serve as the index
frame = pd.DataFrame (complaints_data, index = [clusters], columns = ['rank', 'cluster'])

# number of docs per cluster
frame ['cluster'].value_counts ()

**Identify cluster behavior**

In [None]:
# build a lookup from every stem (index) to an original, unstemmed word (column 'words')
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in complaints:
    allwords_stemmed = tokenize_and_stem (i)
    totalvocab_stemmed.extend (allwords_stemmed)
    allwords_tokenized = tokenize_only (i)
    totalvocab_tokenized.extend (allwords_tokenized)
vocab_frame = pd.DataFrame ({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

# for every cluster, order the feature indices by decreasing weight in the cluster centre
order_centroids = km.cluster_centers_.argsort ()[:, ::-1]
for i in range (num_clusters):
    print ("Cluster %d words:" % i, end = "")
    # the 6 heaviest terms of this cluster, shown as an original word looked up via its stem
    for ind in order_centroids [i, :6]:
        print (' %s' % vocab_frame.loc [terms [ind].split (' ')].
              values.tolist ()[0][0].encode ('utf-8', 'ignore'), end = ',')
    # end the line once all words of this cluster have been printed
    print ()

**Plot the clusters on a 2D graph**

In [None]:
# similarity
similarity_distance = 1 - cosine_similarity (tfidf_matrix)

# convert two components as I'm plotting points in a two-dimensional plane
mds = MDS (n_components = 2, dissimilarity = 'precomputed', random_state = 1)
pos = mds.fit_transform (similarity_distance)
xs, ys = pos [:, 0], pos [:, 1]

# set up colors per clusters using a dict
clusters_colors = {0: '#1b9e77', 1: '#d95f020', 2: '#7570b3', 3: '#e7298a', 4: '#66a61e', 5: '#S2691E'}

# set up cluster names using a dict
cluster_names = {0: 'boxes, installation, cable',
                 1: 'did, said, month',
                 2: 'phone, account, year',
                 3: 'customer, cable, time',
                 4: 'tech, days, problem',
                 5: 'internet, month, only'}

# finally plot it
%matplotlib inline

# create data frame that has the result of the MDS and the cluster
df = pd.DataFrame (dict (x = xs, y = ys, label = clusters))
groups = df.groupby ('label')

# set up plot
fig, ax = plt.subplots(figsize = (17, 9))
for name, group in groups:
    ax.plot (group.x, group.y, marker = 'o', linestyle = "", ms = 20,
            label = cluster_names, mec = 'none')
ax.set_aspect ('auto')
ax.tick_params (axis = 'x', which = 'both', bottom = 'off', top = 'off', labelbottom = 'off')
ax.tick_params (axis = 'y', which = 'both', left = 'off', top = 'off', labelleft = 'off')
ax.legend (numpoints = 1)
plt.show ()

# **Deep learning in Natural language processing**

**Classifying text**

In [None]:
# read the dataset
import pandas as pd
data_s = pd.read_csv ('../input/email-spam-dataset/lingSpam.csv')

# preprocessing the data
from nltk.corpus import stopwords
# NOTE(review): wildcard import kept because later cells may rely on names it
# brings in; prefer explicit imports once those usages are known
from nltk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# remove stop words
stop = stopwords.words ('english')
# set for constant-time membership tests; `stop` itself stays a list
stop_lookup = set (stop)
# compare the lowercased token, otherwise capitalised stop words ("The", "And")
# slip through because the text has not been lowercased yet at this point
data_s ['Body'] = data_s ['Body'].apply (
    lambda body: " ".join (word for word in body.split () if word.lower () not in stop_lookup))

# rename column names
data_s = data_s.rename (columns = {"Body": "Email", "Label": "Target"})
data_s.head (10)

In [None]:
# `re` is needed here, before the notebook's later import cell runs
import re

# lowercase the text and delete the listed punctuation characters
data_s ['Email'] = data_s ['Email'].apply (lambda x: re.sub ('[!@#$:).,?&]', "", x.lower ()))
# collapse every run of two or more spaces into a single space
data_s ['Email'] = data_s ['Email'].apply (lambda x: re.sub (' {2,}', ' ', x))
data_s ['Email'].head (10)

In [None]:
# import required libraries
import sys, os, re, csv, codecs
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN
from keras.models import Model
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Sequential

# separating text and target classes
# (.values yields the array of e-mail texts; missing bodies become "_na_")
list_sentences_rawdata = data_s ['Email'].fillna ("_na_").values
list_classes = ["Target"]
target = data_s [list_classes].values
To_Process = data_s [["Email", "Target"]]

# train and test split with 80:20 ratio; the fixed seed makes the split
# (and therefore every metric reported below) reproducible across runs
train, test = train_test_split (To_Process, test_size = 0.2, random_state = 42)

# every sequence is padded / truncated to this many tokens
MAX_SEQUENCE_LENGTH = 400

# vocabulary cap: only the most frequent words (up to 30000) are kept
MAX_NB_WORDS = 30000

# build the vocabulary on the training texts only, then encode both splits
tokenizer = Tokenizer (num_words = MAX_NB_WORDS)
tokenizer.fit_on_texts (train.Email)
train_sequences = tokenizer.texts_to_sequences (train.Email)
test_sequences = tokenizer.texts_to_sequences (test.Email)

# dictionary containing words and their index
word_index = tokenizer.word_index

print ("Found %s unique tokens." %len (word_index))

# pad / truncate the training sequences to MAX_SEQUENCE_LENGTH
train_data = pad_sequences (train_sequences, maxlen = MAX_SEQUENCE_LENGTH)

# pad / truncate the test sequences to MAX_SEQUENCE_LENGTH
test_data = pad_sequences (test_sequences, maxlen = MAX_SEQUENCE_LENGTH)

print (train_data.shape)
print (test_data.shape)

In [None]:
# import library
from sklearn.preprocessing import LabelEncoder

# learn the label -> integer mapping on the training targets and encode them,
# then reuse the same mapping for the test targets
le = LabelEncoder ()
train_labels = le.fit_transform (train ['Target'])
test_labels = le.transform (test ['Target'])

# show the learnt classes and the class balance of both splits
print (le.classes_)
print (np.unique (train_labels, return_counts = True))
print (np.unique (test_labels, return_counts = True))

In [None]:
# one-hot encode the integer class ids of both splits
labels_train, labels_test = (
    to_categorical (np.asarray (class_ids)) for class_ids in (train_labels, test_labels)
)

# sanity-check the tensor shapes that go into the models
print ("Shape of data tensor:", train_data.shape)
print ("Shape of label tensor:", labels_train.shape)
print ("Shape of label tensor:", labels_test.shape)

In [None]:
# size of each word vector produced by the Embedding layers of the models below
EMBEDDING_DIM = 200
# echo the padded sequence length the models will be built with
print (MAX_SEQUENCE_LENGTH)

**Model building and predicting**

In [None]:
print ('Training CNN 1D model.')

# embedding -> two (Conv1D, MaxPooling1D, Dropout, BatchNormalization) stages
# -> flatten -> dense classifier with a 2-way softmax output
cnn_layers = [
    Embedding (MAX_NB_WORDS, EMBEDDING_DIM, input_length = MAX_SEQUENCE_LENGTH),
    Dropout (0.5),
    Conv1D (128, 5, activation = 'relu'),
    MaxPooling1D (5),
    Dropout (0.5),
    BatchNormalization (),
    Conv1D (128, 5, activation = 'relu'),
    MaxPooling1D (5),
    Dropout (0.5),
    BatchNormalization (),
    Flatten (),
    Dense (128, activation = 'relu'),
    Dense (2, activation = 'softmax'),
]
model = Sequential (cnn_layers)

model.compile (
    loss = 'categorical_crossentropy',
    optimizer = 'rmsprop',
    metrics = ['acc'],
)
# the held-out test split doubles as validation data during training
model.fit (
    train_data,
    labels_train,
    batch_size = 64,
    epochs = 5,
    validation_data = (test_data, labels_test),
)

In [None]:
# run the trained CNN on the padded test sequences; the model ends in a
# 2-unit softmax layer, so each row holds one score per class
predicted = model.predict (test_data)
# display the raw prediction array as the cell output
predicted

In [None]:
# model evaluation
import sklearn
from sklearn.metrics import precision_recall_fscore_support as score

# round the softmax scores to 0/1 decisions once and reuse them below
cnn_decisions = predicted.round ()
precision, recall, fscore, support = score (labels_test, cnn_decisions, zero_division = 1)

# per-class metrics, one line each
print ("Precision: {}".format (precision),
       "Recall: {}".format (recall),
       "Fscore: {}".format (fscore),
       "Support: {}".format (support),
       sep = "\n")
print ("----------------------------")
print (sklearn.metrics.classification_report (labels_test, cnn_decisions, zero_division = 1))

In [None]:
# define RNN model
# import from the public keras.layers namespace; the internal module path
# keras.layers.recurrent is not available in newer Keras releases
from keras.layers import SimpleRNN

# model training
print ('Training SIMPLERNN model.')
model = Sequential ()
model.add (Embedding (MAX_NB_WORDS, EMBEDDING_DIM, input_length = MAX_SEQUENCE_LENGTH))
# no input_shape here: the layer is not the first one, so its input shape is
# taken from the Embedding output
model.add (SimpleRNN (2))
model.add (Dense (2, activation = 'softmax'))
model.compile (loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
# the held-out test split doubles as validation data during training
model.fit (train_data, labels_train, batch_size = 16, epochs = 5, validation_data = (test_data, labels_test))

In [None]:
# run the trained SimpleRNN model on the padded test sequences; the model ends
# in a 2-unit softmax layer, so each row holds one score per class
predicted_Srnn = model.predict (test_data)
# display the raw prediction array as the cell output
predicted_Srnn

In [None]:
# model evaluation
from sklearn.metrics import precision_recall_fscore_support as score
# imported explicitly so this cell does not depend on `import sklearn`
# having been executed in an earlier cell
from sklearn.metrics import classification_report

# per-class precision / recall / F-score / support on the rounded softmax scores
precision, recall, fscore, support = score (labels_test, predicted_Srnn.round ())
print ('precision: {}'.format (precision))
print ('recall: {}'.format (recall))
# label spelled 'fscore', matching the other evaluation cells
print ('fscore: {}'.format (fscore))
print ('support: {}'.format (support))

print ('----------------------------')

print (classification_report (labels_test, predicted_Srnn.round ()))

In [None]:
# model training LSTM (Long short-term memory)
print ('Training LSTM model.')

# embedding -> LSTM returning the full sequence -> dropout / batch norm
# -> flatten -> 2-way softmax output
lstm_layers = [
    Embedding (MAX_NB_WORDS, EMBEDDING_DIM, input_length = MAX_SEQUENCE_LENGTH),
    LSTM (128, activation = 'relu', return_sequences = True),
    Dropout (0.2),
    BatchNormalization (),
    Flatten (),
    Dense (2, activation = 'softmax'),
]
model = Sequential (lstm_layers)

model.compile (
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)
# the held-out test split doubles as validation data during training
model.fit (
    train_data,
    labels_train,
    batch_size = 16,
    epochs = 5,
    validation_data = (test_data, labels_test),
)

In [None]:
# prediction on test data: run the trained LSTM model on the padded test
# sequences; the model ends in a 2-unit softmax layer (one score per class)
predicted_lstm = model.predict (test_data)
# display the raw prediction array as the cell output
predicted_lstm

In [None]:
# model evaluation
from sklearn.metrics import precision_recall_fscore_support as score

# round the softmax scores to 0/1 decisions once and reuse them below
lstm_decisions = predicted_lstm.round ()
precision, recall, fscore, support = score (labels_test, lstm_decisions)

# per-class metrics, one line each
print ('precision: {}'.format (precision),
       'recall: {}'.format (recall),
       'fscore: {}'.format (fscore),
       'support: {}'.format (support),
       sep = '\n')

print ('----------------------------')

print (sklearn.metrics.classification_report (labels_test, lstm_decisions))

In [None]:
# bidirectional LSTM
# model training
print ('Training Bidirectional LSTM model.')

# embedding -> bidirectional LSTM returning the full sequence -> Conv1D
# -> global max pooling -> dense classifier with a 2-way softmax output
blstm_layers = [
    Embedding (MAX_NB_WORDS, EMBEDDING_DIM, input_length = MAX_SEQUENCE_LENGTH),
    Bidirectional (LSTM (16, return_sequences = True, dropout = 0.1, recurrent_dropout = 0.1)),
    Conv1D (16, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform"),
    GlobalMaxPool1D (),
    Dense (50, activation = "relu"),
    Dropout (0.1),
    Dense (2, activation = "softmax"),
]
model = Sequential (blstm_layers)

model.compile (
    loss = 'binary_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)
# the held-out test split doubles as validation data during training
model.fit (
    train_data,
    labels_train,
    batch_size = 16,
    epochs = 3,
    validation_data = (test_data, labels_test),
)

In [None]:
# run the trained bidirectional LSTM model on the padded test sequences; the
# model ends in a 2-unit softmax layer, so each row holds one score per class
predicted_blstm = model.predict (test_data)
# display the raw prediction array as the cell output
predicted_blstm

In [None]:
# model evaluation
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report

# round the softmax scores to 0/1 decisions once and reuse them below
blstm_decisions = predicted_blstm.round ()
precision, recall, fscore, support = score (labels_test, blstm_decisions)

# per-class metrics, one line each
print ('precision: {}'.format (precision),
       'recall: {}'.format (recall),
       'fscore: {}'.format (fscore),
       'support: {}'.format (support),
       sep = '\n')

print ('----------------------------')

print (classification_report (labels_test, blstm_decisions))

**Next word prediction**

In [5]:
# import required libraries
import numpy as np
import random
import pandas as pd
import sys
import os
import time
import codecs
import collections
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re

# read completeSpamAssassin.csv from the email-spam-dataset input directory
content = pd.read_csv ('../input/email-spam-dataset/completeSpamAssassin.csv')

# keep only the 'Body' column (as a one-column DataFrame) and convert it to a
# list of rows, so every element of list_data is itself a one-item list
email = content [['Body']]
list_data = email.values.tolist ()

# NOTE(review): this rebinds `tokenizer`, which the classification cells above
# used for the Keras Tokenizer - re-running those cells afterwards needs care
tokenizer = ToktokTokenizer ()