# NLP Modeling 

1. Get your raw text into a pandas dataframe
2. Tokenize the text - split a phrase, sentence, paragraph, or entire text document into smaller units, such as individual words or terms. Each of these smaller units is called a token. 
3. Clean the text - this includes removing stopwords and punctuation, and stemming or lemmatizing the remaining words 
4. Vectorize the text - convert the text to numeric form 
5. Fit/train an ML and/or deep learning model 


In [None]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', 100)

from matplotlib import pyplot as plt
import numpy as np
%matplotlib inline
plt.style.use('fivethirtyeight')

import re #regular expressions for pattern searching 
import string
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split


# Load the superheroes dataset; the path is relative to the working directory.
df = pd.read_csv("data/superheroes.csv")
df.head()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in print() would add a stray "None" line to the output.
df.info()

In [None]:
# Keep only the columns used in this notebook. .copy() makes the result an
# independent frame, so later in-place edits and column assignments do not
# trigger SettingWithCopyWarning.
df = df[['name', 'history_text', 'creator', 'alignment']].copy()
df.head()

In [None]:
# Discard every row that has a missing value in any column, then summarise.
df = df.dropna()
df.info()

In [None]:
# Class balance of the target column.
df['alignment'].value_counts()

## Create a function to clean our text 

1. Remove punctuation
2. Tokenization
3. Remove stopwords
4. Lemmatize/Stem

In [None]:
# Shared resources for text cleaning.
# A set gives O(1) membership tests; the list returned by NLTK would be
# scanned linearly once per token.
stopwords = set(nltk.corpus.stopwords.words('english'))
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()
# Display the characters that count as punctuation.
string.punctuation

In [None]:
def clean_history(history):
    """Strip punctuation from a text, tokenize it and lemmatize the tokens.

    Parameters
    ----------
    history : str
        Text to clean. Stopword filtering is an exact membership test
        against ``stopwords``, so pass lower-cased text if case should not
        matter.

    Returns
    -------
    list of str
        Lemmatized tokens with punctuation characters, stopwords and empty
        strings removed.
    """
    # Iterating a str yields characters, so this drops punctuation chars.
    no_punct = "".join(char for char in history if char not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' at the edges when the text starts or ends with a
    # non-word character; skip those so '' never becomes a feature.
    return [wn.lemmatize(token) for token in tokens
            if token and token not in stopwords]

# Lower-case each history, clean it, and store the resulting token lists.
df['history_clean'] = [clean_history(text.lower()) for text in df['history_text']]



In [None]:
# Preview the frame, now including the history_clean token lists.
df.head()

In [None]:
# Rows whose alignment is exactly 'Good'.
df_good = df[df['alignment'].eq('Good')]
df_good.head()

In [None]:
# Rows whose alignment is exactly 'Bad'.
df_bad = df[df['alignment'].eq('Bad')]
df_bad.head()

In [None]:
# Rows whose alignment is exactly 'Neutral'.
df_neutral = df[df['alignment'].eq('Neutral')]
df_neutral.head()

In [None]:
# Word frequencies across all 'Good' histories (most to least frequent).
# Flatten the per-row token lists into one list of words.
good_list = [token for tokens in df_good['history_clean'] for token in tokens]

# value_counts() sorts by descending frequency.
good_val_counts = pd.Series(good_list).value_counts()
good_val_counts

In [None]:
# Word frequencies across all 'Bad' histories (most to least frequent).
# Flatten the per-row token lists into one list of words.
bad_list = [token for tokens in df_bad['history_clean'] for token in tokens]

# value_counts() sorts by descending frequency.
bad_val_counts = pd.Series(bad_list).value_counts()
bad_val_counts

In [None]:
# Word frequencies across all 'Neutral' histories (most to least frequent).
# Flatten the per-row token lists into one list of words.
neutral_list = [token for tokens in df_neutral['history_clean'] for token in tokens]

# value_counts() sorts by descending frequency.
neutral_val_counts = pd.Series(neutral_list).value_counts()
neutral_val_counts

In [None]:
from textblob import TextBlob, Word
from wordcloud import WordCloud



In [None]:
# Create the figure first so figsize applies to the word cloud; calling
# plt.figure() after imshow() opens a second, empty figure instead.
plt.figure(figsize=(20, 10))
# Build the cloud from the real word counts. str(Series) is only the
# truncated display text (labels, counts, dtype footer), not the words.
wordcloud = WordCloud(max_words=100, width=400, height=200)
wordcloud = wordcloud.generate_from_frequencies(good_val_counts.to_dict())
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Create the figure first so figsize applies to the word cloud; calling
# plt.figure() after imshow() opens a second, empty figure instead.
plt.figure(figsize=(20, 10))
# Build the cloud from the real word counts. str(Series) is only the
# truncated display text (labels, counts, dtype footer), not the words.
wordcloud = WordCloud(max_words=100, width=400, height=200)
wordcloud = wordcloud.generate_from_frequencies(bad_val_counts.to_dict())
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Create the figure first so figsize applies to the word cloud; calling
# plt.figure() after imshow() opens a second, empty figure instead.
plt.figure(figsize=(20, 10))
# Build the cloud from the real word counts. str(Series) is only the
# truncated display text (labels, counts, dtype footer), not the words.
wordcloud = WordCloud(max_words=100, width=400, height=200)
wordcloud = wordcloud.generate_from_frequencies(neutral_val_counts.to_dict())
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Tag the actual cleaned tokens. str(Series) is the truncated display text
# (row labels, "...", dtype footer), so tagging it would mostly count noise.
good_text = " ".join(token for tokens in df_good['history_clean'] for token in tokens)
blob = TextBlob(good_text)
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
# Keep the 20 most frequent part-of-speech tags.
pos_df = pos_df['pos'].value_counts().head(20)
pos_df.plot(kind='bar', title="Parts of Speech in Good Histories")

In [None]:
# Tag the actual cleaned tokens. str(Series) is the truncated display text
# (row labels, "...", dtype footer), so tagging it would mostly count noise.
bad_text = " ".join(token for tokens in df_bad['history_clean'] for token in tokens)
blob = TextBlob(bad_text)
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
# Keep the 20 most frequent part-of-speech tags.
pos_df = pos_df['pos'].value_counts().head(20)
pos_df.plot(kind='bar', title="Parts of Speech in Bad Histories")

In [None]:
# Tag the actual cleaned tokens. str(Series) is the truncated display text
# (row labels, "...", dtype footer), so tagging it would mostly count noise.
neutral_text = " ".join(token for tokens in df_neutral['history_clean'] for token in tokens)
blob = TextBlob(neutral_text)
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
# Keep the 20 most frequent part-of-speech tags.
pos_df = pos_df['pos'].value_counts().head(20)
pos_df.plot(kind='bar', title="Parts of Speech in Neutral Histories")

In [None]:
# 20 most frequent adjacent word pairs in the 'Good' histories.
pd.Series(nltk.ngrams(good_list, 2)).value_counts().head(20)

In [None]:
# 20 most frequent adjacent word pairs in the 'Bad' histories.
pd.Series(nltk.ngrams(bad_list, 2)).value_counts().head(20)

In [None]:
# 20 most frequent adjacent word pairs in the 'Neutral' histories.
pd.Series(nltk.ngrams(neutral_list, 2)).value_counts().head(20)

## Vectorizing History Text: TF-IDF

In [None]:
# A callable analyzer replaces the vectorizer's own preprocessing and
# tokenizing, so the text must be lower-cased here (as is done for
# history_clean); otherwise capitalised stopwords are not filtered.
# NOTE: scikit-learn ignores ngram_range when analyzer is a callable, so the
# features are the single tokens returned by clean_history.
tfidf_vect = TfidfVectorizer(analyzer=lambda text: clean_history(text.lower()),
                             ngram_range=(2, 2))
X_tfidf = tfidf_vect.fit_transform(df['history_text'])
print(X_tfidf.shape)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
_feature_names = getattr(tfidf_vect, 'get_feature_names_out', None) or tfidf_vect.get_feature_names
print(list(_feature_names()))

#### Vectorizers output sparse matrices

_**Sparse Matrix**: A matrix in which most entries are 0. In the interest of efficient storage, a sparse matrix will be stored by only storing the locations of the non-zero elements._

In [None]:
# Densify the sparse TF-IDF matrix into a DataFrame with one column per term.
X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back on older releases.
_feature_names = getattr(tfidf_vect, 'get_feature_names_out', None) or tfidf_vect.get_feature_names
X_tfidf_df.columns = list(_feature_names())
X_tfidf_df.head()

In [None]:
# Rank terms by their TF-IDF weight summed over all documents.
sums = X_tfidf.sum(axis=0)  # one total per term column
data1 = [(term, sums[0, col]) for col, term in enumerate(X_tfidf_df.columns)]
ranking = pd.DataFrame(data1, columns=['term', 'rank'])
words = ranking.sort_values('rank', ascending=False)
print("\n\nWords : \n", words.head(10))

## Feature Engineering 

In [None]:
# Length of each history in characters, not counting space characters.
df['history_len'] = df['history_text'].apply(lambda text: len(text.replace(" ", "")))
df.info()


In [None]:
# Overlay the history-length distribution of each alignment class.
bins = np.linspace(0, 5000, 100)
for label in ('Good', 'Bad', 'Neutral'):
    lengths = df.loc[df['alignment'] == label, 'history_len']
    plt.hist(lengths, bins, alpha=0.5, label=label)
plt.legend(loc='upper right')
plt.show()

In [None]:
def count_punct(text):
    """Return the percentage of non-space characters that are punctuation.

    Parameters
    ----------
    text : str
        Text to measure.

    Returns
    -------
    float
        ``round(punctuation / non_space_chars, 3) * 100``, or ``0.0`` when
        the text is empty or consists only of spaces.
    """
    non_space_chars = len(text) - text.count(" ")
    # Empty or all-space text would otherwise raise ZeroDivisionError.
    if non_space_chars == 0:
        return 0.0
    punct_count = sum(1 for char in text if char in string.punctuation)
    return round(punct_count / non_space_chars, 3) * 100

# Punctuation percentage of every history.
df['percent_punct'] = df['history_text'].apply(count_punct)
df.head()

In [None]:
# Overlay the punctuation-percentage distribution of each alignment class.
bins = np.linspace(0, 10, 60)
for label in ('Good', 'Bad', 'Neutral'):
    percentages = df.loc[df['alignment'] == label, 'percent_punct']
    plt.hist(percentages, bins, alpha=0.5, label=label)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Distribution of history length over the whole dataset.
bins = np.linspace(0, 4000, 50)

plt.hist(df['history_len'], bins=bins)
plt.title("History Length Distribution")
plt.show()

In [None]:
# Distribution of punctuation percentage over the whole dataset.
bins = np.linspace(0, 10, 50)

plt.hist(df['percent_punct'], bins)
# This plot shows percent_punct, so the title must not say "History Length".
plt.title("Punctuation Percentage Distribution")
plt.show()

In [None]:
# Compare history_len under the i-th root transform for i = 1..5.
for i in range(1, 6):
    transformed = df['history_len'] ** (1 / i)
    plt.hist(transformed, bins=50)
    plt.title(f"Transformation: 1/{i}")
    plt.show()

In [None]:
# Apply the fifth-root transform with an explicit exponent instead of the
# variable `i` left over from the plotting loop, whose value depends on which
# cells ran last.
# NOTE: this cell overwrites history_len, so re-running it transforms again.
df['history_len'] = round(df['history_len'] ** (1 / 5), 2)

In [None]:
# .copy() detaches this frame from df, so assigning to its columns later
# does not raise SettingWithCopyWarning.
new_features_df = df[['history_len', 'percent_punct', 'alignment']].copy()

In [None]:
# Integer class id for each alignment label.
alignment_dict = {'Good': 0, 'Bad': 1, 'Neutral': 2}

In [None]:
# Replace each alignment label with its integer id (KeyError on unknown labels).
new_features_df['alignment'] = new_features_df['alignment'].map(lambda item: alignment_dict[item])

In [None]:
# Give both frames a fresh 0..n-1 index so they line up row by row.
X_tfidf_df = X_tfidf_df.reset_index(drop=True)
new_features_df = new_features_df.reset_index(drop=True)

In [None]:
# Join the engineered features (and, for now, the label) with the TF-IDF
# columns side by side; rows are matched on index.
X = pd.concat([new_features_df, X_tfidf_df], axis=1)
X.head()

In [None]:
# Summary of the combined feature frame.
X.info()

In [None]:
# Remove the target so it is not used as a feature.
# NOTE(review): this drops every column labelled 'alignment', including a
# TF-IDF term with that name if one exists.
X = X.drop(columns=['alignment'])

In [None]:
# Hold out 30% for testing. random_state makes the split (and the scores
# below) reproducible; stratify keeps the class proportions equal in both
# parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    new_features_df['alignment'],
    test_size=0.3,
    stratify=new_features_df['alignment'],
    random_state=42,
)

In [None]:
# Random forest with class-balanced weights, trained on the training split.
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=20,
    class_weight='balanced',
    n_jobs=-1,
)
rf_model = rf.fit(X_train, y_train)

In [None]:
# Mean accuracy on the training split.
rf_model.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split.
rf_model.score(X_test, y_test)

## RNNs 

#### TF-IDF

![](https://image.slidesharecdn.com/9bc43139-1398-4c31-a9cf-ed08dd37ef13-150521205535-lva1-app6891/95/text-mining-association-rules-and-decision-tree-learning-26-638.jpg?cb=1432241853)

#### Word-to-Vec
- The skip gram 
![](https://media.geeksforgeeks.org/wp-content/uploads/word2vec_diagram-1.jpg)

- Cosine Similarity
![](https://cdn.analyticsvidhya.com/wp-content/uploads/2019/07/img_8.png)
![](https://www.mathsisfun.com/algebra/images/cosine-graph.svg)

#### Recurrent Neural Networks 
- A recurrent neural network (RNN) is a type of artificial neural network commonly used in speech recognition and natural language processing (NLP). RNNs are designed to recognize the sequential characteristics of data and use patterns to predict the next likely scenario. Unlike feed-forward neural networks (think CNNs), RNNs can use their internal memory to process arbitrary sequences of inputs.
[Tutorials Point - CNNs vs. RNNs](https://www.tutorialspoint.com/tensorflow/tensorflow_cnn_and_rnn_difference.htm)
![](https://www.nexmo.com/wp-content/uploads/2020/10/Recurrent-neural-network.png)

In [None]:
import gensim 
import gensim.downloader as api 

# Load the 'glove-wiki-gigaword-100' vectors through the gensim downloader.
wiki_embeddings = api.load('glove-wiki-gigaword-100')

In [None]:
# Look up the embedding vector stored for the word 'queen'.
wiki_embeddings['queen']

In [None]:
# Words whose vectors are closest to 'dinosaur'.
wiki_embeddings.most_similar('dinosaur')

In [None]:
# Word2Vec expects an iterable of tokenised sentences (lists of str).
# X_train is the numeric feature DataFrame; iterating it yields column
# labels, so the model would be trained on the characters of column names.
# Train on the cleaned token lists instead.
# NOTE(review): this uses every row; restrict it to training rows if these
# embeddings feed a model that is scored on a held-out split.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4 renamed it `vector_size`.
w2v_model = gensim.models.Word2Vec(df['history_clean'].tolist(),
                                   size=100,
                                   window=5,
                                   min_count=2)

In [None]:
# wv holds the trained word vectors and is indexed by word; it is not
# callable, so wv('queen') raises TypeError.
w2v_model.wv['queen']

In [None]:
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
# Integer-encode the class labels in one step...
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(new_features_df['alignment'])
# ...then expand the integers into one-hot rows for the softmax output.
dummy_y = np_utils.to_categorical(encoded_Y)

In [None]:
# Hold out 30% of the raw texts for validation. random_state makes the split
# reproducible; stratifying on the integer labels keeps the class
# proportions equal in both parts.
rnn_X_train, rnn_X_test, rnn_y_train, rnn_y_test = train_test_split(df['history_text'],
                                                                    dummy_y,
                                                                    test_size=0.3,
                                                                    stratify=encoded_Y,
                                                                    random_state=42)

In [None]:
from keras.preprocessing.text import Tokenizer #clean and tokenize the data 
from keras.preprocessing.sequence import pad_sequences

The Keras `Tokenizer` cleans and tokenizes our dataset. It also builds a vocabulary of all the words in our training set and assigns each word an integer index. 

In [None]:
# Initialize the tokenizer and build its vocabulary from the training texts
# only, so the test texts do not influence the word index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(rnn_X_train)

In [None]:
# Convert both splits from text into lists of word indices.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (rnn_X_train, rnn_X_test)
)

In [None]:
# What do these sequences look like? Each integer is the tokenizer's index
# for one word; this is the sequence for the first training history.
X_train_seq[0]

In [None]:
# Pad or truncate every sequence to the same length of 50 entries.

X_train_seq_padded = pad_sequences(X_train_seq, maxlen=50)
X_test_seq_padded = pad_sequences(X_test_seq, maxlen=50)

In [None]:
# What do these padded sequences look like? This shows the first one.
X_train_seq_padded[0]

In [None]:
# Import the tools needed from keras and define functions to calculate recall and precision
import keras.backend as K
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential

def recall_m(y_true, y_pred):
    """Recall metric: rounded true positives over rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before counting;
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric: rounded true positives over rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before counting;
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

In [None]:
# Construct a simple RNN model: embedding -> LSTM -> dense -> 3-way softmax.
# One embedding row per known word, plus one extra row.
vocab_size = len(tokenizer.index_word) + 1

model = Sequential([
    Embedding(vocab_size, 32),
    LSTM(32, dropout=.2, recurrent_dropout=0),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax'),
])
model.summary()

In [None]:
# Compile the model: categorical cross-entropy matches the one-hot labels
# and softmax output; precision_m and recall_m are the custom metrics.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', precision_m, recall_m])

In [None]:
# Fit the RNN model on the padded training sequences; the padded test
# sequences are used as validation data after every epoch.
history = model.fit(X_train_seq_padded, rnn_y_train, 
                    batch_size=32, epochs=10,
                    validation_data=(X_test_seq_padded, rnn_y_test))