# NLP Modeling 

1. Get your raw text into a pandas dataframe
2. Tokenize the text - splitting a phrase, sentence, paragraph, or an entire text document into smaller units, such as individual words or terms. Each of these smaller units is called a token. 
3. Clean the text - this includes removing stopwords and punctuation, and stemming or lemmatizing the remaining words 
4. Vectorize the text - convert the text to numeric form 
5. Fit/train an ML and/or deep learning model 


In [None]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_colwidth', 100)

from matplotlib import pyplot
import numpy as np
%matplotlib inline

import re
import string
import nltk

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split


# Load the superheroes dataset from a local CSV and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
DATA_PATH = "/Users/amberyandow/Downloads/superheroes_nlp_dataset.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# Keep only the columns used in the rest of the notebook. The explicit .copy()
# makes `df` an independent frame rather than a flagged slice of the full
# dataset, so the later in-place dropna and new-column assignments do not
# raise SettingWithCopyWarning.
df = df[['name', 'history_text', 'creator', 'alignment']].copy()
df.head()

In [None]:
# Discard every row that has a missing value in any of the kept columns,
# then print the summary to confirm the remaining non-null counts.
df.dropna(axis=0, how='any', inplace=True)
df.info()

In [None]:
# Number of rows per value of the target label.
df['alignment'].value_counts()

## Create a pipeline to clean our text 

1. Remove punctuation
2. Tokenization
3. Remove stopwords
4. Lemmatize/Stem

In [None]:
# Shared resources for the cleaning step: a Porter stemmer and NLTK's English
# stopword list. The final expression displays the punctuation characters
# that clean_history strips out.
ps = nltk.PorterStemmer()
stopwords = nltk.corpus.stopwords.words('english')
string.punctuation

In [None]:
def clean_history(history):
    """Turn a history string into a list of stemmed, stopword-free tokens.

    Steps: lowercase the text, strip punctuation, split on runs of non-word
    characters, drop stopwords and empty tokens, then stem what is left.

    Relies on the module-level ``ps`` (stemmer) and ``stopwords`` (collection
    of words to drop).

    Args:
        history: Text of a character history.

    Returns:
        List of stemmed tokens.
    """
    # Lowercase here rather than relying on the caller: when this function is
    # handed to a vectorizer as `analyzer`, it receives the raw text, and
    # capitalised stopwords ("The", "And") would otherwise slip through the
    # membership test below. Lowercasing already-lowercased input is a no-op.
    history = history.lower()
    history = history.translate(str.maketrans('', '', string.punctuation))
    tokens = re.split(r'\W+', history)
    # re.split yields '' at either end when the text starts or ends with a
    # separator; skip those so an empty string never becomes a token.
    return [ps.stem(word) for word in tokens if word and word not in stopwords]

# Store the cleaned token list for each history, computed from the
# lowercased text.
lowercased = df['history_text'].map(str.lower)
df['history_clean'] = lowercased.map(clean_history)



In [None]:
# Preview the frame, now including the history_clean column.
df.head()

## Vectorizing History Text: TF-IDF

In [None]:
# Fit TF-IDF on the histories, using clean_history as the analyzer so the
# vocabulary is built from its output tokens.
tfidf_vect = TfidfVectorizer(analyzer=clean_history)
X_tfidf = tfidf_vect.fit_transform(df['history_text'])
print(X_tfidf.shape)
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate it. .tolist() keeps the printed
# output a plain Python list, as before.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    print(tfidf_vect.get_feature_names_out().tolist())
else:
    print(tfidf_vect.get_feature_names())

#### Vectorizers output sparse matrices

_**Sparse Matrix**: A matrix in which most entries are 0. In the interest of efficient storage, a sparse matrix is stored by keeping only the locations and values of its non-zero elements._

In [None]:
# Expand the sparse TF-IDF matrix into a dense DataFrame with one column per
# vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on releases that predate it.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out()
else:
    feature_names = tfidf_vect.get_feature_names()
X_tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)
X_tfidf_df.head()

## Feature Engineering 

In [None]:
# Length of each history in characters, not counting spaces.
df['history_len'] = df['history_text'].map(lambda text: len(text.replace(" ", "")))
df.head()

In [None]:
# Overlay the normalised history-length distributions of the 'Good' and 'Bad'
# classes. `density=True` replaces the `normed=True` keyword, which has been
# removed from matplotlib's hist().
bins = np.linspace(0, 5000, 100)
for label in ('Good', 'Bad'):
    pyplot.hist(df.loc[df['alignment'] == label, 'history_len'], bins,
                alpha=0.5, density=True, label=label)
pyplot.legend(loc='upper right')
pyplot.show()

In [None]:
def count_punct(text):
    """Return the percentage of non-space characters in *text* that are punctuation.

    Args:
        text: String to inspect.

    Returns:
        Percentage in the range 0-100, rounded to one decimal place. Returns
        0.0 when the text has no non-space characters (empty or all spaces),
        where the ratio would otherwise divide by zero.
    """
    non_space = len(text) - text.count(" ")
    if not non_space:
        return 0.0
    punct = sum(1 for char in text if char in string.punctuation)
    # Scale to a percentage before rounding: rounding the ratio first and then
    # multiplying by 100 reintroduces float noise (e.g. 33.300000000000004).
    return round(100 * punct / non_space, 1)

# Punctuation percentage of each history.
df['percent_punct'] = df['history_text'].map(count_punct)
df.head()

In [None]:
# Overlay the normalised punctuation-percentage distributions of the 'Good'
# and 'Bad' classes. `density=True` replaces the `normed=True` keyword, which
# has been removed from matplotlib's hist().
bins = np.linspace(0, 10, 60)
for label in ('Good', 'Bad'):
    pyplot.hist(df.loc[df['alignment'] == label, 'percent_punct'], bins,
                alpha=0.5, density=True, label=label)
pyplot.legend(loc='upper right')
pyplot.show()

In [None]:
# Distribution of history lengths over all rows.
bins = np.linspace(start=0, stop=4000, num=50)
pyplot.hist(df['history_len'], bins=bins)
pyplot.title("History Length Distribution")
pyplot.show()

In [None]:
# Distribution of punctuation percentage over all rows.
bins = np.linspace(0, 10, 50)

pyplot.hist(df['percent_punct'], bins)
# The title previously read "History Length Distribution", copied from the
# length plot, which mislabelled this figure.
pyplot.title("Punctuation Percentage Distribution")
pyplot.show()

In [None]:
# Plot history_len under the i-th root transformation for i = 1..5 to compare
# how each one reshapes the distribution.
# The loop variable must stay named `i`: the following cell reads it.
for i in range(1, 6):
    transformed = df['history_len'] ** (1 / i)
    pyplot.hist(transformed, bins=50)
    pyplot.title("Transformation: 1/{}".format(i))
    pyplot.show()

In [None]:
# Apply the fifth-root transformation to history_len, rounded to 2 decimals.
# The exponent is written out explicitly instead of reusing the loop variable
# `i` left over from the plotting cell, whose value depends on which cells
# were run and in what order (it is 5 after a straight run-through).
# NOTE: this overwrites the column, so re-running the cell transforms it again.
df['history_len'] = (df['history_len'] ** (1 / 5)).round(2)

In [None]:
# Print a summary of the dense TF-IDF frame.
X_tfidf_df.info()

In [None]:
# Combine the engineered features with the TF-IDF columns, side by side.
# pd.concat(axis=1) aligns rows on index labels. `df` still carries its
# original labels (with gaps left by dropna), while X_tfidf_df was built from
# a bare array and is numbered 0..n-1. Without resetting the index the rows
# are mismatched and the result is padded with NaNs.
engineered = df[['history_len', 'percent_punct']].reset_index(drop=True)
X = pd.concat([engineered, X_tfidf_df], axis=1)
X.head()

In [None]:
# Count missing values per column of the combined feature matrix.
X.isnull().sum()

In [None]:
# 5-fold cross-validated accuracy of a random forest predicting alignment.
# NOTE(review): the model is scored on the TF-IDF columns only (X_tfidf_df);
# the engineered features in X are not used here. Confirm this is intended.
k_fold = KFold(n_splits=5)
rf = RandomForestClassifier(n_jobs=-1)
cross_val_score(
    estimator=rf,
    X=X_tfidf_df,
    y=df['alignment'],
    scoring='accuracy',
    cv=k_fold,
    n_jobs=-1,
)

## RNNs 

#### TF-IDF

![](https://image.slidesharecdn.com/9bc43139-1398-4c31-a9cf-ed08dd37ef13-150521205535-lva1-app6891/95/text-mining-association-rules-and-decision-tree-learning-26-638.jpg?cb=1432241853)

#### Word2Vec
![](https://cdn.analyticsvidhya.com/wp-content/uploads/2019/07/img_8.png)

#### Recurrent Neural Networks 
![](https://www.nexmo.com/wp-content/uploads/2020/10/Recurrent-neural-network.png)