In [None]:
text = "It is a truth universally acknowledged, that a single man in possession of a good fortune, must be in want of a wife. However little known the feelings or views of such a man may be on his first entering a neighbourhood, this truth is so well fixed in the minds of the surrounding families, that he is considered the rightful property of some one or other of their daughters."

In [None]:
import nltk

In [None]:
# Fetch the NLTK resources this notebook relies on, non-interactively, so that
# "Restart Kernel -> Run All" does not stall on the interactive download shell.
NLTK_RESOURCES = (
    "punkt",          # sent_tokenize / word_tokenize models
    "punkt_tab",      # same models, as packaged for newer NLTK releases
    "stopwords",      # stopwords.words('english')
    "wordnet",        # WordNetLemmatizer
    "omw-1.4",        # multilingual WordNet data required by some NLTK releases
    "vader_lexicon",  # SentimentIntensityAnalyzer
    "book",           # corpora behind nltk.book (text1 ... text9)
)
for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

# Text Preprocessing

## Tokenization and Stopwords in NLTK

In [None]:
from nltk import wordpunct_tokenize, word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [None]:
text

In [None]:
sent_tokenize(text)

In [None]:
text2 = 'cross-validation is fun'

In [None]:
word_tokenize(text2)

In [None]:
wordpunct_tokenize(text2)

In [None]:
tokens = wordpunct_tokenize(text)

In [None]:
# Load NLTK's English stopword list once and keep it as a set: the list
# comprehensions below test `word not in sw` for every token, and set
# membership is a hash lookup rather than a scan of the whole list.
sw = set(stopwords.words('english'))

In [None]:
# Drop stopwords from the tokenized text. `tokens` keeps the original casing
# while the NLTK stopword list is lowercase, so compare on the lowercased form;
# otherwise sentence-initial stopwords such as "It" would survive the filter.
# The surviving tokens keep their original casing.
tokens2 = [word for word in tokens if word.lower() not in sw]

In [None]:
tokens2

## Stemming and Lemmatizing

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Instantiate one stemmer and one lemmatizer to reuse in the cells that follow.
ps = PorterStemmer()
wn = WordNetLemmatizer()

In [None]:
print(" ".join(tokens2))

In [None]:
# Porter-stem every stopword-filtered token and show the result on one line.
stemmed = list(map(ps.stem, tokens2))
print(*stemmed, sep=" ")

In [None]:
# Lemmatize every stopword-filtered token and show the result on one line.
lemmed = list(map(wn.lemmatize, tokens2))
print(*lemmed, sep=" ")

### Function for text processing

In [None]:
def text_token(x):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    The text is lowercased and split with ``wordpunct_tokenize``; tokens that
    are not purely alphanumeric or that appear in the stopword set ``sw`` are
    discarded, and each remaining token is passed through ``wn.lemmatize``.
    """
    return [
        wn.lemmatize(word)
        for word in wordpunct_tokenize(x.lower())
        if word.isalnum() and word not in sw
    ]

In [None]:
text_token(text)

# Vectorizing

In [None]:
import pandas as pd
# Load the tab-separated SMS dataset; later cells read its 'message' and
# 'label' columns.
sh = pd.read_csv('smsspamcollection.tsv', sep='\t')

In [None]:
sh.head()

In [None]:
# Tokenize each message, then rebuild a whitespace-joined string per message
# for the vectorizers further down.
sh['tokens'] = sh['message'].map(text_token)
sh['clean'] = sh['tokens'].map(" ".join)

In [None]:
sh.head()

### Count Vectorizer

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# min_df=5: per the scikit-learn docs, an integer min_df drops terms that occur
# in fewer than that many documents.
count_vect = CountVectorizer(min_df=5)

In [None]:
X_counts = count_vect.fit_transform(sh['clean'])

In [None]:
X_counts.shape

In [None]:
X_counts

In [None]:
# print(count_vect.get_feature_names_out())

In [None]:
df_count = pd.DataFrame(X_counts.toarray())

In [None]:
# Label the count columns with the vocabulary terms. `get_feature_names()` was
# removed in scikit-learn 1.2; `get_feature_names_out()` is its replacement.
df_count.columns = count_vect.get_feature_names_out()

In [None]:
df_count.head()

#### Count unigrams and bigrams

In [None]:
# Refit on the cleaned messages with ngram_range=(1,2), i.e. single words and
# adjacent word pairs are both counted as features.
# NOTE(review): this rebinds `count_vect` and `X_counts` from the min_df=5 cell
# above, so anything using those names now sees the n-gram model.
count_vect = CountVectorizer(ngram_range=(1,2))
X_counts = count_vect.fit_transform(sh['clean'])
X_counts.shape

In [None]:
# Dense view of the unigram+bigram counts with one column per vocabulary term.
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` instead.
df_count2 = pd.DataFrame(X_counts.toarray(), columns=count_vect.get_feature_names_out())

In [None]:
df_count2.head()

#### TF-IDF

In [None]:
# TF-IDF weighting of the cleaned messages.
# NOTE(review): the names `count_vect` / `X_counts` are kept from the count
# vectorizer cells even though they now hold a TfidfVectorizer and its
# TF-IDF matrix.
count_vect = TfidfVectorizer()
X_counts = count_vect.fit_transform(sh['clean'])
print(X_counts.shape)
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` instead.
df_tfidf = pd.DataFrame(X_counts.toarray(), columns=count_vect.get_feature_names_out())

In [None]:
df_tfidf.head()

### Text Classification

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [None]:
y = sh['label']

In [None]:
X = sh['clean']

In [None]:
# Hold out a test set using train_test_split's default split size.
# stratify=y keeps the label proportions the same in both parts, and the fixed
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=801, stratify=y)

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Chain TF-IDF vectorization and a multinomial Naive Bayes classifier so that
# fit/predict accept raw strings.
# NOTE(review): the step name 'tdidf' looks like a misspelling of 'tfidf'; it is
# left as-is because step names are part of how the pipeline is addressed.
text_clf = Pipeline([('tdidf', TfidfVectorizer()), 
                     ('nb', MultinomialNB())])

In [None]:
text_clf.fit(X_train, y_train)

In [None]:
predictions = text_clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
confusion_matrix(y_test, predictions)

In [None]:
print(classification_report(y_test, predictions))

# Sentiment Analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
sid = SentimentIntensityAnalyzer()

In [None]:
# Score each raw message with VADER; every entry is the dict returned by
# polarity_scores.
sh['scores'] = sh['message'].apply(sid.polarity_scores)

In [None]:
# Pull the 'compound' entry out of each score dict into its own column.
sh['compound'] = [scores['compound'] for scores in sh['scores']]

In [None]:
sh.head()

In [None]:
import seaborn as sns

In [None]:
# Compare the distribution of compound sentiment scores between the labels.
sns.boxplot(data=sh, x='label', y='compound')

### Word Counts

In [None]:
from nltk.book import *

In [None]:
text6

In [None]:
type(text6)

In [None]:
type(text6)

In [None]:
# Lexical diversity of text6: distinct tokens divided by total tokens.
# Both counts must come from the same text for the ratio to be meaningful.
len(set(text6)) / len(text6)

In [None]:
from nltk.probability import FreqDist

In [None]:
fdist = FreqDist(text6)

In [None]:
fdist.most_common(50)

In [None]:
# Clean text6 for frequency counting: keep alphanumeric tokens, lowercase them,
# and drop stopwords. The stopword test runs on the lowercased form because the
# stopword list is lowercase; filtering before lowercasing lets capitalized
# stopwords such as "The" slip through and dominate the counts.
tokens = [
    tok.lower()
    for tok in text6.tokens
    if tok.isalnum() and tok.lower() not in sw
]

In [None]:
fdist = FreqDist(tokens)

In [None]:
fdist.most_common(20)

In [None]:
# nltk.bigrams returns a one-shot generator; materialize it so that `bigram`
# can be counted (or re-counted when a later cell is re-run) without coming
# back empty.
bigram = list(nltk.bigrams(tokens))

In [None]:
FreqDist(bigram).most_common(20)

In [None]:
# Read the whole file into memory. The encoding is given explicitly so the
# result does not depend on the platform's default codec.
# NOTE(review): assumes PaP.txt is UTF-8 encoded -- adjust if your copy differs.
with open('PaP.txt', 'r', encoding='utf-8') as file:
    pride_text = file.read()

In [None]:
print(pride_text[0:2000])

In [None]:
def text_token(x):
    """Turn raw text into a list of cleaned tokens, without lemmatizing.

    The text is lowercased and split with ``wordpunct_tokenize``; tokens that
    are not purely alphanumeric or that appear in the stopword set ``sw`` are
    discarded.

    NOTE(review): this reuses the name of the lemmatizing ``text_token`` defined
    earlier in the notebook, so from this cell on that earlier version is
    replaced.
    """
    return [
        word
        for word in wordpunct_tokenize(x.lower())
        if word.isalnum() and word not in sw
    ]

In [None]:
words = text_token(pride_text)

In [None]:
FreqDist(words).most_common(20)

In [None]:
FreqDist(nltk.ngrams(words,2)).most_common(20)