## Step 1: Reading data from a URL and Processing it using NLTK Library and Regex

In [None]:
import requests
from nltk import FreqDist
import seaborn as sbn
from nltk.corpus import stopwords
from matplotlib import pyplot

In [None]:
url='https://www.gutenberg.org/files/11/11-0.txt'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, 5xx) right here instead of
# letting an error page be processed as if it were the book text.
alice=requests.get(url, timeout=30)
alice.raise_for_status()

In [None]:
print(alice.content.decode("utf8"))

In [None]:
#words=[i for i in alice.content.decode("utf8").split()]
#or 
words=alice.content.decode("utf8").split()


In [None]:
words

In [None]:
import re

# Normalise typographic apostrophes to a plain ASCII "'".
# The response body is decoded as UTF-8, so the curly apostrophe shows up as the
# single character U+2019.  The three-character sequence U+00E2 U+0080 U+0099
# (the same bytes mis-decoded as Latin-1) is still replaced too, in case the
# text ever arrives in that form.
apostrophe_re = re.compile('\u2019|\u00e2\u0080\u0099')
new_words = [apostrophe_re.sub("'", word) for word in words]
    

In [None]:
import re

# Trim each token to the span that starts at its first ASCII letter/digit and
# ends at its last character that is not a comma, space or full stop.
# The tail is optional so that one-character words such as "a" or "I" survive;
# a pattern that demands both a first and a last character needs at least two
# characters and would drop them.
# Tokens containing no letter or digit at all yield None.
token_re = re.compile(r'[a-zA-Z0-9](?:.*[^, .])?')
final_words = []
for word in new_words:
    found = token_re.search(word)
    final_words.append(found.group(0) if found else None)
    
    

In [None]:
final_words

In [None]:
# Drop None/empty entries and lower-case everything that remains, in one pass
final_words = [token.lower() for token in final_words if token]

In [None]:
#final_text=' '.join(final_words)
final_text=final_words

In [None]:
final_text

In [None]:
#function for getting the most common words 
def barplot_most_common(text, top_n):
    """Draw a bar chart of the ``top_n`` most frequent items in ``text``.

    Args:
        text: Iterable of hashable tokens (for example a list of words).
        top_n: How many of the most frequent tokens to plot.

    Returns:
        Whatever ``sbn.barplot`` returns (a matplotlib Axes per the seaborn
        documentation).
    """
    word_freq = FreqDist(text)

    # Ask for the ranking once and split it into parallel label/count lists.
    most_common = word_freq.most_common(top_n)
    labels = [word for word, _ in most_common]
    counts = [count for _, count in most_common]

    pyplot.figure(figsize=(10, 8))
    # x and y are passed by keyword: recent seaborn releases accept only
    # ``data`` positionally and raise TypeError for barplot(labels, counts).
    plot = sbn.barplot(x=labels, y=counts)

    return plot
    

In [None]:
barplot_most_common(final_text,15)

In [None]:
# Fetch NLTK's English stop-word list, keep it for filtering, and display it
stop_words = stopwords.words('english')
print(stop_words)

In [None]:
# Test membership against a set: each lookup is O(1), whereas scanning the
# stop-word list for every token of the book repeats a linear search each time.
stop_word_set = set(stop_words)
words_filtered = [word for word in final_text if word not in stop_word_set]

In [None]:
barplot_most_common(words_filtered,15)

## Step 2: Tokenization

In [None]:
document = "At nine o'clock I visited him myself. It looks like religious mania, and he'll soon think that he himself is God."

In [None]:
document.split()

In [None]:
from nltk.tokenize import word_tokenize
print(word_tokenize(document))

In [None]:
from nltk.tokenize import sent_tokenize
print(sent_tokenize(document))

In [None]:
message = "i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. #bingewatching #nothingtodo 😎"

In [None]:
word_tokenize(message)

In [None]:
from nltk.tokenize import TweetTokenizer
tweet_tokenizer=TweetTokenizer()

In [None]:
tweet_tokenizer.tokenize(message)
# Hashtags and emoticons/emoji are handled well by TweetTokenizer

In [None]:
from nltk.tokenize import regexp_tokenize
message = "i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. #bingewatching #nothingtodo 😎"
# Raw string so that \w reaches the regex engine untouched; in a plain string
# literal "\w" is an invalid escape sequence and triggers a warning.
# The pattern matches a '#' followed by one or more word characters (a hashtag).
pattern = r"#\w+"

In [None]:
#re.findall(pattern,message)
regexp_tokenize(message,pattern)

## Step 3: Consolidating Pre-processing Steps & Using Them on a Spam vs Ham Use Case

In [None]:
import pandas as pd
import os
# List every file under the Kaggle input directory.  `path` is overwritten on
# each iteration, so after the loop it holds the last file that was visited.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        path = os.path.join(dirname, filename)
        print(path)


In [None]:
df=pd.read_csv(path, sep='\t', names=['label','message'])

In [None]:
df.head()

In [None]:
#Now new preprocessing
from nltk.stem.snowball import SnowballStemmer
stemmer=SnowballStemmer('english')
 
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()    

def preprocess(document, stem=True):
    """Normalise a text message into a space-separated string of reduced words.

    Steps: lower-case the text, tokenise it with ``word_tokenize``, keep the
    first run of word characters from each token (tokens without any, such as
    pure punctuation, are dropped), remove English stop words, and finally
    reduce every word with the module-level ``stemmer`` or ``lemmatizer``.

    Args:
        document: The text to clean.
        stem: If true (the default) apply ``stemmer.stem`` to each word;
            otherwise apply ``lemmatizer.lemmatize`` with ``pos='v'``.

    Returns:
        The processed words joined by single spaces.
    """
    # Lower-casing up front means every later step sees lower-case tokens.
    tokens = word_tokenize(document.lower())

    # Raw string: "\w" in a plain literal is an invalid escape sequence.
    word_re = re.compile(r"\w+")
    matches = (word_re.search(token) for token in tokens)
    words = [found.group(0) for found in matches if found]

    # Build the stop-word set once per call; evaluating stopwords.words() for
    # every single word and scanning the resulting list is needlessly slow.
    english_stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in english_stop_words]

    # Stemming strips word endings by rule and can produce non-words, while
    # lemmatization maps a word to a dictionary base form (its lemma).
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [lemmatizer.lemmatize(word, pos='v') for word in words]

    # Join the words back into a single string
    return " ".join(words)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Pre-process the first 50 messages with stemming enabled
first_fifty = df.iloc[0:50].message
messages = [preprocess(text, stem=True) for text in first_fifty]

# Bag-of-words matrix built from the stemmed messages
vectorizer = CountVectorizer()
model_stem = vectorizer.fit_transform(messages)

In [None]:
messages

In [None]:
model_stem.shape

In [None]:
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement and returns the vocabulary terms.
len(vectorizer.get_feature_names_out())

347 features with stemmer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Pre-process the first 50 messages again, this time lemmatizing instead of stemming
first_fifty = df.iloc[0:50].message
messages = [preprocess(text, stem=False) for text in first_fifty]

# Bag-of-words matrix built from the lemmatized messages
vectorizer = CountVectorizer()
model_lem = vectorizer.fit_transform(messages)

In [None]:
model_lem.shape

349 features with lemmatization

In [None]:
# get_feature_names() was removed from scikit-learn; get_feature_names_out()
# is its replacement and supplies one column label per vocabulary term.
pd.DataFrame(model_lem.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
# Stemming strips word endings by rule, without any linguistic knowledge, whereas lemmatization uses a vocabulary and the word's part of speech to return its dictionary form.

In [None]:
messages

## Step 4: Using TFIDF (short for term frequency–inverse document frequency)

### TFIDF is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

In [None]:
#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
vectorize= TfidfVectorizer()
model_tf_idf= vectorize.fit_transform(messages)

In [None]:
model_tf_idf.shape

In [None]:
# Label the columns with the vocabulary of the TF-IDF vectorizer (`vectorize`)
# that produced this matrix, not the separate CountVectorizer (`vectorizer`).
# get_feature_names_out() replaces the removed get_feature_names().
pd.DataFrame(model_tf_idf.toarray(),columns=vectorize.get_feature_names_out())