In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
def preprocess_data(messages, test_size=0.20, random_state=0):
    """Clean the messages, vectorize them as bag-of-words and split train/test.

    Parameters
    ----------
    messages : pandas.DataFrame
        Must contain a ``'message'`` column (raw text) and a ``'label'``
        column (class label).
    test_size : float, default 0.20
        Fraction of the samples held out for testing.
    random_state : int, default 0
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Dense count matrices and the matching target arrays.
    """
    stemmer = PorterStemmer()

    # Build the stop-word lookup once. Calling stopwords.words() for every
    # token repeats the fetch and does a linear scan of a list each time;
    # a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    # Data cleaning: drop duplicated message texts and renumber the rows.
    messages = messages.drop_duplicates(["message"]).reset_index(drop=True)

    corpus = []
    for text in messages['message']:
        # Keep letters only, lowercase, tokenize on whitespace.
        tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        # Drop stop words, stem what is left, and rebuild the document.
        corpus.append(
            ' '.join(stemmer.stem(token) for token in tokens if token not in stop_words)
        )

    # Data pre-processing and split.
    # NOTE(review): the vectorizer is fitted on the full corpus before the
    # split and is not returned, so the vocabulary also reflects test messages
    # and callers cannot transform new text with it — confirm this is intended.
    cv = CountVectorizer()
    X = cv.fit_transform(corpus).toarray()

    # Target is the second dummy column of 'label'.
    # NOTE(review): presumably the positive class (e.g. 'spam' for ham/spam
    # labels) — this relies on get_dummies' column order; verify against data.
    y = pd.get_dummies(messages['label']).iloc[:, 1].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
def evaluate(y_test, pred, text):
    """Print classification metrics and plot the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels, aligned with ``y_test``.
    text : str
        Suffix appended to the plot title ("Confusion matrix - <text>").

    Prints accuracy, precision, recall and F1 score, then shows a heatmap of
    the confusion matrix. Returns ``None``.
    """
    from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, f1_score
    import matplotlib.pyplot as plt
    import seaborn as sns

    scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 score', f1_score),
    )
    for label, scorer in scorers:
        print('%s: %f' % (label, scorer(y_test, pred)))

    conf_mat = confusion_matrix(y_test, pred)

    # Draw on a dedicated figure/axes. Relying on the implicit "current axes"
    # makes repeated calls within one cell stack heatmaps (and colorbars) on
    # top of each other.
    fig, ax = plt.subplots()
    sns.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d', ax=ax)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title("Confusion matrix - " + text, size=16)

    # Render now so each figure appears right after its own metrics.
    plt.show()
    print()