In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
def preprocess_data(df):
    """Clean the ``text`` column, build TF-IDF features and split into train/test sets.

    Steps, in order:
      1. Drop rows whose ``text`` value is duplicated (first occurrence is kept).
      2. For each text: replace every non-letter with a space, lowercase,
         drop NLTK English stop words and Porter-stem the remaining words.
      3. Vectorize the cleaned corpus with TF-IDF (unigrams and bigrams).
      4. Label-encode the ``category`` column.
      5. Split features and labels 80/20 with a fixed ``random_state``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``text`` column and a ``category`` column. The caller's
        frame is not modified; de-duplication works on a new frame.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Dense TF-IDF feature arrays and encoded label arrays for the training
        and test partitions, as produced by ``train_test_split``.
    """
    stemmer = PorterStemmer()

    # Build the stop-word lookup once. Calling stopwords.words() per token
    # produces a new list each time and the membership test on a list is a
    # linear scan, which is loop-invariant work; a set gives the same
    # filtering result with constant-time lookups.
    english_stop_words = set(stopwords.words('english'))
    non_letters = re.compile('[^a-zA-Z]')

    # Data cleaning

    # Remove duplicates
    df = df.drop_duplicates(["text"]).reset_index(drop=True)

    corpus = []
    for text in df['text']:
        words = non_letters.sub(' ', text).lower().split()
        stemmed = [stemmer.stem(word) for word in words if word not in english_stop_words]
        corpus.append(' '.join(stemmed))

    # Data pre-processing and split

    cv = TfidfVectorizer(sublinear_tf=True, min_df=5, ngram_range=(1, 2), stop_words='english')
    # Densified to keep the original return type (ndarray rather than sparse matrix).
    X = cv.fit_transform(corpus).toarray()

    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(df['category'])

    # Train Test Split

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

    return X_train, X_test, y_train, y_test

In [3]:
def evaluate(y_test, pred, text, ax=None):
    """Print accuracy and a classification report, then plot the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels, aligned with ``y_test``.
    text : str
        Suffix appended to the plot title ("Confusion matrix - <text>").
    ax : matplotlib.axes.Axes, optional
        Axes to draw the heatmap on. When omitted, a new figure is created so
        that repeated calls within one cell do not draw on top of each other.

    Returns
    -------
    None
        Results are printed and plotted; nothing is returned.
    """
    # Imported locally so the function works regardless of which notebook
    # cells have been run; these names are not part of the top import cell.
    from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns

    accuracy = accuracy_score(y_test, pred)
    print('Accuracy: %f' % accuracy)

    print("Classification Report:")
    print(classification_report(y_test, pred, digits=3))

    conf_mat = confusion_matrix(y_test, pred)

    # Draw on explicit axes instead of the implicit "current" axes: with the
    # pyplot state machine a second call in the same cell would stack another
    # heatmap and colorbar onto the previous plot.
    if ax is None:
        _, ax = plt.subplots()
    sns.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d', ax=ax)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title("Confusion matrix - " + text, size=16)
    print()