In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Pre-process data
def preprocess_data(data, test_size=0.20, random_state=0):
    """Clean the raw frame and split it into train/test features and target.

    Every ``'?'`` cell is treated as missing, and any row with a missing value
    in any column is dropped. The input frame itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw data; must contain the columns ``'id'`` and ``'class'``.
    test_size : float or int, default 0.20
        Forwarded to ``train_test_split``.
    random_state : int or None, default 0
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The feature sets hold every
        column except ``'id'`` and ``'class'``; the targets are the values of
        the second indicator column produced by ``pd.get_dummies`` for
        ``'class'``.

    Raises
    ------
    KeyError
        If ``'id'`` or ``'class'`` is not a column of ``data``.
    IndexError
        If fewer than two distinct ``'class'`` values remain after cleaning,
        because there is then no second indicator column to select.
    """
    # Drop junk data: '?' is the missing-value placeholder.
    # NOTE(review): columns that contained '?' presumably stay object dtype
    # after this step -- confirm whether downstream models need numeric casts.
    cleaned = data.replace('?', np.nan).dropna().reset_index(drop=True)

    # Feature X and target y
    X = cleaned.drop(['id', 'class'], axis=1)
    # One indicator column per distinct class value; the second one is used as
    # the positive label.
    y = pd.get_dummies(cleaned['class']).iloc[:, 1].values

    # Train Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [3]:
def evaluate(y_test, pred, text, ax=None):
    """Print classification metrics and draw the confusion matrix heatmap.

    Prints accuracy, precision, recall and F1 (each computed by the
    scikit-learn function of that name with its default settings), then plots
    the confusion matrix as an annotated seaborn heatmap.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels.
    text : str
        Appended to ``"Confusion matrix - "`` to form the plot title.
    ax : matplotlib.axes.Axes, optional
        Axes to draw the heatmap on. When omitted, the heatmap is drawn on the
        current axes, so calling this function several times within one
        figure stacks the heatmaps on top of each other; pass a dedicated
        ``ax`` (e.g. from ``plt.subplots``) to avoid that.

    Returns
    -------
    None
    """
    # Imports are kept local to the function, as in the original cell.
    from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix, f1_score
    import seaborn as sns

    metrics = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 score', f1_score),
    )
    for label, scorer in metrics:
        print('%s: %f' % (label, scorer(y_test, pred)))

    conf_mat = confusion_matrix(y_test, pred)
    # heatmap returns the axes it drew on (the given ``ax`` or the current
    # one), so the labels always land on the same axes as the matrix.
    ax = sns.heatmap(conf_mat, annot=True, cmap="Blues", fmt='d', ax=ax)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_title("Confusion matrix - " + text, size=16)
    print()