# Проект по Методи и софтуер за машинно обучение
Студент: Кристиян Кръчмаров <br>
Фак. номер: 791324005 <br>


## Зареждане на данните

In [2]:
import pandas as pd

# Directory that holds the dataset, relative to this notebook.
path = "../Data/"

# Read the e-mail spam CSV into a DataFrame and print its structural summary.
data = pd.read_csv(f"{path}shuffle_email_spam_classification.csv")
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [3]:
# Drop the first column (the 'Email No.' identifier).
data = data.iloc[:, 1:]

# Every column except the last is an explanatory variable; the last is the result.
X, Y = data.iloc[:, :-1], data.iloc[:, -1]

print(X.shape, Y.shape, sep="\n")

(5172, 3000)
(5172,)


## Preprocessing
`StandardScaler` за стандартизиране на данните (нулева средна стойност и единична дисперсия за всеки признак)

In [5]:
from sklearn import preprocessing

# Fit the scaler on X, then transform X with the fitted statistics.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split below, so test-set statistics influence the scaling — consider fitting
# on the training portion only.
scaler = preprocessing.StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Optional sanity checks on the scaled values:
# print("Min value:", X_scaled.min())
# print("Mean value:", X_scaled.mean())
# print("Max value:", X_scaled.max())

In [6]:
from sklearn.model_selection import train_test_split, KFold

# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Shapes of the training features and training labels.
print(f"{X_train.shape} {Y_train.shape}")

(4137, 3000) (4137,)


In [8]:
# Shapes of the test features and test labels.
print(f"{X_test.shape} {Y_test.shape}")

(1035, 3000) (1035,)


Няколко helper метода за улеснение <br>
`getMetrics` е за обучаването на класификатора и извличането на няколко метрики:
accuracy, AUC, TPR, FPR, ConfusionMatrix, ClassificationReport <br>
`plotRocCurve` е за показване на ROC кривата

In [9]:
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report


def getMetrics(classifier, X_train, Y_train, X_test, Y_test):
    """Fit ``classifier`` on the training data and evaluate it on the test data.

    The classifier is fitted in place, so the object passed in is modified.
    ROC and AUC are computed from continuous scores for the positive class:
    the second column of ``predict_proba`` when the estimator provides it,
    otherwise the output of ``decision_function`` (for estimators such as
    ``LinearSVC`` that expose no probabilities). Binary classification is
    assumed.

    Args:
        classifier: Estimator exposing ``fit``, ``predict`` and at least one
            of ``predict_proba`` / ``decision_function``.
        X_train: Training features.
        Y_train: Training labels.
        X_test: Features to evaluate on.
        Y_test: True labels for ``X_test``.

    Returns:
        dict with the keys ``"classifier"`` (the fitted estimator), ``"ACC"``,
        ``"AUC"``, ``"TPR"``, ``"FPR"``, ``"Threshold"`` (the arrays returned
        by ``roc_curve``), ``"ConfusionMatrix"`` and ``"ClassificationReport"``
        (the text report).

    Raises:
        AttributeError: If the classifier provides neither ``predict_proba``
            nor ``decision_function``.
    """
    classifier.fit(X_train, Y_train)

    predicted = classifier.predict(X_test)

    # ROC/AUC need a continuous score; prefer probabilities of the positive
    # class and fall back to the signed decision value when unavailable.
    if hasattr(classifier, "predict_proba"):
        probabilities = classifier.predict_proba(X_test)[:, 1]
    elif hasattr(classifier, "decision_function"):
        probabilities = classifier.decision_function(X_test)
    else:
        raise AttributeError(
            f"{type(classifier).__name__} provides neither predict_proba "
            "nor decision_function; cannot compute ROC/AUC scores"
        )

    fpr, tpr, threshold = roc_curve(Y_test, probabilities)
    auc = roc_auc_score(Y_test, probabilities)

    conf_matrix = confusion_matrix(Y_test, predicted)

    # The dict form is used only to read the overall accuracy; the text form
    # is what gets returned for display.
    report_dict = classification_report(Y_test, predicted, output_dict=True)
    accuracy = report_dict['accuracy']
    cl_report = classification_report(Y_test, predicted, output_dict=False)

    return {
        "classifier": classifier,
        "ACC": accuracy,
        "AUC": auc,
        "TPR": tpr,
        "FPR": fpr,
        "Threshold": threshold,
        "ConfusionMatrix": conf_matrix,
        "ClassificationReport": cl_report,
    }


In [10]:
import matplotlib.pyplot as plt
def plotRocCurve(tpr, fpr):
    """Plot one ROC curve on a new 10x10 figure and display it.

    Note the argument order: true-positive rates come first, false-positive
    rates second. The curve is drawn with FPR on the x-axis and TPR on the
    y-axis, next to a black diagonal from (0, 0) to (1, 1).

    Args:
        tpr: True-positive rates (y values).
        fpr: False-positive rates (x values).
    """
    _, axis = plt.subplots(1, 1, figsize=(10, 10))

    # Small margin around the unit square so the curve does not touch the frame.
    axis_range = [-0.05, 1.05]
    axis.set_xlim(axis_range)
    axis.set_ylim(axis_range)

    # Reference diagonal.
    axis.plot([0, 1], [0, 1], linestyle="-", color='k')
    # The ROC curve itself.
    axis.plot(fpr, tpr, marker="", alpha=0.8)

    plt.show()

In [11]:
def kFoldValidation(classifier, X, Y):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fig = plt.figure()
    fg, axes = plt.subplots(1, 2, figsize=(20, 8))

    axes[0].set_xlim([-0.05, 1.05])
    axes[0].set_ylim([-0.05, 1.05])
    axes[0].plot([0, 1], [0, 1], linestyle = "-", color = 'k')
    axes[0].title.set_text("ROC (Test set)")

    axes[1].set_xlim([-0.05, 1.05])
    axes[1].set_ylim([-0.05, 1.05])
    axes[1].plot([0, 1], [0, 1], linestyle = "-", color = 'k')
    axes[1].title.set_text("ROC (Training set)")

    performance_stats_df = pd.DataFrame(columns=["Fold",
                                            "ACC Test",
                                            "ACC Training",
                                            "ACC ratio",
                                            "AUC Test",
                                            "AUC Training",
                                            "AUC ratio",
                                            "CONF_MTX"])
    i=0
    for trainIndex, testIndex in kf.split(X):
        X_train, X_test = X[trainIndex], X[testIndex]
        Y_train, Y_test = Y[trainIndex], Y[testIndex]

        metrics_train_train = getMetrics(classifier, X_train, Y_train, X_train, Y_train)
        metrics_train_test = getMetrics(classifier, X_train, Y_train, X_test, Y_test)

        axes[0].plot(metrics_train_test["FPR"], metrics_train_test["TPR"], marker = "", color="green", alpha=0.6)
        axes[1].plot(metrics_train_train["FPR"], metrics_train_train["TPR"], marker = "", color="red", alpha=0.6)

        temp = {'Fold': i,
            'ACC Test': metrics_train_test["ACC"]*100,
            'ACC Training': metrics_train_train["ACC"]*100,
            'ACC ratio': metrics_train_test["ACC"]/metrics_train_train["ACC"],
            'AUC Test': metrics_train_test["AUC"],
            'AUC Training': metrics_train_train["AUC"],
            'AUC ratio': metrics_train_test["AUC"]/metrics_train_train["AUC"],
            'CONF_MTX':metrics_train_test["ConfusionMatrix"]}

        performance_stats_df = performance_stats_df.append(temp, ignore_index=True)
        i=i+1

    plt.show()