In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

import sklearn
from sklearn.metrics import precision_recall_fscore_support, classification_report,confusion_matrix, precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn import utils  
from sklearn import svm
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
from keras import backend as K
import keras
from keras.models import Model
from keras.layers import Dense, Input
from sklearn.model_selection import GridSearchCV





In [None]:
# Read the generated_v3 feature table and preview its first rows.
# Later cells turn this frame into `positive_samples` (label 1).
origin_data = pd.read_csv(filepath_or_buffer="../input/processeddst2/generated_v3.csv")
origin_data.head(n=5)

Split the `id` column into the call ID and the round number; only the round is kept.


In [None]:
# `id` is split on the first "|" into a call-id part and a round part; only
# the round part is kept. `n=1` is passed by keyword because pandas >= 2.0
# no longer accepts it positionally.
origin_data["round"] = origin_data["id"].str.split("|", n=1, expand=True)[1]

# Drop the raw identifier and make the numeric casts stick: `astype` returns a
# new object, so calling it without assigning the result changes nothing.
origin_data = origin_data.drop(columns=["id"]).astype(
    {
        "round": "float32",
        "output-sentiment": "float32",
        "input-sentiment": "float32",
        "slot_in_round": "float32",
    }
)
origin_data.head(5)


In [None]:
# Load the second feature table; later cells label these rows -1.
failed_data = pd.read_csv("../input/processeddst2/generated_test_v3.csv")

# Keep only the round part of "<call-id>|<round>". `n=1` is passed by keyword
# because pandas >= 2.0 no longer accepts it positionally.
failed_data["round"] = failed_data["id"].str.split("|", n=1, expand=True)[1]

# `astype` returns a new object, so the result has to be assigned for the
# casts to take effect. The mid-cell `head(5)` was never displayed and is dropped.
failed_data = failed_data.drop(columns=["id"]).astype(
    {
        "round": "float32",
        "output-sentiment": "float32",
        "input-sentiment": "float32",
        "slot_in_round": "float32",
    }
)
failed_samples = failed_data.to_numpy("float32")

# First idea: RNN

In [None]:
# Stack both sample sets into one feature matrix; rows from origin_data are
# labelled 1 and rows from failed_data are labelled -1.
positive_samples = origin_data.to_numpy("float32")
X = np.concatenate([positive_samples, failed_samples])
Y = [1] * len(positive_samples) + [-1] * len(failed_samples)

# Stratify so both splits keep the class ratio, and fix the seed so the
# metrics reported below can be reproduced on a fresh kernel.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same transform
# to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transformed = scaler.transform(X_test)




In [None]:
from imblearn.over_sampling import SMOTE

# Resample the scaled training split with SMOTE (fixed seed); the test split
# is left untouched.
sm = SMOTE(random_state=42)
X_res, Y_res = sm.fit_resample(X=X_train_transformed, y=Y_train)



In [None]:
from sklearn.neighbors import NearestCentroid

# Baseline: nearest-centroid classifier trained on the resampled data and
# evaluated on the scaled test split.
clf = NearestCentroid().fit(X_res, Y_res)
Y_pred = clf.predict(X_test_transformed)

print(classification_report(y_true=Y_test, y_pred=Y_pred))
print(confusion_matrix(y_true=Y_test, y_pred=Y_pred))


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameter grid for the tree. "auto" is left out of max_features: for
# DecisionTreeClassifier it was an alias of "sqrt" (so it only duplicated
# work) and it was removed in scikit-learn 1.3.
tree_para = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1, 20),
    "splitter": ["best", "random"],
    "min_samples_split": range(2, 20),
    "max_features": ["sqrt", "log2"],
}

# splitter="random" and feature subsampling make the tree stochastic, so the
# estimator is seeded to keep the search reproducible.
clf = GridSearchCV(
    DecisionTreeClassifier(class_weight="balanced", random_state=42),
    tree_para,
    cv=5,
    scoring="roc_auc",
)

clf.fit(X_res, Y_res)
Y_pred = clf.predict(X_test_transformed)

print(classification_report(Y_test, Y_pred))
print(confusion_matrix(Y_test, Y_pred))

Test-set results for each `scoring` option tried. P (precision) and recall refer to the first confusion-matrix row.

ROC_AUC: P = 0.3, recall = 0.6

    [[  3   2]
     [  7 275]]

f1: P = 0.29, recall = 0.4

    [[  2   3]
     [  5 277]]

precision: P = 0.1, recall = 1.0

    [[  5   0]
     [ 44 238]]

recall: P = 0, recall = 0

    [[  0   5]
     [  7 275]]

Using SMOTE

------------------------------------------

Not using SMOTE

recall: P = 0.25, recall = 0.2

    [[  1   4]
     [  3 279]]

precision: P = 0.09, recall = 0.8

    [[  4   1]
     [ 42 240]]

f1: P = 0.31, recall = 0.8

    [[  4   1]
     [  9 273]]

roc_auc: P = 0.1, recall = 0.8
 
 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forests bootstrap rows and subsample features, so the estimator is
# seeded; without it the report below changes on every run.
clf = RandomForestClassifier(random_state=42)

clf.fit(X_res, Y_res)
Y_pred = clf.predict(X_test_transformed)

print(classification_report(Y_test, Y_pred))
print(confusion_matrix(Y_test, Y_pred))

In [None]:
# NOTE(review): this loader is disabled by wrapping it in a string literal, so
# running the cell does not define `test_data`. Later cells still read
# `test_data` (the OneClassSVM cells) and will raise NameError on a fresh
# kernel unless it is defined elsewhere. If re-enabled, the discarded
# `.astype(...)` calls below have no effect and `str.split('|', 1, ...)` needs
# `n=1` as a keyword on pandas >= 2.0.
'''
test_data = pd.read_csv("../input/processeddst2/generated_val_v2.csv")
test_data[['call-id', 'round']] = test_data['id'].str.split('|', 1, expand=True)
del test_data["call-id"]
del test_data["id"]
test_data["round"].astype('float32')
test_data["output-sentiment"].astype('float32')
test_data["input-sentiment"].astype('float32')
test_data["slot_in_round"].astype('float32')
test_data.head(5)
test_data = test_data.to_numpy("float32")
'''

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Hyper-parameter grid: ensemble sizes 1-9 crossed with both boosting algorithms.
boost_classifier = {
    'n_estimators': range(1, 10),
    "algorithm": ["SAMME", "SAMME.R"],
}

# NOTE(review): scoring="recall" scores the class labelled 1 by default, while
# the failed samples are labelled -1 in Y — confirm this is the intended target.
clf = GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid=boost_classifier,
    scoring="recall",
    cv=5,
)
clf.fit(X_res, Y_res)
Y_pred = clf.predict(X_test_transformed)

print(classification_report(y_true=Y_test, y_pred=Y_pred))
print(confusion_matrix(y_true=Y_test, y_pred=Y_pred))





In [None]:


# Stop training once the training loss has not improved for 5 consecutive epochs.
early_stop = keras.callbacks.EarlyStopping(monitor="loss", min_delta=0, patience=5, mode="auto")

# The models below are trained on positive_samples (built from origin_data),
# so the layer width is taken from that array instead of a hard-coded 17 that
# silently breaks when the feature set changes.
input_dim = positive_samples.shape[1]

def inner_tanh(x):
    """Custom activation: 1/2 plus a scaled sum of shifted tanh terms.

    Adds tanh(x - j/N) for j = 1 .. N-2 (N = 4, so j = 1 and 2), scales the
    sum by 1 / (2 * (k - 1)) with k = 3, and offsets the result by 1/2.
    """
    k = 3
    N = 4
    scale = 1/(2*(k-1))

    shifted_sum = 0
    for j in range(1, N-1):
        shifted_sum = shifted_sum + K.tanh(x- (j/N))

    return 1/2 + scale * shifted_sum


def get_RNNmodel():
    """Build and compile the dense network with the custom middle activation.

    Layer widths are input_dim -> input_dim//2 -> input_dim//4 -> input_dim//2
    -> input_dim; the middle layer uses `inner_tanh`, the outer hidden layers
    use tanh and the output layer uses sigmoid. Compiled with binary
    cross-entropy and Adam.

    NOTE(review): a sigmoid output with binary cross-entropy presumes targets
    in [0, 1] — confirm the training inputs are scaled accordingly.
    """
    hidden_layers = (
        (input_dim//2, "tanh"),
        (input_dim//4, inner_tanh),
        (input_dim//2, "tanh"),
    )

    inp = Input(shape=(input_dim, ))
    tensor = inp
    for units, activation in hidden_layers:
        tensor = Dense(units, activation=activation)(tensor)
    outp = Dense(input_dim, activation="sigmoid")(tensor)

    model = Model(inputs=inp, outputs=outp)
    model.compile(optimizer="adam", loss="binary_crossentropy")
    return model


# Build the network and show its layer summary.
RNNmodel = get_RNNmodel()
RNNmodel.summary()

# Train the model to reproduce its own input (target == input) on the positive
# samples only, stopping early via `early_stop`; then show the last epoch's loss.
hist = RNNmodel.fit(
    x=positive_samples,
    y=positive_samples,
    epochs=1000,
    callbacks=[early_stop],
    verbose=0,
)
final_loss = hist.history["loss"][-1]
final_loss

In [None]:
# Build AE solution

def get_AEmodel():
    """Build and compile a single-hidden-layer autoencoder.

    The input of width input_dim is encoded by a ReLU layer of width
    input_dim//2 and decoded by a sigmoid layer of width input_dim. Compiled
    with binary cross-entropy and Adam.
    """
    inp = Input(shape=(input_dim, ))
    bottleneck = Dense(units=input_dim//2, activation="relu")(inp)
    reconstruction = Dense(units=input_dim, activation="sigmoid")(bottleneck)

    model = Model(inputs=inp, outputs=reconstruction)
    model.compile(optimizer="adam", loss="binary_crossentropy")
    return model


# Build the autoencoder and show its layer summary.
AEmodel = get_AEmodel()
AEmodel.summary()

# Train it to reconstruct the positive samples (target == input), stopping
# early via `early_stop`; then show the last epoch's loss.
hist = AEmodel.fit(
    x=positive_samples,
    y=positive_samples,
    epochs=1000,
    callbacks=[early_stop],
    verbose=0,
)
final_loss = hist.history["loss"][-1]
final_loss



# SVM model

In [None]:
# NOTE(review): `train_data` and `test_data` are not defined by any runnable
# cell in this notebook (the test_data loader is disabled), so this cell
# raised NameError on a fresh kernel. As a stand-in, hold out part of the
# positive samples for evaluation and fit on the rest — confirm this matches
# the intended experiment, or replace with the original data source.
train_data, test_data = train_test_split(positive_samples, random_state=42)

success_numbers = len(test_data)
failed_numbers = len(failed_samples)

# Expected outlier share of the evaluation set. OneClassSVM requires nu in
# (0, 1], so the ratio is capped at 1.
outliers_fraction = min(1.0, failed_numbers / success_numbers)

SVMmodel = svm.OneClassSVM(kernel='rbf', nu=outliers_fraction, gamma=0.1)
SVMmodel.fit(train_data)


In [None]:
# Evaluate the OneClassSVM on held-out positives plus all failed samples.
from sklearn.metrics import confusion_matrix,roc_auc_score,classification_report

# Evaluation matrix: test_data rows first (label 1), then failed_samples (label -1).
X = np.concatenate([test_data, failed_samples])
Y_true = [1] * len(test_data) + [-1] * failed_numbers

Y_pred = SVMmodel.predict(X)

confusion_matrix(y_true=Y_true, y_pred=Y_pred)


In [None]:
# ROC AUC is a ranking metric and needs a continuous score: feeding it the
# hard +1/-1 predictions collapses the curve to a single operating point.
# decision_function is larger for inliers, which matches label 1 being the
# positive class for roc_auc_score.
roc_auc_score(Y_true, SVMmodel.decision_function(X))

In [None]:
# Per-class precision / recall / F1 for the current Y_pred.
print(classification_report(y_true=Y_true, y_pred=Y_pred))

In [None]:
# predict on  RNN

from collections import defaultdict
from sklearn.metrics import mean_squared_error


def analyze_outlier(df, outlier_label=-1, top_n=20):
    """Print how many true outliers appear among the highest-scored rows.

    Rows are ranked by the "OF" column in descending order, and the true
    outliers (rows whose "Y" equals `outlier_label`) within the first `top_n`
    rows are counted and reported as a share of all true outliers in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an "OF" column (outlier factor) and a "Y" column (label).
    outlier_label : scalar, default -1
        Value of "Y" that marks an outlier. This notebook labels failed
        samples -1; the previous hard-coded 0 never occurs in those labels,
        so the count was always zero.
    top_n : int, default 20
        Number of top-ranked rows to inspect.

    Returns
    -------
    None
        The summary is printed.
    """
    ranked = df.sort_values(by=['OF'], ascending=False)
    is_outlier = ranked['Y'] == outlier_label

    # The denominator comes from the data instead of a fixed 30.
    total_outliers = int(is_outlier.sum())
    found = int(is_outlier.head(top_n).sum())

    # Guard against frames that contain no outliers at all.
    share = (found / total_outliers) * 100 if total_outliers else 0.0

    print(
        "Within the top {} ranked cases (ranked according to the Outlier Factor), "
        "{} of the outlier cases, comprising {:.1f}% of all {} outlier cases, "
        "were identified.".format(top_n, found, share, total_outliers)
    )


def calculate_outlier_factor(X, Y, pred):
    """Compute a per-row outlier factor as the reconstruction MSE.

    Parameters
    ----------
    X : array-like, 2-D
        Original rows.
    Y : sequence
        Label for each row of `X`; indexed positionally.
    pred : array-like, same shape as `X`
        Model reconstruction of `X`.

    Returns
    -------
    collections.defaultdict
        Maps row index -> {"OF": mean squared error between X[i] and pred[i],
        "Y": Y[i]}.

    Raises
    ------
    ValueError
        If `X` and `pred` do not have the same shape.
    """
    X = np.asarray(X)
    pred = np.asarray(pred)
    if X.shape != pred.shape:
        raise ValueError(
            "X and pred must have the same shape, got {} and {}".format(X.shape, pred.shape)
        )

    # One vectorized pass replaces a scikit-learn call (with its input
    # validation) per row. The difference is taken in float64 so float32
    # inputs do not lose precision when squared and averaged.
    per_row_mse = np.mean((X.astype(np.float64) - pred) ** 2, axis=1)

    outlier_factors = defaultdict(dict)
    for i, error in enumerate(per_row_mse):
        outlier_factors[i]["OF"] = float(error)
        outlier_factors[i]["Y"] = Y[i]
    return outlier_factors



# Reconstruct every row of X with the RNN model, score each row by its
# reconstruction error, and summarise the top-ranked rows.
Y_pred = RNNmodel.predict(X)
outlier_factors = calculate_outlier_factor(X=X, Y=Y_true, pred=Y_pred)
df = pd.DataFrame.from_dict(data=outlier_factors, orient="index")
analyze_outlier(df)
    

    
    


In [None]:

# Same analysis with the autoencoder: reconstruct X, score each row by its
# reconstruction error, and summarise the top-ranked rows.
Y_pred = AEmodel.predict(X)
outlier_factors = calculate_outlier_factor(X=X, Y=Y_true, pred=Y_pred)
df = pd.DataFrame.from_dict(data=outlier_factors, orient="index")
analyze_outlier(df)
    


In [None]:
# IsolationForest

from sklearn.ensemble import IsolationForest

# The forest is built from random sub-samples and random splits, so it is
# seeded to make the evaluation below reproducible across runs.
ISOModel = IsolationForest(random_state=42)
ISOModel.fit(train_data)




In [None]:
# Predict labels for the evaluation matrix with the isolation forest and
# report per-class metrics.
Y_pred = ISOModel.predict(X)
print(classification_report(y_true=Y_true, y_pred=Y_pred))


In [None]:
# Local Outlier Factor

from sklearn.neighbors import LocalOutlierFactor

# novelty=True so the fitted model can later call predict() on new data.
LOF = LocalOutlierFactor(novelty=True, n_neighbors=5)
LOF.fit(train_data)


In [None]:
# Predict labels for the evaluation matrix with the LOF model and report
# per-class metrics.
Y_pred = LOF.predict(X)
print(classification_report(y_true=Y_true, y_pred=Y_pred))
