In [None]:
import numpy as np
import pandas as pd

# Load dataset

In [None]:
# Read the e-mail dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../dataset/emails.csv')
# Preview the first rows only instead of rendering the whole frame.
df.head()

In [None]:
# (rows, columns) of the frame as loaded, i.e. before duplicates are dropped below.
df.shape

In [None]:
# Column labels of the loaded frame ('text' and 'spam' are the ones used below).
df.columns

# Preprocessing data

In [None]:
# Drop exact duplicate rows and rebuild a contiguous 0..n-1 index. Without the reset the
# index keeps gaps where rows were removed, so any positionally-built result assigned back
# to the frame later would be aligned against the wrong labels. Assigning the result
# (rather than inplace=True) keeps the step an explicit expression.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Number of missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# Select the 'text' column as the feature source.
# NOTE: the name X is rebound further down (first to the TF-IDF matrix, then to a dense
# DataFrame), so it only refers to this Series until those cells run.
X = df['text']
X

In [None]:
# Select the 'spam' column as the target; below it is both the stratification key of
# the train/test split and the label passed to the classifiers.
y = df['spam']
y

 -------------------------------------------------------

# CountVectorizer

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# cv = CountVectorizer()

In [None]:
# X = cv.fit_transform(X)

In [None]:
# cv.get_feature_names_out()

In [None]:
# my_df = pd.DataFrame(data=X.toarray(), columns=cv.get_feature_names_out())

In [None]:
# my_df['isSpam'] = y.tolist()

In [None]:
# my_df

In [None]:
# X = pd.DataFrame(X.toarray()) # convert to a DataFrame

--------------------------------------------------------------

# TF-IDF matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectorizer with otherwise default settings.
# NOTE: `tf` here is this vectorizer object, not a TensorFlow alias.
tf = TfidfVectorizer(max_df=0.8) # ignore terms that appear in more than 80% of the documents

In [None]:
# Learn the vocabulary / IDF weights and vectorize the corpus in one pass.
# The text is read straight from df rather than from X, because this very line rebinds
# X: with `tf.fit_transform(X)` a second execution of the cell would feed the sparse
# matrix back into the vectorizer and fail.
# NOTE(review): the vectorizer is fitted on every row, before any train/test split —
# confirm that is acceptable for however X is used downstream.
X = tf.fit_transform(df['text'])

In [None]:
# Vocabulary terms learned by the vectorizer; used below as the column labels of my_df.
tf.get_feature_names_out()

In [None]:
# Densify the TF-IDF matrix into a frame with one column per vocabulary term.
my_df = pd.DataFrame(
    X.toarray(),
    columns=tf.get_feature_names_out(),
)

In [None]:
# Attach the labels by position: a bare array carries no index, so pandas does not try
# to align it against my_df's row labels.
my_df['isSpam'] = y.to_numpy()

In [None]:
# Column labels of the TF-IDF frame: the vocabulary terms plus the appended 'isSpam'.
my_df.columns

In [None]:
# Preview a few rows only; the frame has one column per vocabulary term, so rendering
# it in full is slow and unreadable.
my_df.head()

In [None]:
# Dense copy of the TF-IDF features with default integer column labels.
# The matrix is re-derived from the already fitted vectorizer and the source text rather
# than from X itself: `pd.DataFrame(X.toarray())` only works the first time, because after
# that X is a DataFrame and has no .toarray(). transform() on the same documents the
# vectorizer was fitted on yields the same matrix as the original fit_transform().
X = pd.DataFrame(tf.transform(df['text']).toarray())

----------------------------------------------

In [None]:
import gensim

# Tokenise every message with gensim's simple_preprocess, which turns one string into a
# list of tokens. The source has to be df['text']: by this point X has been rebound to
# the numeric TF-IDF DataFrame, and DataFrame.apply would hand whole float columns to
# the tokenizer instead of message strings.
df['clean message'] = df['text'].apply(gensim.utils.simple_preprocess)
df.head()

-------------------------------------------------------

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the tokenised messages for testing. Stratifying on y keeps the class
# ratio equal in both parts; the fixed seed makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean message'],
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Size of the training part. X_train is a Series of token lists here, so the shape
# has a single dimension.
X_train.shape

In [None]:
# Size of the held-out test part (20% of the rows, per test_size above).
X_test.shape

# Word2Vec

In [None]:
# Train the embeddings on the training messages only, so test tokens never influence them.
# vector_size=200 -> 200-dimensional word vectors; min_count=5 -> tokens seen fewer than
# five times are left out of the vocabulary.
# NOTE(review): window=700 makes the context span essentially a whole message — confirm
# this is intended rather than a typo for a much smaller window.
w2v_model = gensim.models.Word2Vec(
    sentences=X_train, vector_size=200, window=700, min_count=5
)

In [None]:
def _embed_documents(docs, keyed_vectors, vocab):
    """Look up the word vector of every in-vocabulary token of every document.

    Parameters
    ----------
    docs : iterable of token lists (one list per document); must support len().
    keyed_vectors : mapping from token to its embedding vector.
    vocab : set of tokens that have an embedding; other tokens are skipped.

    Returns
    -------
    1-D object array with one entry per document. Each entry is an
    (n_known_tokens, vector_size) array, or an empty array when the document
    contains no known token.
    """
    # Pre-allocating the object array and filling it slot by slot always yields one
    # entry per document; np.array(list_of_arrays, dtype=object) instead tries to infer
    # a common shape and builds a multi-dimensional array when the lengths happen to match.
    embedded = np.empty(len(docs), dtype=object)
    for pos, tokens in enumerate(docs):
        embedded[pos] = np.array([keyed_vectors[tok] for tok in tokens if tok in vocab])
    return embedded


# Set membership is O(1); index_to_key lists every token the model kept after min_count.
words = set(w2v_model.wv.index_to_key)
X_train_vect = _embed_documents(X_train, w2v_model.wv, words)
X_test_vect = _embed_documents(X_test, w2v_model.wv, words)

In [None]:
# Per training message: number of tokens vs. number of tokens that received a vector.
for tokens, vectors in zip(X_train, X_train_vect):
    print(len(tokens), len(vectors))

In [None]:
def _average_vectors(doc_vectors, dim):
    """Reduce each document's word-vector matrix to a single mean vector.

    Parameters
    ----------
    doc_vectors : iterable of arrays, one (n_tokens, dim) array per document;
        an empty array marks a document without any known token.
    dim : length of the zero vector used for such empty documents.

    Returns
    -------
    list with one 1-D array of length `dim` per document.
    """
    return [
        vectors.mean(axis=0) if vectors.size else np.zeros(dim, dtype=float)
        for vectors in doc_vectors
    ]


# The fallback for empty documents must have the same length as the trained vectors.
# The hard-coded 100 did not match vector_size=200 and produced rows of unequal length,
# which the classifier cannot stack into a feature matrix.
X_train_vect_avg = _average_vectors(X_train_vect, w2v_model.wv.vector_size)
X_test_vect_avg = _average_vectors(X_test_vect, w2v_model.wv.vector_size)

In [None]:
# Per training message: number of tokens vs. length of its averaged vector.
for tokens, avg_vector in zip(X_train, X_train_vect_avg):
    print(len(tokens), len(avg_vector))

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so repeated runs grow the same trees and report the same metrics
# (same seed value as the train/test split).
rf = RandomForestClassifier(random_state=42)
# fit() returns the estimator itself, so rf_model and rf refer to the same object.
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [None]:
# Random-forest class predictions for the held-out messages.
# NOTE: the name y_pred is reused further down for the MLP's outputs.
y_pred = rf_model.predict(X_test_vect_avg)

In [None]:
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Accuracy is the share of test rows whose prediction equals the label.
n_correct = (y_pred==y_test).sum()
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(n_correct/len(y_pred), 3)))

-------------------------------------------------------------------

# MLP

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

In [None]:
# Feature matrices for the MLP. After the train/test split, X_train / X_test are Series
# of token lists: they have no second dimension (X_train.shape[1] raises) and cannot be
# consumed by Dense layers. They are rebound here to the averaged Word2Vec document
# vectors, so this cell and the fit / evaluate / predict cells that follow all receive
# a 2-D float array aligned with y_train / y_test.
# Caveat: because the names are rebound, re-run the split cell before re-running any of
# the Word2Vec cells.
# NOTE(review): if the MLP was meant to train on the TF-IDF features instead, build the
# matrices from that representation here — confirm the intended input.
X_train = np.asarray(X_train_vect_avg, dtype=float)
X_test = np.asarray(X_test_vect_avg, dtype=float)

# Three hidden ReLU layers (100 -> 64 -> 128 units) and one sigmoid unit that outputs
# the probability of the positive class. Only the first layer's weights carry the
# L1 penalty.
model = Sequential()
model.add(Dense(input_dim=X_train.shape[1], units=100, kernel_regularizer= regularizers.L1(l1=1e-5), activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))

In [None]:
# Layer-by-layer overview of the network defined above.
model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is reported
# alongside the loss for every epoch.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 40 epochs in mini-batches of 64. validation_split=0.2 asks Keras to hold out
# 20% of the training rows for validation, which produces the val_* series plotted below.
history = model.fit(X_train, y_train, batch_size=64, epochs=40, verbose=1, validation_split=0.2)

In [None]:
from matplotlib import pyplot as plt

In [None]:
epochs = history.epoch
# Label the curves and the axes so the figure can be read on its own, instead of
# relying on the default colour order to tell training from validation.
plt.plot(epochs, history.history['loss'], label='train')
plt.plot(epochs, history.history['val_loss'], label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('loss')
plt.legend()
plt.show()

In [None]:
# Same layout as the loss figure: labelled curves, labelled axes, explicit legend.
plt.plot(epochs, history.history['accuracy'], label='train')
plt.plot(epochs, history.history['val_accuracy'], label='validation')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.title('accuracy')
plt.legend()
plt.show()

In [None]:
# Loss and accuracy (the metrics given to compile) on the held-out test data.
model.evaluate(X_test, y_test, batch_size=64)

----------------------------------------------------

# METRICS

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score

In [None]:
y_pred = model.predict(X_test)

# Threshold the sigmoid outputs: every value that is not below 0.5 maps to class 1.
y_pred_new = (~(y_pred.ravel() < 0.5)).astype(int).tolist()
y_test_new = y_test.tolist()


# Rows are the true classes, columns the predicted ones.
confusion_matrix(y_test_new, y_pred_new)

In [None]:
# classification_report returns one multi-line string. Printing it renders the table
# with real line breaks; as a bare last expression it is shown as an escaped repr.
print(classification_report(y_test_new, y_pred_new))

In [None]:
# Precision of the thresholded MLP predictions on the test labels.
precision_score(y_test_new, y_pred_new)