In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned e-mail dataset from CSV; later cells read its 'email' and 'label' columns.
df = pd.read_csv('./basicstats3_cleaned_enron.csv')
# Preview the first rows.
df.head()

In [None]:
#Stopwords
# Download NLTK's 'stopwords' resource, then load the English list.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words held in a set.
stop = set(stopwords.words('english')) 

In [None]:
# Text preprocessing tools: regular expressions and an English Snowball stemmer.
import re
from nltk.stem import SnowballStemmer

snow = SnowballStemmer('english')


In [None]:
def preprocessing_words(sentence):
    """Normalise one raw e-mail string and return its stemmed tokens.

    Steps, in order: lower-case; replace HTML-like tags with a space; replace
    http(s) URLs with ``httpaddr``; delete ``? | ! ' " #`` and turn
    ``. | , ) ( /`` into spaces; replace e-mail addresses with ``emailaddr``;
    replace digit runs with ``number``; replace ``$`` runs with ``dollar``;
    replace remaining non-alphanumerics with spaces; blank out the literal
    text ``subject``; drop English stop words and stem what is left.

    Parameters
    ----------
    sentence : str
        Raw text of a single e-mail.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.

    Notes
    -----
    Relies on the notebook-level ``snow`` stemmer and ``stop`` stop-word set.
    """
    text = sentence.lower()
    # Remove html
    text = re.sub(r'<.*?>', ' ', text)
    # Normalize URLs (raw strings avoid the invalid-escape warning for \s)
    text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)
    # Remove punctuation
    text = re.sub(r'[?|!|\'|"|#]', r'', text)
    text = re.sub(r'[.|,|)|(|\|/]', r' ', text)
    # Normalize email addresses.
    # NOTE(review): the '.' before 'com' is unescaped, so it also matches the
    # space that the punctuation step put in place of the dot. Escaping it
    # would stop addresses matching unless this step moves above punctuation
    # removal.
    text = re.sub(r'[^\s]+@[^\s]+.com', 'emailaddr', text)
    # Normalize numbers
    text = re.sub(r'[0-9]+', 'number', text)
    # Normalize money
    text = re.sub(r'[$]+', 'dollar', text)
    # Remove non-words
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    # Remove the "subject" header word (matches anywhere, including inside words)
    text = re.sub(r'subject', ' ', text)
    # Stop-word removal and stemming. Membership is tested against the
    # prebuilt `stop` set; calling stopwords.words('english') here would
    # rebuild the list for every single word.
    return [snow.stem(word) for word in text.split() if word not in stop]

In [None]:
# Run preprocessing_words over every e-mail body, showing a live progress counter.
n_emails = len(df['email'])
text_list = []
counter = 0
for counter, sentence in enumerate(df['email'], start=1):
    text_list.append(preprocessing_words(sentence))
    print('\r{}/{}'.format(counter, n_emails), end='')

In [None]:
# Glue each token list back into one string (every token preceded by a space),
# showing a live progress counter.
n_rows = len(text_list)
email_process = []
counter = 0
for counter, row in enumerate(text_list, start=1):
    email_process.append(''.join(' ' + token for token in row))
    print('\r{}/{}'.format(counter, n_rows), end='')

In [None]:
#Tokenization
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def email_tokenization(data,features=500):
    """Build a bag-of-words count table from an iterable of text documents.

    Parameters
    ----------
    data : iterable of str
        Documents to vectorise.
    features : int, default 500
        Passed to ``CountVectorizer(max_features=...)``.

    Returns
    -------
    tuple
        ``(tokens, word)``: ``tokens`` is a DataFrame of per-document counts
        with one column per feature name; ``word`` is the fitted
        vectorizer's ``vocabulary_`` mapping.
    """
    vectorizer = CountVectorizer(max_features=features)
    # Densify the fitted count matrix so it can back a regular DataFrame.
    dense_counts = vectorizer.fit_transform(data).toarray()
    tokens = pd.DataFrame(data=dense_counts,
                          columns=vectorizer.get_feature_names_out())
    return (tokens, vectorizer.vocabulary_)

In [None]:
# Vectorise the cleaned e-mails with max_features=2000; `word` is the vectorizer's vocabulary_ mapping.
tokens, word = email_tokenization(email_process, features=2000)
print(word)

In [None]:
# (rows, columns) of the token-count table.
print(tokens.shape)

In [None]:
# Preview the first rows of the token-count table.
tokens.head()

In [None]:
#export process messages and labels
from sklearn.preprocessing import LabelBinarizer

In [None]:
# Fit a binarizer on the label column and encode it; classes_ shows the label order.
lb = LabelBinarizer().fit(df['label'])
y = lb.transform(df['label'])
print(lb.classes_)

In [None]:
# Count the positive labels with a full reduction. The built-in sum() over the
# binarizer's 2-D output yields a 1-element array, and int() on that relies on
# a deprecated NumPy array-to-scalar conversion.
# NOTE(review): assumes the positive (1) class is spam - confirm against lb.classes_.
n_spam = int(np.sum(y))
n_total = len(y)
print('Spam/Total')
print('{}/{} '.format(n_spam, n_total))
print('Spam proportion = {:0.2f}'.format(n_spam / n_total))

In [None]:
# Build the export table as a NEW frame so `tokens` itself is not mutated.
# A vocabulary token that happens to be called 'label' is dropped first:
# otherwise the assignment would overwrite that column in place, the label
# would not be the last column, and a later "all but last column" feature
# slice would include the target.
export_data = (
    tokens
    .drop(columns='label', errors='ignore')
    .assign(label=np.ravel(y))  # flatten to 1-D for the column
)
export_data.head()

In [None]:
# Persist the labelled token-count table, without the index column.
export_data.to_csv('token_mails_2000f_labeled.csv', index=False)

In [None]:
#Model Train
# NOTE: rebinds `df` (until now the raw e-mail frame) to the labelled token-count table read back from CSV.
df = pd.read_csv('./token_mails_2000f_labeled.csv')

In [None]:
# Features: every column except the last (assumed to be 'label' - TODO confirm).
X = df.iloc[:, :-1]

In [None]:
# Target: the 'label' column (rebinds `y`, previously the binarizer output).
y = df['label']

In [None]:
# Sanity check: feature matrix and target shapes.
print(X.shape)
print(y.shape)

In [None]:
#Dataset split
from sklearn.model_selection import train_test_split

In [None]:
#test size is 20%
# Fixed seed so the split, and every score derived from it, is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
# Row counts of the full set and of each split.
pd.DataFrame({'Size': [len(X), len(X_train), len(X_test)]},
             index=['Total', 'Train', 'Test'])

In [None]:
# Positive-label count and share within the training split.
n_spam_train = int(sum(y_train))
n_train = len(y_train)
print('Spam/Total')
print('{}/{} '.format(n_spam_train, n_train))
print('Spam proportion = {:0.2f}'.format(n_spam_train / n_train))

In [None]:
#Model selection
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes classifier constructed with no arguments.
gnb = GaussianNB()

In [None]:
#Cross Validation
#Learning Curve
from sklearn.model_selection import learning_curve


In [None]:
# Absolute training-set sizes for the learning curve: fractions from
# np.arange(0.1, 0.85, 0.05) of the training split, truncated to integers.
train_sizes_fraction = np.arange(0.1,0.85,0.05)
train_sizes = (train_sizes_fraction * len(y_train)).astype(int)
print('Train sizes', train_sizes, sep='\n')

In [None]:
# 5-fold cross-validated learning curve for GNB, scored with F1.
train_sizes, train_scores, valid_scores = learning_curve(
    gnb,
    X_train,
    y_train,
    train_sizes=train_sizes.astype(int),
    cv=5,
    scoring='f1',
)

In [None]:
# Fold-averaged train / CV F1 per training size, best CV score first.
cv_results = (
    pd.DataFrame(
        [np.round(train_sizes), train_scores.mean(axis=1), valid_scores.mean(axis=1)],
        index=['Training size', 'Training F1-score', 'CV F1-score'],
    )
    .T
    .sort_values(by='CV F1-score', ascending=False)
)
cv_results

In [None]:
# Learning curve: fold-averaged training vs. cross-validation F1 against training size.
plt.figure(figsize=(10,10))
plt.style.use('bmh')
for fold_scores, curve_label in ((train_scores, 'Training score'),
                                 (valid_scores, 'Cross-validation score')):
    plt.plot(train_sizes, np.mean(fold_scores, axis=1), '-o', label=curve_label)
plt.ylim((0.5,1))
plt.legend(loc=4, frameon=True)
plt.xlabel('Training size')
plt.ylabel('F1-Score')
plt.title('Gaussian Naive Bayes');

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Fit on the whole training split and report F1 on the held-out test split.
gnb.fit(X_train, y_train)
gnb_test_f1 = f1_score(y_test, gnb.predict(X_test))
print('GNB score: ', gnb_test_f1)

In [None]:
#Feature vs Performance
# For each feature count, run a 5-fold learning curve for GNB on the first
# n_features columns of X_train, plot it, and record the best mean CV F1
# together with the fit/score time of that row.
iterat = np.arange(200,2001,200)
best_models = np.zeros((len(iterat),4))

plt.figure(figsize=(20,10))
plt.style.use('bmh');

for counter, n_features in enumerate(iterat):
    #Cross-Validation for n features
    print('\r{}/{}'.format(counter+1,len(iterat)),end='')
    (train_sizes, train_scores, valid_scores,
     fit_time, score_time) = learning_curve(gnb, X_train.iloc[:, 0:n_features],
                                            y_train, train_sizes=train_sizes.astype(int),
                                            cv=5, scoring='f1', return_times=True)

    train_mean = np.mean(train_scores, axis=1)
    valid_mean = np.mean(valid_scores, axis=1)
    cv_results = pd.DataFrame([np.round(train_sizes), train_mean, valid_mean,
                               np.mean(fit_time, axis=1), np.mean(score_time, axis=1)],
                              index=['Training size','Training scores','CV scores',
                                     'Fit time','Score time']).T

    #Learning curves (2x5 grid: one panel per feature count)
    plt.subplot(2,5,counter+1)
    plt.plot(train_sizes, train_mean, '-o', label='Training score')
    plt.plot(train_sizes, valid_mean, '-o', label='Cross-validation score')
    plt.ylim((0.5,1))
    plt.legend(loc=4,frameon=True)
    plt.xlabel('Training size')
    plt.ylabel('F1-Score')
    plt.title('Features: {}'.format(n_features))

    #Extract best CV Score
    # idxmax picks exactly one row (the first on ties), so plain scalars are
    # stored. Boolean filtering yields a Series, which cannot be written into
    # a single ndarray slot when several rows tie.
    best_row = cv_results.loc[cv_results['CV scores'].idxmax()]
    best_models[counter] = [n_features, best_row['CV scores'],
                            best_row['Fit time'], best_row['Score time']]

In [None]:
# Wrap the per-feature-count results array in a labelled DataFrame (rebinds `best_models`).
best_models = pd.DataFrame(best_models,
             columns=['Features','CV F1-scores','Fit time','Score time'])
best_models

In [None]:
# Side-by-side panels: best CV F1 and fit time as a function of feature count.
plt.figure(figsize=(10,5))
for panel, (column, y_label) in enumerate((('CV F1-scores', 'CV F1-Score'),
                                           ('Fit time', 'Fit time')), start=1):
    plt.subplot(1,2,panel)
    plt.plot(best_models['Features'], best_models[column], '-o')
    plt.xlabel('Features')
    plt.ylabel(y_label)

In [None]:
#Best Model Validation
#Training size for best model
# Hold out 20% of the training split; fixed seed so the sub-split is reproducible.
X_subtrain, X_cv, y_subtrain, y_cv = train_test_split(X_train, y_train, train_size=0.8, random_state=42)

In [None]:
# Row counts of the training split and its sub-train / validation parts.
pd.DataFrame({'Size': [len(X_train), len(X_subtrain), len(X_cv)]},
             index=['Total', 'Train', 'Test'])

In [None]:
# Fit the Gaussian NB model on the sub-training split. The notebook's model
# object is `gnb`; the name `nb` is never assigned and raised NameError.
# NOTE: a later cell refits `gnb` on the full training split.
gnb.fit(X_subtrain, y_subtrain)

In [None]:
#Confusion matrix and F1 score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Class balance of the held-out test labels (positives counted with sum()).
n_test = len(y_test)
n_test_spam = sum(y_test)
print('y-test size: {}'.format(n_test))
print('Ham: {}'.format(n_test - n_test_spam))
print('Spam: {}'.format(n_test_spam))

In [None]:
# Refit GNB on the full training split before final evaluation.
gnb.fit(X_train,y_train)

In [None]:
# GNB predictions on the held-out test split.
predictions = gnb.predict(X_test)

In [None]:
# Confusion matrix on the test split, with rows/columns ordered as gnb.classes_.
cm = confusion_matrix(y_test, predictions, labels=gnb.classes_);
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=gnb.classes_);
disp.plot()
plt.show()

In [None]:
# Imported here because this is the first cell that uses accuracy/precision/
# recall; without it a fresh Restart & Run All stops with NameError.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Test-split metrics for the GNB predictions, as percentages.
print('Metrics for best model')
print('Accuracy: {:.2f}%'.format(accuracy_score(y_test,predictions)*100))
print('Precision: {:.2f}%'.format(precision_score(y_test,predictions)*100))
print('Recall: {:.2f}%'.format(recall_score(y_test,predictions)*100))
print('F1 score: {:.2f}%'.format(f1_score(y_test,predictions)*100))

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision/recall/F1.
# NOTE(review): target_names assumes the sorted label order is ham, then spam - confirm.
print(classification_report(y_test,predictions,target_names=['ham','spam']))

In [None]:
# Multilayer Perceptron Classifier
# Training is probably the issue.

#Train the data
from sklearn.neural_network import MLPClassifier

#model=MLPClassifier()
#this should be needed for the plots
model=MLPClassifier(max_iter=500)

# The split cell defines `X_train` (capital X); lowercase `x_train` is never
# assigned and raised NameError.
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test features. The split cell defines `X_test`;
# lowercase `x_test` is never assigned and raised NameError.
prediction=model.predict(X_test)
print(prediction)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test-split metrics for the first MLP fit, one line per metric.
print("MLPClassifier")
for line_template, metric_fn in (
    ("Accuracy score: {}", accuracy_score),
    ("Precision score: {}", precision_score),
    ("Recall score: {}", recall_score),
    ("F1 score: {}", f1_score),
):
    print(line_template.format(metric_fn(y_test, prediction)))

In [None]:
#Cross Validation
#Learning Curve

# Absolute training-set sizes: fractions from np.arange(0.1, 0.85, 0.05) of
# the training split, truncated to integers.
train_sizes_fraction = np.arange(0.1,0.85,0.05)
train_sizes = (train_sizes_fraction * len(y_train)).astype(int)
print('Train sizes', train_sizes, sep='\n')

In [None]:
# 5-fold cross-validated learning curve for the MLP, scored with F1.
# Uses `X_train`; lowercase `x_train` is never assigned and raised NameError.
train_sizes, train_scores, valid_scores = learning_curve(
    model,
    X_train,
    y_train,
    train_sizes=train_sizes.astype(int),
    cv=5,
    scoring='f1',
)

In [None]:
# Fold-averaged train / CV F1 per training size, best CV score first.
cv_results = (
    pd.DataFrame(
        [np.round(train_sizes), train_scores.mean(axis=1), valid_scores.mean(axis=1)],
        index=['Training size', 'Training F1-score', 'CV F1-score'],
    )
    .T
    .sort_values(by='CV F1-score', ascending=False)
)
cv_results

In [None]:
# Learning curve: fold-averaged training vs. cross-validation F1 against training size.
plt.figure(figsize=(10,10))
plt.style.use('bmh')
for fold_scores, curve_label in ((train_scores, 'Training score'),
                                 (valid_scores, 'Cross-validation score')):
    plt.plot(train_sizes, np.mean(fold_scores, axis=1), '-o', label=curve_label)
plt.ylim((0.5,1))
plt.legend(loc=4, frameon=True)
plt.xlabel('Training size')
plt.ylabel('F1-Score')
plt.title('Multilayer Perceptron Classifier');

In [None]:
# Fit on the whole training split and report F1 on the held-out test split.
# Uses `X_train` / `X_test`; the lowercase names are never assigned and
# raised NameError.
model.fit(X_train, y_train)
print('MLPClassifier score: ', f1_score(y_test, model.predict(X_test)))

In [None]:
#Feature vs Performance
# For each feature count, run a 5-fold learning curve for the MLP on the first
# n_features columns of X_train, plot it, and record the best mean CV F1
# together with the fit/score time of that row.
iterate = np.arange(200,2001,200)
best_models = np.zeros((len(iterate),4))

plt.figure(figsize=(20,10))
plt.style.use('bmh');

for counter, n_features in enumerate(iterate):
    #Cross-Validation for n features
    print('\r{}/{}'.format(counter+1,len(iterate)),end='')
    # `X_train` is the defined name; lowercase `x_train` raised NameError.
    (train_sizes, train_scores, valid_scores,
     fit_time, score_time) = learning_curve(model, X_train.iloc[:, 0:n_features],
                                            y_train, train_sizes=train_sizes.astype(int),
                                            cv=5, scoring='f1', return_times=True)

    train_mean = np.mean(train_scores, axis=1)
    valid_mean = np.mean(valid_scores, axis=1)
    cv_results = pd.DataFrame([np.round(train_sizes), train_mean, valid_mean,
                               np.mean(fit_time, axis=1), np.mean(score_time, axis=1)],
                              index=['Training size','Training scores','CV scores',
                                     'Fit time','Score time']).T

    #Learning curves (2x5 grid: one panel per feature count)
    plt.subplot(2,5,counter+1)
    plt.plot(train_sizes, train_mean, '-o', label='Training score')
    plt.plot(train_sizes, valid_mean, '-o', label='Cross-validation score')
    plt.ylim((0.5,1))
    plt.legend(loc=4,frameon=True)
    plt.xlabel('Training size')
    plt.ylabel('F1-Score')
    plt.title('Features: {}'.format(n_features))

    #Extract best CV Score
    # idxmax picks exactly one row (the first on ties), so plain scalars are
    # stored. Boolean filtering yields a Series, which cannot be written into
    # a single ndarray slot when several rows tie.
    best_row = cv_results.loc[cv_results['CV scores'].idxmax()]
    best_models[counter] = [n_features, best_row['CV scores'],
                            best_row['Fit time'], best_row['Score time']]

In [None]:
# Wrap the per-feature-count results array in a labelled DataFrame (rebinds `best_models`).
best_models = pd.DataFrame(best_models,
             columns=['Features','CV F1-scores','Fit time','Score time'])
best_models

In [None]:
# Side-by-side panels: best CV F1 and fit time as a function of feature count.
plt.figure(figsize=(10,5))
for panel, (column, y_label) in enumerate((('CV F1-scores', 'CV F1-Score'),
                                           ('Fit time', 'Fit time')), start=1):
    plt.subplot(1,2,panel)
    plt.plot(best_models['Features'], best_models[column], '-o')
    plt.xlabel('Features')
    plt.ylabel(y_label)

In [None]:
#Best Model Validation
#Training size for best model
# Split the defined `X_train` (lowercase `x_train` is never assigned and
# raised NameError); fixed seed so the sub-split is reproducible.
x_subtrain, x_cv, y_subtrain, y_cv = train_test_split(X_train, y_train, train_size=0.8, random_state=42)

In [None]:
# Row counts of the training split and its sub-train / validation parts.
# `X_train` is the defined name; lowercase `x_train` raised NameError.
pd.DataFrame([len(X_train),len(x_subtrain),len(x_cv)],
             index=['Total','Train','Test'],
             columns=['Size'])

In [None]:
# Fit the MLP on the sub-training split. NOTE: a later cell refits `model` on the full training split.
model.fit(x_subtrain, y_subtrain)

In [None]:
#Confusion matrix and F1 score
print('y-test size: {}'.format(len(y_test)))
print('Ham: {}'.format(len(y_test)-sum(y_test)))
print('Spam: {}'.format(sum(y_test)))

In [None]:
# Refit the MLP on the full training split before final evaluation.
# `X_train` is the defined name; lowercase `x_train` raised NameError.
model.fit(X_train,y_train)

In [None]:
# MLP predictions on the held-out test split.
# `X_test` is the defined name; lowercase `x_test` raised NameError.
predictions = model.predict(X_test)

In [None]:
# Confusion matrix on the test split, with rows/columns ordered as model.classes_.
cm = confusion_matrix(y_test, predictions, labels=model.classes_);
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_);
disp.plot()
plt.show()

In [None]:
# Test-split metrics for the MLP predictions, as percentages.
print('Metrics for best model')
for line_template, metric_fn in (
    ('Accuracy: {:.2f}%', accuracy_score),
    ('Precision: {:.2f}%', precision_score),
    ('Recall: {:.2f}%', recall_score),
    ('F1 score: {:.2f}%', f1_score),
):
    print(line_template.format(metric_fn(y_test, predictions) * 100))

In [None]:
# Per-class precision/recall/F1 for the MLP.
# NOTE(review): target_names assumes the sorted label order is ham, then spam - confirm.
print(classification_report(y_test,predictions,target_names=['ham','spam']))
