# Combining the models

### Importing librairies

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import unicodedata
import string
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
import statistics
from statistics import mean

In [2]:
# Download the stopwords corpus if needed
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pip install httplib2


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
pip install uritemplate

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Load the trained models from disk

In [5]:
import joblib
phishing_model = joblib.load('phishing_model.joblib')
fake_news_model = joblib.load('fake_news_model.joblib')
spam_model_xgb = joblib.load('spam_model_xgb.joblib')
sentiment_analysis_model = joblib.load('sentiment_analysis_model.joblib')


  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



### Enter the testing email body here

In [151]:
text = "This is urgent! Someone has tried to access to your account. Please follow the link below to enter your credentials and reset your password"


### Text preprocessing

To prepare the test text for the models, we implemented preprocessing steps.

In [152]:
from bs4 import BeautifulSoup

# Define a function to preprocess the text
def preprocess_text(text):
    # Remove HTML and nbsp encoding
    soup = BeautifulSoup(text, 'html.parser')
    text = soup.get_text()

    # Remove numbers not attached to any other word
    text = re.sub(r'\b\d+\b', '', text)

    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # Tokenize the text into words
    tokens = word_tokenize(text.lower())

    # Remove stop words and punctuation
    stop_words = set(stopwords.words('english') + list(string.punctuation))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Return list of tokens
    return filtered_tokens


In [153]:
preprocessed_text = preprocess_text(text)
print(preprocessed_text)


['urgent', 'someone', 'tried', 'access', 'account', 'please', 'follow', 'link', 'enter', 'credentials', 'reset', 'password']


In [154]:
def bag_of_words_phishing_model_vocab(preprocessed_text):
    # Create a CountVectorizer object
    vectorizer = CountVectorizer(vocabulary=phishing_model.get_booster().feature_names)
    
    # Apply BoW to the preprocessed text
    bow_matrix = vectorizer.fit_transform([preprocessed_text])

    # create a DataFrame from the bag of words matrix
    bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names())

    return bow_df


In [155]:
bow_df_phish = bag_of_words_phishing_model_vocab(' '.join(preprocessed_text))
bow_df_phish

Unnamed: 0,00,000,000000,000008,00000e2511c8,00000eur,000066,00084740000484800938,000999,000m,...,â_x0080__x0094_30,â_x0080__x0094_and,â_x0080__x0094_but,â_x0080__x0094_no,â_x0080__x0094_that,â_x0080__x0094_the,â_x0082_,â_x0096_,ï2007,Topic
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [156]:
def bag_of_words_fake_news_model_vocab(preprocessed_text):
    # Create a CountVectorizer object
    vectorizer = CountVectorizer(vocabulary=fake_news_model.get_booster().feature_names)
    
    # Apply BoW to the preprocessed text
    bow_matrix = vectorizer.fit_transform([preprocessed_text])

    # create a DataFrame from the bag of words matrix
    bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names())

    return bow_df


In [157]:
bow_df_fake_news = bag_of_words_fake_news_model_vocab(' '.join(preprocessed_text))
bow_df_fake_news

Unnamed: 0,003,0040,005380ks,01,0100,02,025,029,03,033,...,zitser,zoe,zona,zone,zoo,zoomph,zucker,zuckerberg,zuker,zzzzaaaacccchhh
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [158]:
def bag_of_words_spam_model_xgb_vocab(preprocessed_text):
    # Create a CountVectorizer object
    vectorizer = CountVectorizer(vocabulary=spam_model_xgb.get_booster().feature_names)
    
    # Apply BoW to the preprocessed text
    bow_matrix = vectorizer.fit_transform([preprocessed_text])

    # create a DataFrame from the bag of words matrix
    bow_df_spam = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names())

    return bow_df_spam

In [159]:
bow_df_spam = bag_of_words_fake_news_model_vocab(' '.join(preprocessed_text))
bow_df_spam

Unnamed: 0,003,0040,005380ks,01,0100,02,025,029,03,033,...,zitser,zoe,zona,zone,zoo,zoomph,zucker,zuckerberg,zuker,zzzzaaaacccchhh
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
# Assuming you have preprocessed_text and trained_logistic_model defined
bow_df_spam = bag_of_words_spam_model_xgb_vocab(' '.join(preprocessed_text))

bow_df_spam

Unnamed: 0,07xxxxxxxxx,08700621170150p,08702840625comuk,08718726270150gbpmtmsg18,10,100,1000,10000,10am7pm,10k,...,youre,youve,yr,yummy,yun,yunny,yup,zed,zoe,üll
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Classification

## Phishing classification

Predicting the likelihood of the test email being a phishing email.

In [161]:
# Use the phishing detection loaded model to make predictions on the test text
phishing_scores = phishing_model.predict_proba(bow_df_phish)[:, 1]
phishing_score = phishing_scores[0]
print('The probability of this email being a phishing one is :',phishing_score)

The probability of this email being a phishing one is : 0.8338311


## Malicious URL detection

Estimating the probability that the test email is a phishing email.

In [162]:
import requests

# replace YOUR_API_KEY with your actual VirusTotal API key
url = 'https://www.virustotal.com/vtapi/v2/url/report'
params = {'apikey': '0b7998a0f881d9dcf72c78041fa0cc2ca11433bd5343f33845e535b2453785f4', 'resource': 'http://appleid.apple.com-app.es/'}

response = requests.get(url, params=params)

if response.status_code == 200:
    result = response.json()
    if result['response_code'] == 1:
        if result['positives'] > 0:
            # calculate the probability of how likely the URL is a phishing site
            url_score = 1 - (result['positives'] / result['total'])
            print('The URL is a phishing site with a {:.2f} probability.'.format(url_score))
        else:
            print('The URL is safe.')
    else:
        print('The URL is not in the VirusTotal database.')
else:
    print('Error:', response.status_code, response.reason)


The URL is a phishing site with a 0.97 probability.


## Spam classification

Predicting the likelihood of the test email being a spam.

In [163]:
# Use the phishing detection loaded model to make predictions on the test text
spam_scores = spam_model_xgb.predict_proba(bow_df_spam)[:, 1]
spam_score = spam_scores[0]
print('The probability of this email being a spam :',spam_score)

The probability of this email being a spam : 0.80242026


## Fake news detection

Anticipating the probability that an email includes misinformation.

In [164]:
# Use the fake news loaded model to make predictions on the test text
fake_news_scores = fake_news_model.predict_proba(bow_df_fake_news)[:, 1]
fake_news_score = fake_news_scores[0]

print('The probability of this email containing fake news is :',fake_news_score)

The probability of this email containing fake news is : 0.9966377


## Cyberbullying detection

Predicting the likelihood of an email containing cyberbullying.

In [165]:
from googleapiclient import discovery
import json

API_KEY = 'AIzaSyD4otRh6Suo--QAuHQqPgpU1wj3wRaThAs'

def get_toxicity_score(text):
    client = discovery.build(
      "commentanalyzer",
      "v1alpha1",
      developerKey=API_KEY,
      discoveryServiceUrl="https://commentanalyzer.googleapis.com/$discovery/rest?version=v1alpha1",
      static_discovery=False,
    )

    analyze_request = {
      'comment': { 'text': text },
      'requestedAttributes': {'TOXICITY': {}}
    }


    response = client.comments().analyze(body=analyze_request).execute()
    toxicity_score = response['attributeScores']['TOXICITY']['summaryScore']['value']
    return toxicity_score

toxicity_score=get_toxicity_score(text)
print('The toxicity score of this email is :', toxicity_score)

The toxicity score of this email is : 0.025203144


## Sentiment analysis 

Estimating the likelihood that the test email is a phishing attempt by analyzing only the emotions conveyed in the message through natural language processing.

In [166]:
from nrclex import NRCLex
import pandas as pd


def extract_emotions(text):
    sentiment = NRCLex(text) 
    mapping = {'anticip': 'anticipation'}
    emotions = {mapping.get(k, k): v for k, v in sentiment.affect_frequencies.items()}
    return emotions

# Extract emotions from the text
emotions = extract_emotions(text)
# Create a DataFrame from the emotions dictionary
df_emotions = pd.DataFrame([emotions])

# Convert counts to binary values
df_emotions = df_emotions.applymap(lambda x: int(x > 0))

# Create a DataFrame with the text column
df_text = pd.DataFrame({"Text": [text]})

# Concatenate the DataFrames
df = pd.concat([df_text, df_emotions], axis=1)
#df = df.drop(columns=['anticip'])

# Print the DataFrame
df

Unnamed: 0,Text,fear,anger,anticipation,trust,surprise,positive,negative,sadness,disgust,joy
0,This is urgent! Someone has tried to access to...,1,0,1,1,1,0,1,0,0,0


In [167]:
from textblob import TextBlob

# Define a function to get sentiment polarity of each text
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Apply the function to each text in the 'text' column of the dataframe
df['Sentiment'] = df['Text'].apply(get_sentiment)

# Print the updated dataframe
df

Unnamed: 0,Text,fear,anger,anticipation,trust,surprise,positive,negative,sadness,disgust,joy,Sentiment
0,This is urgent! Someone has tried to access to...,1,0,1,1,1,0,1,0,0,0,0.0


In [168]:
df = df.drop(columns=['Text'])
df

Unnamed: 0,fear,anger,anticipation,trust,surprise,positive,negative,sadness,disgust,joy,Sentiment
0,1,0,1,1,1,0,1,0,0,0,0.0


## Calculating the technical score

In [169]:
technical_scores_list=[phishing_score, fake_news_score, toxicity_score,url_score,spam_score]
technical_score = sum(technical_scores_list)/len(technical_scores_list)
print('The technical score is :',technical_score)

The technical score is : 0.7249517686061117


## Calculating the emotional score

In [170]:
# Use the fake news loaded model to make predictions on the test text
emotional_scores = sentiment_analysis_model.predict_proba(df)[:, 1]
emotional_score = emotional_scores[0]
print('The probability of this email being a phishing one based on sentiment analysis is :',emotional_score)

The probability of this email being a phishing one based on sentiment analysis is : 0.512806892904256


## Final Score

We combined both the technical and the emotional scores by calculating their mean function. If the final calculated score is higher than 0.5, then it's a phishing email. Otherwise, it's legit.

In [171]:
final_scores_list=[technical_score, emotional_score]
final_score = sum(final_scores_list)/len(final_scores_list)
print('The Final score is :',final_score)

The Final score is : 0.6188793307551839


In [172]:
if (final_score>= 0.5):
    Class=1
    print("The test text is a phishing email !");
else:
    Class=0
    print("The test text is a legitimate email !")

The test text is a phishing email !
