In [3]:
import pandas as pd
import re

def count_urls(text):
    """Return how many ``http://`` or ``https://`` prefixes occur in ``text``.

    The text is lower-cased before matching, so upper- or mixed-case schemes
    (e.g. ``HTTPS://``) are counted too.
    """
    lowered = text.lower()
    scheme_pattern = re.compile(r'http[s]?://')
    # One non-overlapping match per URL scheme prefix.
    return sum(1 for _ in scheme_pattern.finditer(lowered))

def count_exclamations(text):
    """Return the number of ``!`` characters contained in ``text``."""
    exclamation_total = text.count('!')
    return exclamation_total

def count_urgency_keywords(text):
    """Return the total number of urgency keyword/phrase occurrences in ``text``.

    Matching is case-insensitive and substring-based: every non-overlapping
    occurrence of every keyword is counted, including occurrences inside
    longer words.
    """
    haystack = text.lower()
    total = 0
    for phrase in (
        'immediately', 'urgent', 'verify', 'suspend', 'confirm',
        'update', 'password', 'alert', 'action required', 'final notice',
        'expires', 'within', 'hours', 'critical',
    ):
        total += haystack.count(phrase)
    return total

def has_financial_keywords(text):
    """Return 1 if ``text`` contains any financial keyword, otherwise 0.

    The check is a case-insensitive substring search; the literal ``$``
    character also counts as a financial keyword.
    """
    lowered = text.lower()
    for term in ('invoice', 'payment', 'transfer', 'bank', 'account',
                 'billing', 'subscription', 'charge', '$'):
        if term in lowered:
            return 1
    return 0

def has_login_request(text):
    """Return 1 if ``text`` contains a click/login call-to-action phrase, else 0.

    The phrases are matched as case-insensitive substrings.
    """
    call_to_action_phrases = (
        'click here', 'click the link', 'click to', 'log in',
        'sign in', 'login', 'click below',
    )
    lowered = text.lower()
    found = any(phrase in lowered for phrase in call_to_action_phrases)
    return int(found)

def extract_features(text):
    """Build the feature dictionary for a single email body.

    Returns a dict with these keys, in this order: ``char_count``,
    ``word_count`` (whitespace-separated tokens), ``url_count``,
    ``exclamation_count``, ``urgency_keyword_count``,
    ``has_financial_keywords`` and ``has_login_request``.
    """
    return dict(
        char_count=len(text),
        word_count=len(text.split()),
        url_count=count_urls(text),
        exclamation_count=count_exclamations(text),
        urgency_keyword_count=count_urgency_keywords(text),
        has_financial_keywords=has_financial_keywords(text),
        has_login_request=has_login_request(text),
    )

def process_dataset(input_file, output_file):
    """Extract per-email features from a CSV file and save them to another CSV.

    Reads ``input_file`` with ``pd.read_csv``, runs ``extract_features`` on
    every value of the ``text`` column, attaches the ``id`` and ``label``
    columns from the input, writes the result to ``output_file`` (without the
    index) and prints a short summary.

    Args:
        input_file: CSV source accepted by ``pd.read_csv``; it must provide
            ``id``, ``text`` and ``label`` columns.
        output_file: Destination path for the feature CSV.

    Returns:
        The feature DataFrame, with ``id`` first, the feature columns next and
        ``label`` last.

    Raises:
        KeyError: If a required column is missing from the input.
    """
    df = pd.read_csv(input_file)

    # read_csv parses an empty cell as NaN (a float), which would make the
    # string-based extractors raise. Treat a missing body as an empty email.
    texts = df['text'].fillna('').astype(str)

    # Build every feature record in a single pass over the text column instead
    # of iterating whole rows with iterrows().
    features_df = pd.DataFrame([extract_features(text) for text in texts])

    # to_numpy() attaches the columns positionally, so the result does not
    # depend on how the two frames' indexes happen to line up.
    features_df['id'] = df['id'].to_numpy()
    features_df['label'] = df['label'].to_numpy()

    cols = ['id', 'char_count', 'word_count', 'url_count', 'exclamation_count',
            'urgency_keyword_count', 'has_financial_keywords', 'has_login_request', 'label']
    features_df = features_df[cols]

    features_df.to_csv(output_file, index=False)
    print(f"Features extracted and saved to {output_file}")
    print(f"Total emails processed: {len(features_df)}")
    print("\nFeature summary:")
    print(features_df.describe())

    return features_df

# Entry point for the feature-extraction step: reads email_dataset.csv and
# writes email_features.csv. The captured output below this cell shows that
# the guard is satisfied when the cell is executed.
if __name__ == "__main__":
    process_dataset('email_dataset.csv', 'email_features.csv')

Features extracted and saved to email_features.csv
Total emails processed: 24

Feature summary:
              id  char_count  word_count  url_count  exclamation_count  \
count  24.000000   24.000000   24.000000  24.000000               24.0   
mean   12.500000  130.625000   19.416667   0.500000                0.0   
std     7.071068   18.728756    3.786896   0.510754                0.0   
min     1.000000  101.000000   14.000000   0.000000                0.0   
25%     6.750000  115.000000   16.000000   0.000000                0.0   
50%    12.500000  131.500000   19.500000   0.500000                0.0   
75%    18.250000  141.750000   21.250000   1.000000                0.0   
max    24.000000  178.000000   28.000000   1.000000                0.0   

       urgency_keyword_count  has_financial_keywords  has_login_request  \
count              24.000000               24.000000          24.000000   
mean                1.958333                0.458333           0.250000   
std         

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
# Suppress all warnings from this point on (no category or module filter is
# given, so the 'ignore' action applies to every warning).
# NOTE(review): this also hides any warnings emitted by the imported libraries
# while fitting or evaluating the model — consider narrowing the filter.
warnings.filterwarnings('ignore')

def train_and_evaluate():
    """Train a logistic-regression classifier on the extracted email features.

    Reads ``email_features.csv``, makes a stratified 70/30 train/test split
    (``random_state=42``), fits ``LogisticRegression``, prints the accuracy,
    confusion matrix, classification report and coefficient table, and writes
    the per-email test predictions to ``test_predictions.csv``.

    Returns:
        A tuple ``(clf, accuracy, feature_importance)``: the fitted classifier,
        the test-set accuracy, and a DataFrame of features with their
        coefficients sorted in descending order.
    """
    df = pd.read_csv('email_features.csv')

    feature_columns = ['char_count', 'word_count', 'url_count', 'exclamation_count',
                       'urgency_keyword_count', 'has_financial_keywords', 'has_login_request']
    X = df[feature_columns]
    y = df['label']

    print("Dataset Info:")
    print(f"Total samples: {len(df)}")
    # Vectorised counts instead of the builtin sum() iterating the Series.
    print(f"Phishing emails: {int((y == 1).sum())}")
    print(f"Benign emails: {int((y == 0).sum())}")
    print()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print(f"Training set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")
    print()

    clf = LogisticRegression(random_state=42, max_iter=1000)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print("="*50)
    print("RESULTS")
    print("="*50)
    print(f"\nAccuracy: {accuracy:.2%}")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Benign', 'Phishing']))

    # The model is fitted on the raw, unscaled feature columns, so coefficient
    # magnitudes are expressed per unit of each feature and are not directly
    # comparable between features with different scales.
    print("\nFeature Importance (Coefficients):")
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'coefficient': clf.coef_[0]
    }).sort_values('coefficient', ascending=False)
    print(feature_importance)

    # X_test.index holds index *labels*, so the ids must be looked up with the
    # label-based .loc. The positional .iloc is only equivalent while df has a
    # default RangeIndex.
    test_ids = df.loc[X_test.index, 'id'].to_numpy()
    actual = y_test.to_numpy()
    test_results = pd.DataFrame({
        'id': test_ids,
        'actual': actual,
        'predicted': y_pred,
        'correct': actual == y_pred
    })
    test_results.to_csv('test_predictions.csv', index=False)
    print("\nTest predictions saved to test_predictions.csv")

    return clf, accuracy, feature_importance

# Entry point for the training step: keeps the fitted classifier, the test
# accuracy and the coefficient table as top-level names.
if __name__ == "__main__":
    clf, accuracy, feature_importance = train_and_evaluate()

Dataset Info:
Total samples: 24
Phishing emails: 12
Benign emails: 12

Training set: 16 samples
Test set: 8 samples

RESULTS

Accuracy: 100.00%

Confusion Matrix:
[[4 0]
 [0 4]]

Classification Report:
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00         4
    Phishing       1.00      1.00      1.00         4

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8


Feature Importance (Coefficients):
                  feature  coefficient
4   urgency_keyword_count     0.380464
0              char_count     0.226451
2               url_count     0.162721
5  has_financial_keywords     0.156861
6       has_login_request     0.075206
3       exclamation_count     0.000000
1              word_count    -0.883157

Test predictions saved to test_predictions.csv


In [5]:
import pandas as pd
import matplotlib.pyplot as plt

def create_visualizations():
    """Render and save the feature-comparison and confusion-matrix figures.

    Reads ``email_features.csv`` and draws a 2x2 grid of bar charts comparing
    phishing (``label == 1``) with benign (``label == 0``) emails, saved as
    ``feature_comparison.png``. Then reads ``test_predictions.csv`` and saves
    a confusion-matrix heatmap as ``confusion_matrix.png``. Returns nothing.
    """
    features_df = pd.read_csv('email_features.csv')

    fig, axes = plt.subplots(2, 2, figsize=(12, 10))

    phishing = features_df[features_df['label'] == 1]
    benign = features_df[features_df['label'] == 0]

    # (axis, feature column, show as percentage?, title, y-axis label)
    panels = (
        (axes[0, 0], 'url_count', False, 'Average URL Count', 'Count'),
        (axes[0, 1], 'urgency_keyword_count', False, 'Average Urgency Keywords', 'Count'),
        (axes[1, 0], 'has_financial_keywords', True, 'Emails with Financial Keywords', 'Percentage'),
        (axes[1, 1], 'has_login_request', True, 'Emails with Login Requests', 'Percentage'),
    )
    for ax, column, as_percentage, title, ylabel in panels:
        heights = [phishing[column].mean(), benign[column].mean()]
        if as_percentage:
            # Binary flags: the mean is the share of emails with the flag set.
            heights = [value * 100 for value in heights]
        ax.bar(['Phishing', 'Benign'], heights, color=['red', 'green'])
        ax.set_title(title)
        ax.set_ylabel(ylabel)

    plt.tight_layout()
    plt.savefig('feature_comparison.png', dpi=300, bbox_inches='tight')
    print("Visualization saved to feature_comparison.png")
    plt.close()

    predictions_df = pd.read_csv('test_predictions.csv')
    from sklearn.metrics import confusion_matrix
    import seaborn as sns

    class_labels = ['Benign', 'Phishing']
    matrix = confusion_matrix(predictions_df['actual'], predictions_df['predicted'])
    plt.figure(figsize=(6, 5))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels,
                yticklabels=class_labels)
    plt.title('Confusion Matrix')
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    print("Confusion matrix saved to confusion_matrix.png")
    plt.close()

# Entry point for the plotting step: saves feature_comparison.png and
# confusion_matrix.png.
if __name__ == "__main__":
    create_visualizations()

Visualization saved to feature_comparison.png
Confusion matrix saved to confusion_matrix.png
