In [56]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, classification_report

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Banu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
data = pd.read_csv('fraud_email_.csv')
texts = data['Text']
labels = data['Class'] 

In [60]:
# Updated Text Preprocessing Function to handle non-string inputs
def preprocess(text):
    if isinstance(text, str):
        # Lowercasing, removing non-alphabetic chars, and tokenizing
        text = text.lower()
        text = re.sub(r'\W', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.split()
    else:
        return []  # Return an empty list for non-string inputs

# Apply preprocessing and remove stopwords
preprocessed_texts = texts.apply(lambda x: [word for word in preprocess(x) if word not in stop_words])

# Join the tokens back into text
processed_texts = preprocessed_texts.apply(lambda x: ' '.join(x))

In [62]:
processed_texts

0        supply quality china exclusive dimensions unbe...
1                                          sidlet know thx
2        dear friend greetings wish accost request woul...
3        mr cheung puihang seng bank ltd des voeux rd b...
4                            surprising assessment embassy
                               ...                        
11924              travel well look forward hearing report
11925    dear friend wish begin way introduction willia...
11926    follow flag follow upflag status flaggedmore info
11927    sbwhoeop b6saturday january 23 2010 4 09 pmre ...
11928    fyi revising call sheet call karzai potentiall...
Name: Text, Length: 11929, dtype: object

In [64]:
vectorizer = TfidfVectorizer(max_df=0.7, min_df=5, ngram_range=(1, 2), max_features=10000)  # Adjust n-grams and features
X = vectorizer.fit_transform(processed_texts)

In [65]:
X

<11929x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 1098395 stored elements in Compressed Sparse Row format>

In [68]:
# Feature Selection using Chi-Square
selector = SelectKBest(chi2, k=5000)  # Select top 5000 features
X_selected = selector.fit_transform(X, labels)

In [70]:
X_selected

<11929x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 807235 stored elements in Compressed Sparse Row format>

In [72]:
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_selected, labels)

In [74]:
X_resampled

<13484x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 1111905 stored elements in Compressed Sparse Row format>

In [76]:
y_resampled

0        1
1        0
2        1
3        1
4        0
        ..
13479    1
13480    1
13481    1
13482    1
13483    1
Name: Class, Length: 13484, dtype: int64

In [78]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [80]:
# Logistic Regression Classifier with regularization
classifier = LogisticRegression(max_iter=1000, class_weight='balanced')

In [82]:
# Hyperparameter Tuning using GridSearchCV
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization strength
    'penalty': ['l2'],  # L2 regularization
    'solver': ['lbfgs', 'liblinear']
}

grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best hyperparameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}


In [84]:
# Train the optimized model with the best parameters
best_classifier = grid_search.best_estimator_

# Evaluate the model
y_pred = best_classifier.predict(X_test)

In [86]:
# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.9862810530218762
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1338
           1       0.99      0.98      0.99      1359

    accuracy                           0.99      2697
   macro avg       0.99      0.99      0.99      2697
weighted avg       0.99      0.99      0.99      2697



In [88]:
# Cross-validation score to ensure robustness
cross_val_scores = cross_val_score(best_classifier, X_resampled, y_resampled, cv=5)
print(f"Cross-Validation Scores: {cross_val_scores}")
print(f"Mean Cross-Validation Accuracy: {np.mean(cross_val_scores)}")

Cross-Validation Scores: [0.98924731 0.98850575 0.98961809 0.98331479 0.96179525]
Mean Cross-Validation Accuracy: 0.982496239914885


In [90]:
# Function to predict fraud likelihood on custom input text
def predict_fraud(text, model, vectorizer, selector):
    # Preprocess the input text
    preprocessed_text = ' '.join([word for word in preprocess(text) if word not in stop_words])
    
    # Vectorize the input text
    X_input = vectorizer.transform([preprocessed_text])
    
    # Select important features
    X_input_selected = selector.transform(X_input)
    
    # Predict fraud likelihood (probability)
    fraud_proba = model.predict_proba(X_input_selected)[0][1]  # Probability for fraud class (1)
    
    # Return fraud likelihood as percentage
    return fraud_proba * 100

In [92]:
input_text = """I hope you are well when you get this email. My name is [Name], and I'm the son of the late [Famous Person]. I need your help with a secret problem right away.
I received $30,000,000 (Thirty Million US Dollars), but I can't get to it because of political unrest. I've been told to find a trustworthy foreign partner who can help me move the money.
I'm willing to pay you 40% of the total amount for your help, but I need your bank information to move forward.
Please answer quickly, because time is running out.
All the best,
This case shows that Nigerian email scams often involve a sense of urgency, a lot of money, problems with the government or the law, and a request for personal information.
If you get an email like this, do not reply or give out any personal information. Instead, tell your email service or the right people in your country about it.
If you want to learn more about online security and how to keep yourself safe from scams, there are many ways to do so.
Be careful online.
Advit Sahdev"""

In [94]:
fraud_likelihood = predict_fraud(input_text, best_classifier, vectorizer, selector)
print(f"Fraud Likelihood: {fraud_likelihood:.2f}%")

Fraud Likelihood: 99.59%
