In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier


In [9]:
# Read the spam dataset from its CSV file (path is relative to the notebook's
# working directory) into a pandas DataFrame.
data_path = 'spam-data.csv'
spam_data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the top five rows as the cell's rich output.
spam_data.head(n=5)


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [10]:
# Split the frame into the label column (y) and everything else (X).
y = spam_data.loc[:, 'spam']          # Labels (y)
X = spam_data.drop(columns='spam')    # Features (X)

# Preview both pieces; each heading and its value go on separate lines.
print("Features (X):", X.head(), sep="\n")

# The labels are shown as a plain Python list to keep the output compact.
print("\nLabels (y):", y.head().tolist(), sep="\n")


Features (X):
   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  word_freq_conference  char_freq_;  \
0             0.00            0.00  ...         

In [11]:
# Check the balance of the labels without showing 'dtype'.
#
# value_counts() orders its result by descending frequency, NOT by label value,
# so the positional column names assigned below would be silently swapped if
# spam (1) ever outnumbered legitimate mail (0). Reindexing on the explicit
# class labels pins the order to [0, 1]; fill_value=0 reports a class that is
# absent from y as a zero count instead of failing on the rename.
# NOTE: this assumes y is a binary 0/1 label; any other label value would be
# left out of the table.
label_balance = (
    y.value_counts()
    .reindex([0, 1], fill_value=0)
    .to_frame()
    .T
)
label_balance.columns = ['Legitimate (0)', 'Spam (1)']

# Display the balance
label_balance


Unnamed: 0,Legitimate (0),Spam (1)
count,2788,1813


In [12]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Show the shape of each of the four pieces as a tuple of shapes.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((3220, 57), (1381, 57), (3220,), (1381,))

In [14]:
# Step 4: class counts as a plain {label: count} dictionary for compact printing.
label_balance = y.value_counts().to_dict()

# Step 5: 70/30 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

# Report the class balance (heading and dictionary on separate lines).
print("Label Balance:", label_balance, sep="\n")

# Report the shape of each piece of the split.
print("\nTraining Set Shape (X_train):", X_train.shape)
print("Test Set Shape (X_test):", X_test.shape)
print("Training Labels Shape (y_train):", y_train.shape)
print("Test Labels Shape (y_test):", y_test.shape)


Label Balance:
{0: 2788, 1: 1813}

Training Set Shape (X_train): (3220, 57)
Test Set Shape (X_test): (1381, 57)
Training Labels Shape (y_train): (3220,)
Test Labels Shape (y_test): (1381,)


In [16]:
# Learn the per-feature mean and standard deviation from the training data only,
# so that no information from the test set leaks into the scaling parameters.
scaler = StandardScaler().fit(X_train)

# Apply the training-set statistics to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Preview the first five scaled rows of each split.
print("Scaled Training Features (X_train_scaled):")
print(X_train_scaled[:5])

print("\nScaled Testing Features (X_test_scaled):")
print(X_test_scaled[:5])


Scaled Training Features (X_train_scaled):
[[-3.35858000e-01 -1.60384855e-01 -5.50299633e-01 -4.67575234e-02
  -4.48094248e-01 -3.44230997e-01  2.95069107e+00 -2.52313126e-01
  -3.26802421e-01  1.52695038e+00 -2.90920250e-01  7.73997149e-01
   3.99424956e+00 -1.75098692e-01 -1.88096271e-01  1.16778850e+00
  -3.13085657e-01 -3.45040635e-01  1.80338613e+00 -1.52399275e-01
  -6.55523923e-01 -1.13809069e-01 -2.83244283e-01 -1.99899282e-01
  -3.23300767e-01 -3.01262910e-01 -2.31258465e-01 -2.31185233e-01
  -1.73051236e-01 -2.19800513e-01 -1.72444267e-01 -1.41693225e-01
  -1.92112932e-01 -1.45042941e-01 -1.89138207e-01 -2.42503960e-01
  -3.30969458e-01 -5.54226110e-02 -1.78282901e-01 -1.81173966e-01
  -1.20697297e-01 -1.73489439e-01 -2.01299156e-01 -1.29063362e-01
  -2.99499865e-01 -2.11775675e-01 -7.00914584e-02 -1.15629137e-01
  -1.59690752e-01 -4.82174140e-01 -1.72383315e-01 -2.98221938e-01
  -3.40470319e-01 -1.22000826e-01 -1.06916416e-01 -1.75469981e-01
  -3.83831032e-01]
 [-3.35858000e

In [19]:
# Build a Logistic Regression classifier (random_state=1) and train it on the
# scaled training features; fit() returns the estimator itself.
log_reg_model = LogisticRegression(random_state=1).fit(X_train_scaled, y_train)

# Predict the labels of the scaled test features.
y_pred_log_reg = log_reg_model.predict(X_test_scaled)

# Score the predictions: fraction of test rows whose predicted label is correct.
log_reg_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_log_reg)

# Report the test-set accuracy.
print("Logistic Regression Model Accuracy:", log_reg_accuracy)


Logistic Regression Model Accuracy: 0.9232440260680667


In [21]:

# Build a Random Forest classifier (random_state=1) and train it on the scaled
# training features; fit() returns the estimator itself.
rf_model = RandomForestClassifier(random_state=1).fit(X_train_scaled, y_train)

# Predict the labels of the scaled test features.
y_pred_rf = rf_model.predict(X_test_scaled)

# Score the predictions: fraction of test rows whose predicted label is correct.
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

# Report the test-set accuracy.
print("Random Forest Model Accuracy:", rf_accuracy)


Random Forest Model Accuracy: 0.9558291093410572


## Evaluate the Models

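The Random Forest model is more accurate than the Logistic Regression model on the held-out test set. The Random Forest achieved a test accuracy of about 95.6% (0.9558), while the Logistic Regression model reached about 92.3% (0.9232), a gap of roughly 3.3 percentage points. These figures come from a single 70/30 train/test split, so the difference has not been tested for statistical significance.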