In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [12]:
# Step 1: Load the dataset
emails_data = pd.read_csv("hyperparameter.csv.csv")

In [13]:
emails_data.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [14]:
emails_data.tail()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
5167,Email 5168,2,2,2,3,0,0,32,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,35,27,11,2,6,5,151,4,3,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,0,0,1,1,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,2,7,1,0,2,1,28,2,0,...,0,0,0,0,0,0,0,1,0,1
5171,Email 5172,22,24,5,1,6,5,148,8,2,...,0,0,0,0,0,0,0,0,0,0


In [17]:
emails_data.columns

Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)

In [18]:
emails_data.shape

(5172, 3002)

In [16]:
# Step 1: Data Preprocessing (TF-IDF vectorization for text data)
X = emails_data['text']  # Features (email content)
y = emails_data['category']  # Labels (spam/ham or categories)

In [19]:
# Splitting data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Step 2: Define a pipeline for text classification
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Text data vectorization
    ('clf', RandomForestClassifier())  # Initial classifier (Random Forest)
])

In [21]:
# Step 3: Hyperparameter Tuning using Grid Search
param_grid = {
    'tfidf__max_df': [0.9, 0.95, 1.0],  # TF-IDF hyperparameters (max document frequency)
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigram and Bigram
    'clf__n_estimators': [100, 200],  # Number of trees in Random Forest
    'clf__max_depth': [10, 20, None]  # Maximum depth of trees
}

In [23]:
#for ignoring all type of warnings
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

In [25]:
# Ensure all entries in X_train are strings
X_train = X_train.astype(str)
X_test = X_test.astype(str)


In [26]:
# Apply Grid Search with 3-fold cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)


import pandas as pd  # Importing pandas for data manipulation
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV  # Importing necessary functions for splitting and tuning
from sklearn.feature_extraction.text import TfidfVectorizer  # Importing TF-IDF vectorizer for text processing
from sklearn.ensemble import RandomForestClassifier  # Importing Random Forest classifier
from sklearn.pipeline import Pipeline  # Importing Pipeline for creating a model workflow
from sklearn.metrics import classification_report  # Importing classification report for evaluation

# Load dataset
file_path = '/mnt/data/hyperparameter.csv'  # Path to the dataset
emails_data = pd.read_csv(file_path)  # Reading the dataset into a pandas DataFrame

# Step 1: Data Preprocessing
# Check if the necessary columns exist
if 'text' not in emails_data.columns or 'category' not in emails_data.columns:
    raise ValueError("The dataset must contain 'text' and 'category' columns.")

# Convert the text column to string to avoid type errors
emails_data['text'] = emails_data['text'].astype(str)

# Check for null values
if emails_data['text'].isnull().any() or emails_data['category'].isnull().any():
    raise ValueError("The dataset contains null values. Please clean the data before proceeding.")

# Remove empty or whitespace-only entries
emails_data = emails_data[emails_data['text'].str.strip() != '']

# Ensure there is valid data to process
if emails_data.empty:
    raise ValueError("The dataset contains no valid documents after cleaning.")

# Display the first few rows to understand the data structure
print(emails_data.head())

X = emails_data['text']  # Features (email content)
y = emails_data['category']  # Labels (spam/ham or categories)

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80-20 split

# Step 2: Define a pipeline for text classification
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', min_df=1)),  # Step for text vectorization
    ('clf', RandomForestClassifier(random_state=42))  # Step for classification using Random Forest
])

# Step 3: Hyperparameter Tuning using Grid Search
param_grid = {
    'tfidf__max_df': [0.9, 0.95, 1.0],  # TF-IDF hyperparameters
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Testing unigrams and bigrams
    'clf__n_estimators': [100, 200],  # Number of trees in Random Forest
    'clf__max_depth': [10, 20, None]  # Max depth of trees
}

# Apply Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)  # Setting up grid search
grid_search.fit(X_train, y_train)  # Fitting the grid search model

# Best hyperparameters
print("Best Parameters from Grid Search:")
print(grid_search.best_params_)  # Display the best parameters found

# Step 4: Hyperparameter Tuning using Random Search
param_dist = {
    'tfidf__max_df': [0.8, 0.85, 0.9, 1.0],  # Range of values for TF-IDF
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams and bigrams
    'clf__n_estimators': [50, 100, 200, 300],  # Number of trees
    'clf__max_depth': [10, 20, 30, None],  # Max depth of trees
    'clf__min_samples_split': [2, 5, 10]  # Minimum samples required to split an internal node
}

# Apply Random Search
random_search = RandomizedSearchCV(pipeline, param_dist, n_iter=10, cv=3, n_jobs=-1, verbose=2, random_state=42)  # Setting up random search
random_search.fit(X_train, y_train)  # Fitting the random search model

# Best hyperparameters
print("Best Parameters from Random Search:")
print(random_search.best_params_)  # Display the best parameters found from random search

# Step 5: Model Evaluation
# Evaluate Grid Search Model
y_pred_grid = grid_search.predict(X_test)  # Predictions from the grid search model
print("Classification Report for Grid Search Tuned Model:")
print(classification_report(y_test, y_pred_grid))  # Display the classification report

# Evaluate Random Search Model
y_pred_random = random_search.predict(X_test)  # Predictions from the random search model
print("Classification Report for Random Search Tuned Model:")
print(classification_report(y_test, y_pred_random))  # Display the classification report


Fitting 3 folds for each of 36 candidates, totalling 108 fits


ValueError: 
All the 108 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Mani\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Mani\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Mani\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Mani\anaconda3\Lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Mani\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Mani\anaconda3\Lib\site-packages\sklearn\feature_extraction\text.py", line 2133, in fit_transform
    X = super().fit_transform(raw_documents)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Mani\anaconda3\Lib\site-packages\sklearn\feature_extraction\text.py", line 1388, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Mani\anaconda3\Lib\site-packages\sklearn\feature_extraction\text.py", line 1294, in _count_vocab
    raise ValueError(
ValueError: empty vocabulary; perhaps the documents only contain stop words


In [None]:
# Step 3: Hyperparameter Tuning using Grid Search
param_grid = {
    'tfidf__max_df': [0.9, 0.95, 1.0],  # TF-IDF hyperparameters
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Testing unigrams and bigrams
    'clf__n_estimators': [100, 200],  # Number of trees in Random Forest
    'clf__max_depth': [10, 20, None]  # Max depth of trees
}


In [None]:
# Apply Grid Search
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=2)  # Setting up grid search
grid_search.fit(X_train, y_train)  # Fitting the grid search model

In [None]:
# Best hyperparameters
print("Best Parameters from Grid Search:")
print(grid_search.best_params_)  # Display the best parameters found


In [None]:
# Apply Random Search
random_search = RandomizedSearchCV(pipeline, param_dist, n_iter=10, cv=3, n_jobs=-1, verbose=2, random_state=42)  # Setting up random search
random_search.fit(X_train, y_train)  # Fitting the random search model


In [None]:
# Step 4: Hyperparameter Tuning using Random Search
param_dist = {
    'tfidf__max_df': [0.8, 0.85, 0.9, 1.0],  # Range of values for TF-IDF
    'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams and bigrams
    'clf__n_estimators': [50, 100, 200, 300],  # Number of trees
    'clf__max_depth': [10, 20, 30, None],  # Max depth of trees
    'clf__min_samples_split': [2, 5, 10]  # Minimum samples required to split an internal node
}

In [None]:
# Best hyperparameters
print("Best Parameters from Random Search:")
print(random_search.best_params_)  # Display the best parameters found from random search

In [None]:
# Step 5: Model Evaluation
# Evaluate Grid Search Model
y_pred_grid = grid_search.predict(X_test)  # Predictions from the grid search model
print("Classification Report for Grid Search Tuned Model:")
print(classification_report(y_test, y_pred_grid))  # Display the classification report

In [None]:
# Evaluate Random Search Model
y_pred_random = random_search.predict(X_test)  # Predictions from the random search model
print("Classification Report for Random Search Tuned Model:")
print(classification_report(y_test, y_pred_random))  # Display the classification report