Here the goal is vectorization of the email data. To take a more NLP approach.

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Load the data
df = pd.read_csv("Sentiment_Analysis_Export_df.csv")

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fitting the vectorizer to the text
tfidf_vectorizer.fit(df["Tokenized_Text"])

# Transforming tokenized text into TF-IDF vectors
tfidf_vectors = tfidf_vectorizer.transform(df["Tokenized_Text"])

# Initialize the logistic regression model
logistic_model = LogisticRegression()

# Fitting the logistic regression model, "Email Type is the target variable"
logistic_model.fit(tfidf_vectors, df["Email Type"])


# Define the selected configurations
selected_configurations = [
    {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'},
    {'penalty': 'l1', 'C': 0.5, 'solver': 'liblinear'},
    {'penalty': 'l2', 'C': 5.0, 'solver': 'liblinear'},
    {'penalty': 'l2', 'C': 10.0, 'solver': 'newton-cg'}
]

# Create an empty list to store the results
summary = []

# Perform stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for config in selected_configurations:
    for train_index, test_index in skf.split(tfidf_vectors, df["Email Type"]):
        X_train, X_test = tfidf_vectors[train_index], tfidf_vectors[test_index]
        y_train, y_test = df["Email Type"][train_index], df["Email Type"][test_index]
        
        model = LogisticRegression(**config)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Store the results
        summary.append({
            'config': config,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

# Display the results
for result in summary:
    print("Configuration:", result['config'])
    print('Accuracy:', result['accuracy'])
    print('Precision:', result['precision'])
    print('Recall:', result['recall'])
    print('F1 Score:', result['f1'])
    print()



Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.9740403203534935
Precision: 0.9741184454994524
Recall: 0.9740403203534935
F1 Score: 0.9740641164526009

Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.9787292817679558
Precision: 0.9788754102380285
Recall: 0.9787292817679558
F1 Score: 0.9787609850534218

Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.974585635359116
Precision: 0.9748487770343509
Recall: 0.974585635359116
F1 Score: 0.9746376887955178

Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.9784530386740331
Precision: 0.9788418069145214
Recall: 0.9784530386740331
F1 Score: 0.9785101637387623

Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.9787292817679558
Precision: 0.9791009119468521
Recall: 0.9787292817679558
F1 Score: 0.978784254980693

Configuration: {'penalty': 'l1', 'C': 0.5, 'solver': 'liblinear'}
Accuracy: 0.954156310411488

In [5]:
# Perform stratified k-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for config in selected_configurations:
    for train_index, test_index in skf.split(tfidf_vectors, df["Email Type"]):
        X_train, X_test = tfidf_vectors[train_index], tfidf_vectors[test_index]
        y_train, y_test = df["Email Type"][train_index], df["Email Type"][test_index]
        
        model = LogisticRegression(**config)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Store the results
        summary.append({
            'config': config,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

# Display the results
for result in summary:
    print("Configuration:", result['config'])
    print('Accuracy:', result['accuracy'])
    print('Precision:', result['precision'])
    print('Recall:', result['recall'])
    print('F1 Score:', result['f1'])
    print()


Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.9740403203534935
Precision: 0.9741184454994524
Recall: 0.9740403203534935
F1 Score: 0.9740641164526009

Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.9787292817679558
Precision: 0.9788754102380285
Recall: 0.9787292817679558
F1 Score: 0.9787609850534218

Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.974585635359116
Precision: 0.9748487770343509
Recall: 0.974585635359116
F1 Score: 0.9746376887955178

Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.9784530386740331
Precision: 0.9788418069145214
Recall: 0.9784530386740331
F1 Score: 0.9785101637387623

Configuration: {'penalty': 'l1', 'C': 5.0, 'solver': 'liblinear'}
Accuracy: 0.9787292817679558
Precision: 0.9791009119468521
Recall: 0.9787292817679558
F1 Score: 0.978784254980693

Configuration: {'penalty': 'l1', 'C': 0.5, 'solver': 'liblinear'}
Accuracy: 0.954156310411488