In [267]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import TruncatedSVD 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack

In [268]:
df = pd.read_csv("Final_customer_support_tickets.csv")
df.shape

(4981, 17)

In [269]:
df = df[['Ticket Description','Ticket Subject','Ticket Type']]

In [270]:
df.rename(columns={'Ticket Description':'description','Ticket Type':'type','Ticket Subject':'subject'}, inplace=True)
df.head()

Unnamed: 0,description,subject,type
0,I'm facing issues logging into my {product_pur...,Payment issue,Billing inquiry
1,I believe there has been a violation of my con...,Terms of service violation,Legal inquiry
2,I'm unable to access my {product_purchased} ac...,Display issue,Billing inquiry
3,I'm having an issue with the {product_purchase...,Software bug,Technical issue
4,I'm having an issue with the {product_purchase...,Display issue,Billing inquiry


In [271]:
df.isna().sum()

description    0
subject        0
type           0
dtype: int64

In [272]:
df['type'].value_counts()

type
Technical issue    1747
Billing inquiry    1634
Legal inquiry      1600
Name: count, dtype: int64

In [273]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[^a-zA-Z ]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [274]:
df['clean_description'] = df['description'].apply(clean_text)
df['clean_subject'] = df['subject'].apply(clean_text)

In [275]:
df.head()

Unnamed: 0,description,subject,type,clean_description,clean_subject
0,I'm facing issues logging into my {product_pur...,Payment issue,Billing inquiry,im facing issues logging into my productpurcha...,payment issue
1,I believe there has been a violation of my con...,Terms of service violation,Legal inquiry,i believe there has been a violation of my con...,terms of service violation
2,I'm unable to access my {product_purchased} ac...,Display issue,Billing inquiry,im unable to access my productpurchased accoun...,display issue
3,I'm having an issue with the {product_purchase...,Software bug,Technical issue,im having an issue with the productpurchased p...,software bug
4,I'm having an issue with the {product_purchase...,Display issue,Billing inquiry,im having an issue with the productpurchased p...,display issue


In [276]:
text = set(df['clean_description'].str.cat(sep=' ').split())
len(text)

4203

In [277]:
text = set(df['clean_subject'].str.cat(sep=' ').split())
len(text)

43

In [278]:
X = df[['clean_subject','clean_description']]
y = df['type']

In [279]:
# Encode target 
le = LabelEncoder() 
y = le.fit_transform(df["type"]) 

# Vectorize subject and content separately 
vec_subject = TfidfVectorizer() 
vec_description = TfidfVectorizer()

In [280]:
X_subject = vec_subject.fit_transform(df["subject"]) 
X_description = vec_description.fit_transform(df["description"]) 
# Combine features 
X = hstack([X_subject, X_description]) 
#Train/test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)

In [281]:
rf_params = { 'n_estimators': [100, 250,300, 350], 'max_depth': [None, 5,7,10, 20], 'min_samples_split': [2, 5], 'min_samples_leaf': [3, 2], 'max_features': ['sqrt', 'log2'] } 
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, scoring='accuracy', n_jobs=-1) 
rf_grid.fit(X_train, y_train) 
print("Best RF:", rf_grid.best_params_, rf_grid.best_score_)

Best RF: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100} 0.6759512745660549


In [282]:
svm_params = { 'C': [0.1,0.3,0.5,1, 10]} 
svm_grid = GridSearchCV(LinearSVC(), svm_params, cv=5, scoring='accuracy', n_jobs=-1) 
svm_grid.fit(X_train, y_train) 
print("Best SVM:", svm_grid.best_params_, svm_grid.best_score_)

Best SVM: {'C': 0.5} 0.6724415679400768


In [283]:
nb_params = { 'alpha': [0.1, 0.2,0.5,1.0] } 
nb_grid = GridSearchCV(MultinomialNB(), nb_params, cv=5, scoring='accuracy', n_jobs=-1) 
nb_grid.fit(X_train, y_train) 
print("Best NB:", nb_grid.best_params_, nb_grid.best_score_)

Best NB: {'alpha': 0.2} 0.666418037489833


In [284]:
# Pipeline: dimensionality reduction + KNN 
knn_pipeline = Pipeline([ ('svd', TruncatedSVD(n_components=300)), ('knn', KNeighborsClassifier()) ]) 

knn_params = { 'knn__n_neighbors': [3, 5, 7, 11], 'knn__weights': ['uniform', 'distance'], 'knn__metric': ['euclidean', 'manhattan', 'cosine'] } 
knn_grid = GridSearchCV(knn_pipeline, knn_params, cv=5, scoring='accuracy', n_jobs=-1) 
knn_grid.fit(X_train, y_train) 
print("Best KNN:", knn_grid.best_params_)
print("Best KNN Accuracy:", knn_grid.best_score_)

Best KNN: {'knn__metric': 'manhattan', 'knn__n_neighbors': 11, 'knn__weights': 'uniform'}
Best KNN Accuracy: 0.6681708416612547


In [289]:
results = []

models = {
    "Random Forest": rf_grid.best_estimator_,
    "SVM": svm_grid.best_estimator_,
    "Naive Bayes": nb_grid.best_estimator_,
    "KNN": knn_grid.best_estimator_
}

predictions = {
    "Random Forest": rf_grid.best_estimator_.predict(X_test),
    "SVM": svm_grid.best_estimator_.predict(X_test),
    "Naive Bayes": nb_grid.best_estimator_.predict(X_test),
    "KNN": knn_grid.best_estimator_.predict(X_test)
}

for name, pred in predictions.items():
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, pred),
        "Precision": precision_score(y_test, pred, average='macro'),
        "Recall": recall_score(y_test, pred, average='macro'),
        "F1 Score": f1_score(y_test, pred, average='macro')
    })

results_df = pd.DataFrame(results)
print(results_df.sort_values(by="Accuracy", ascending=False))

           Model  Accuracy  Precision    Recall  F1 Score
0  Random Forest  0.685055   0.697442  0.683067  0.645064
3            KNN  0.672016   0.676944  0.676849  0.676498
2    Naive Bayes  0.662989   0.666314  0.666335  0.662482
1            SVM  0.660983   0.665336  0.665368  0.664062


In [290]:
import pickle

best_model_name = results_df.sort_values(by="Accuracy", ascending=False).iloc[0]["Model"]

best_model = models[best_model_name]

with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print(f"Saved best model: {best_model_name}")


Saved best model: Random Forest


In [291]:
# Save vectorizers 
with open("subject_vectorizer.pkl", "wb") as f: 
    pickle.dump(vec_subject, f) 
with open("content_vectorizer.pkl", "wb") as f: 
    pickle.dump(vec_description, f)

In [None]:
import pickle
import pandas as pd
from scipy.sparse import hstack


def predict_email_category(subject, content):
    """
    Takes subject and content as input strings
    Returns predicted class label
    """
    
    # Create dataframe (same format as training)
    new_data = pd.DataFrame({
        "subject": [subject],
        "content": [content]
    })
    
    # Apply cleaning
    new_data['clean_subject'] = new_data['subject'].apply(clean_text)
    new_data['description'] = new_data['content'].apply(clean_text)

    # --- Load the saved model ---
    with open("best_model.pkl", "rb") as f:
        model = pickle.load(f)

    # --- Load the vectorizers ---
    with open("subject_vectorizer.pkl", "rb") as f:
        sub_vectorizer = pickle.load(f)

    with open("content_vectorizer.pkl", "rb") as f:
        des_vectorizer = pickle.load(f)
    
    # Transform using saved vectorizers
    X_subject = sub_vectorizer.transform(new_data['clean_subject'])
    X_content = des_vectorizer.transform(new_data['description'])
    
    # Combine features
    X_new_vec = hstack([X_subject, X_content])
    
    # Predict
    pred_numeric = model.predict(X_new_vec)
    
    # Convert numeric label back to original class
    pred_class = le.inverse_transform(pred_numeric)
    
    return pred_class[0]

In [299]:
print(predict_email_category("Meeting reminder", "Don't forget the team sync at 10 AM"))
print(predict_email_category("Payment issue", "I'm facing issues logging into my {product_purchased} account. It says my account is locked. What should I do to unlock it? If my account is locked, why do I need to use I3 API to log in to I'm concerned about the security of my {product_purchased} and would like to ensure that my data is safe."))

Legal inquiry
Billing inquiry
