In [2]:
import pandas as pd
import neattext as nt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from joblib import dump

# Read the raw emotion corpus from a CSV in the current working directory.
# The rest of the script reads its 'Text' and 'Emotion' columns.
df = pd.read_csv(filepath_or_buffer="emotion_dataset_raw.csv")

# Text preprocessing
def clean_text(text):
    """Return *text* cleaned by neattext and converted to lowercase.

    The input is wrapped in a ``neattext.TextFrame``, its ``clean_text()``
    method is called with its default options, and ``.lower()`` is applied
    to whatever that call returns.
    """
    frame = nt.TextFrame(text)
    cleaned = frame.clean_text()
    return cleaned.lower()

# Run clean_text over every entry of the 'Text' column and keep the result
# in a new 'cleaned_text' column next to the original text.
raw_text = df['Text']
df['cleaned_text'] = raw_text.apply(clean_text)

# Encode the 'Emotion' labels as integer class ids.
emotion_labels = df['Emotion']
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(emotion_labels)

# Persist the fitted encoder so the integer ids can be mapped back to
# emotion names later.
dump(value=label_encoder, filename='label_encoder.joblib')

# Hold out 20% of the rows for testing; the split is stratified on the
# encoded labels and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Classification pipeline: CountVectorizer features (1- and 2-grams, capped
# at 10000 features, English stop words removed) feeding an RBF-kernel SVC.
# The SVC is built without probability=True; probability estimates come
# from wrapping it in CalibratedClassifierCV with cv=3 instead.
# An earlier variant of this pipeline used
# SVC(kernel='rbf', C=1.5, probability=True) directly as the final step.
count_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words='english',
)
svc = SVC(kernel='rbf', C=1.5)
calibrated_svc = CalibratedClassifierCV(svc, cv=3)

pipe_svm = Pipeline(
    steps=[
        ('cv', count_vectorizer),
        ('svc', calibrated_svc),
    ]
)

# Fit the vectorizer and the calibrated SVM on the training split.
pipe_svm.fit(X=X_train, y=y_train)

# Report the pipeline's score on the held-out split.
accuracy = pipe_svm.score(X=X_test, y=y_test)
print(f'Pipeline SVM Accuracy: {accuracy:.4f}')

# Persist the whole fitted pipeline (vectorizer + classifier) to disk.
dump(value=pipe_svm, filename='svm_pipeline.joblib')




Pipeline SVM Accuracy: 0.6257
 Model and Label Encoder saved successfully!
