In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Actual Work

In [10]:
# Load the raw inputs with pandas' default CSV settings. Later cells index the
# combined frame by "text" and "label", so the files are expected to carry
# header rows with those names (presumably "text" in corpus.txt and "label"
# in labels.txt -- TODO confirm against the files).
corpus, label = (pd.read_csv(path) for path in ('corpus.txt', 'labels.txt'))

In [11]:
# Place the text and label frames side by side (a column-wise concat aligns
# rows on the index), then keep only rows in which every column has a value.
data = pd.concat(objs=[corpus, label], axis="columns")
data2 = data.dropna(how="any")

In [12]:
# Hold out 20% of the cleaned rows for testing; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data2["text"],
    data2["label"],
    test_size=0.2,
    random_state=42,
)

In [13]:
# TF-IDF features with the vectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer()

# The vocabulary and IDF weights are learned from the training texts only;
# the test texts are then encoded with that already-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [14]:
# Base estimator whose hyperparameters are tuned below.
svc = SVC()

# Candidate hyperparameters.
# `gamma` is a kernel coefficient that only the 'rbf', 'poly' and 'sigmoid'
# kernels use; crossing it with 'linear' would refit the exact same linear
# model once per gamma value. Giving the linear kernel its own sub-grid removes
# those duplicates (64 -> 52 candidates, 60 fewer CV fits) without dropping any
# distinct model. The non-linear sub-grid comes first so the remaining
# candidates keep their original relative order.
# Note: if a linear model wins, `best_params_` will not contain a 'gamma' key.
param_grid = [
    {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf', 'poly', 'sigmoid'],
    },
    {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear'],
    },
]

# 5-fold cross-validated search scored by accuracy. n_jobs=-1 evaluates the
# candidates in parallel on all available cores; it does not change the scores.
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train_tfidf, y_train)

# Winning hyperparameter combination and the model refitted with it on the
# full training matrix.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [15]:
# Predict labels for the held-out texts with the tuned model.
y_pred = best_model.predict(X_test_tfidf)

# Per-class precision/recall/F1 table and the overall accuracy.
report = classification_report(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show the selected hyperparameters first, then the metrics.
print("Best Parameters:", best_params)
print("Test Accuracy:", accuracy)
print("Classification Report:\n", report)

Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
Test Accuracy: 0.7666666666666667
Classification Report:
               precision    recall  f1-score   support

       anger       0.79      0.73      0.76        75
        fear       0.75      0.75      0.75        77
         joy       0.66      0.73      0.69        88
        love       0.76      0.84      0.80        80
     sadness       0.73      0.65      0.69        80
    surprise       0.94      0.90      0.92        80

    accuracy                           0.77       480
   macro avg       0.77      0.77      0.77       480
weighted avg       0.77      0.77      0.77       480



In [16]:
# Baseline run: the same clean / split / vectorize steps as above, but with an
# SVC left at its default hyperparameters (no grid search).

# Discard rows that have any missing value.
data2 = data.dropna(how="any")

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data2['text'],
    data2['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: fitted on the training texts, reused for the test texts.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

# Train the default SVC (fit returns the estimator itself) and predict on the
# held-out set.
svc = SVC().fit(X_train_tfidf, y_train)
y_pred = svc.predict(X_test_tfidf)

# Overall accuracy and the per-class report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Test Accuracy:", accuracy)
print("Classification Report:\n", report)


Test Accuracy: 0.7041666666666667
Classification Report:
               precision    recall  f1-score   support

       anger       0.77      0.68      0.72        75
        fear       0.69      0.68      0.68        77
         joy       0.60      0.70      0.65        88
        love       0.64      0.72      0.68        80
     sadness       0.68      0.59      0.63        80
    surprise       0.91      0.85      0.88        80

    accuracy                           0.70       480
   macro avg       0.71      0.70      0.71       480
weighted avg       0.71      0.70      0.71       480

