# DATASET

In [1]:
import pandas as pd
url = '/content/sample_data/emotions (1).csv'
df = pd.read_csv(url)
print(df)

                                                     text  label
0           i just feel really helpless and heavy hearted      4
1       ive enjoyed being able to slouch about relax a...      0
2       i gave up my internship with the dmrg and am f...      4
3                              i dont know i feel so lost      0
4       i am a kindergarten teacher and i am thoroughl...      4
...                                                   ...    ...
416804  i feel like telling these horny devils to find...      2
416805  i began to realize that when i was feeling agi...      3
416806  i feel very curious be why previous early dawn...      5
416807  i feel that becuase of the tyranical nature of...      3
416808  i think that after i had spent some time inves...      5

[416809 rows x 2 columns]


In [2]:
df.shape

(416809, 2)

In [3]:
df.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [4]:
df.duplicated().sum()

np.int64(686)

In [5]:
df.drop_duplicates(inplace=True)

# PREPROCESSING

Lowercasing

In [6]:
df['text'] = df['text'].str.lower()

Removing urls

In [7]:
import re
df['text'] = df['text'].apply(lambda x: re.sub(r'https?://\S+|www\.\S+', '', x))

Remove html

In [8]:
df['text'] = df['text'].apply(lambda x: re.sub(r'<.*?>', '', x))

Remove special characters

In [9]:
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-z\s]', '', x))

Remove white spaces

In [10]:
df['text'] = df['text'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

In [45]:
def stratified_sample(df, label_col='label', n_samples=100000, random_state=42):
    # Calculate samples per class
    n_classes = df[label_col].nunique()
    samples_per_class = n_samples // n_classes

    # Sample from each class
    df_sampled = df.groupby(label_col).apply(
        lambda x: x.sample(n=min(len(x), samples_per_class), random_state=random_state)
    ).reset_index(drop=True)

    return df_sampled

df = stratified_sample(df, label_col='label', n_samples=100000)

  df_sampled = df.groupby(label_col).apply(


Convert text into TF - IDF vectors

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=500, stop_words='english')
X = vectorizer.fit_transform(df['text'])

Encode the labels

In [47]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

Split the dataset

In [48]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TRAINING AND EVALUATION

Import necessary packages

In [49]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

LINEAR

In [50]:
# Train SVM with linear kernel
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Print performance matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.58      0.58       351
           1       0.53      0.62      0.57       333
           2       0.86      0.93      0.89       352
           3       0.91      0.72      0.80       332
           4       0.92      0.74      0.82       320
           5       0.88      0.98      0.93       312

    accuracy                           0.76      2000
   macro avg       0.78      0.76      0.76      2000
weighted avg       0.77      0.76      0.76      2000



POLYNOMIAL

In [51]:
# Train SVM with polynomial kernel
model = SVC(kernel='poly', degree=2, C=1.0)  # degree=3 is default
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Print performance matrix
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.52      0.57      0.54       351
           1       0.46      0.60      0.52       333
           2       0.86      0.84      0.85       352
           3       0.88      0.69      0.77       332
           4       0.86      0.69      0.77       320
           5       0.89      0.90      0.90       312

    accuracy                           0.71      2000
   macro avg       0.74      0.72      0.72      2000
weighted avg       0.74      0.71      0.72      2000



RBF

In [52]:
#Train SVM with rbf kernel
model = SVC(kernel='rbf')
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Print performance matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.57      0.56       351
           1       0.50      0.59      0.54       333
           2       0.86      0.92      0.89       352
           3       0.90      0.72      0.80       332
           4       0.91      0.72      0.80       320
           5       0.88      0.98      0.93       312

    accuracy                           0.75      2000
   macro avg       0.77      0.75      0.75      2000
weighted avg       0.76      0.75      0.75      2000



SIGMOID

In [53]:
#Train SVM with sigmoid kernel
model = SVC(kernel='sigmoid')
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Print performance matrix
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.55      0.60      0.58       351
           1       0.53      0.57      0.55       333
           2       0.86      0.91      0.89       352
           3       0.86      0.72      0.78       332
           4       0.92      0.74      0.82       320
           5       0.87      0.97      0.92       312

    accuracy                           0.75      2000
   macro avg       0.77      0.75      0.76      2000
weighted avg       0.76      0.75      0.75      2000

