NAMED ENTITY RECOGNITION ON A CODE-MIXED DATASET USING SVM AND CNN

1. Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Embedding, Input
from tensorflow.keras.models import Model

2. Loading and Preprocessing the Dataset

In [None]:
# Location of the dataset inside the Colab runtime; change this when running elsewhere.
DATA_PATH = '/content/Code_Mixed.csv'
raw_data = pd.read_csv(DATA_PATH)

# The rest of the notebook assumes 'text' and 'label' columns; fail early with a
# clear message instead of a bare KeyError further down.
missing_columns = {'text', 'label'} - set(raw_data.columns)
if missing_columns:
    raise KeyError(f"{DATA_PATH} is missing required column(s): {sorted(missing_columns)}")

# Drop rows without a text or a label. A missing text is read as float NaN, which
# is not a string the tokenizer can split, and a missing label would be one-hot
# encoded as an all-zero row that argmax later reads as class 0.
data = raw_data.dropna(subset=['text', 'label']).reset_index(drop=True)

texts = data['text'].astype(str).values
labels = data['label'].values

# Preprocess the text: build the vocabulary and map every text to a list of word ids.
# NOTE(review): the vocabulary is fitted on every row, including the rows that are
# later assigned to the test split.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index


3. Padding and Encoding

In [None]:
MAX_SEQUENCE_LENGTH = 100  # number of token ids kept per text after padding/truncation
EMBEDDING_DIM = 100        # size of each learned word vector

# Pad sequences so every sample has exactly MAX_SEQUENCE_LENGTH token ids.
X = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the labels. The dtype is requested explicitly because
# pandas >= 2.0 returns boolean dummy columns by default, while the network's
# categorical cross-entropy loss works on floating-point targets.
y = pd.get_dummies(labels, dtype=np.float32).to_numpy()


4. Splitting the Data into Training and Testing Sets

In [None]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

5. Building and Training a CNN Model

In [None]:
def create_cnn_model(input_length, embedding_dim, vocab_size=None, num_classes=None):
    """Build and compile a 1-D convolutional text classifier.

    Architecture: Embedding -> Conv1D(128, kernel 5) -> MaxPooling1D(4) ->
    Flatten -> Dense(64, relu) -> Dense(num_classes, softmax). The 64-unit
    dense layer is the penultimate layer, which is what the feature-extraction
    step of this notebook reads through ``model.layers[-2]``.

    Args:
        input_length: Number of token ids in every (padded) input sequence.
        embedding_dim: Size of the learned embedding vectors.
        vocab_size: Number of rows in the embedding table. Defaults to
            ``len(word_index) + 1`` using the notebook-level ``word_index``.
        num_classes: Number of output classes. Defaults to the width of the
            notebook-level one-hot ``y_train``.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, categorical cross-entropy
        loss, accuracy metric).
    """
    # Fall back to the notebook globals only when the caller did not supply the
    # sizes, so existing two-argument calls keep working unchanged.
    if vocab_size is None:
        vocab_size = len(word_index) + 1
    if num_classes is None:
        num_classes = len(y_train[0])

    # `inputs` rather than `input`, to avoid shadowing the Python builtin.
    inputs = Input(shape=(input_length,))
    # The sequence length is already fixed by the Input layer, so the deprecated
    # `input_length` argument of Embedding is not passed.
    x = Embedding(vocab_size, embedding_dim)(inputs)
    x = Conv1D(128, 5, activation='relu')(x)
    x = MaxPooling1D(pool_size=4)(x)
    x = Flatten()(x)
    x = Dense(64, activation='relu')(x)
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling repeat across runs.
tf.keras.utils.set_random_seed(42)

# Create and train CNN model
cnn_model = create_cnn_model(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM)

# Bind the History object to a name so the per-epoch metrics stay available and
# the cell does not end by echoing the object's repr.
# NOTE(review): in the recorded run the training accuracy reaches 1.00 while the
# validation accuracy stays near 0.73, which suggests the model overfits.
cnn_history = cnn_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)



Epoch 1/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 61ms/step - accuracy: 0.3668 - loss: 1.0755 - val_accuracy: 0.5217 - val_loss: 1.0478
Epoch 2/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.7750 - loss: 0.8732 - val_accuracy: 0.5031 - val_loss: 0.9898
Epoch 3/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.8813 - loss: 0.5221 - val_accuracy: 0.5590 - val_loss: 0.8600
Epoch 4/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 0.9989 - loss: 0.1807 - val_accuracy: 0.6894 - val_loss: 0.7136
Epoch 5/5
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 1.0000 - loss: 0.0380 - val_accuracy: 0.7329 - val_loss: 0.6783


<keras.src.callbacks.history.History at 0x7ed72a3408b0>

6. Extracting Features Using the CNN

In [None]:
def extract_features(model, X):
    """Return the activations of ``model``'s second-to-last layer for ``X``.

    A new model is built that shares ``model``'s input but stops at
    ``model.layers[-2]``; its ``predict`` output for ``X`` is returned.

    Args:
        model: Model exposing ``input`` and ``layers`` attributes.
        X: Input samples, passed unchanged to ``predict``.

    Returns:
        Whatever ``predict`` yields for the truncated model.
    """
    penultimate_output = model.layers[-2].output
    truncated_model = Model(inputs=model.input, outputs=penultimate_output)
    return truncated_model.predict(X)

# Run the training split and then the test split through the trained CNN; the
# resulting penultimate-layer activations are the inputs for the SVM.
X_train_features, X_test_features = (
    extract_features(cnn_model, split) for split in (X_train, X_test)
)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


7. Training and Evaluating the SVM Model

In [None]:
# Convert the one-hot rows back to integer class ids once, instead of
# recomputing the argmax at every use.
y_train_ids = np.argmax(y_train, axis=1)
y_test_ids = np.argmax(y_test, axis=1)

# The one-hot columns come from pd.get_dummies(labels), so its column order is
# the class-id order; use those names to make the report readable.
class_names = [str(name) for name in pd.get_dummies(labels).columns]

# Train SVM on standardised CNN features.
# NOTE(review): in the recorded run this polynomial-kernel SVM assigns almost every
# test sample to class 2 (recall 1.00 vs 0.06 and 0.12) — the kernel choice and
# hyper-parameters are worth revisiting.
svm_model = make_pipeline(StandardScaler(), SVC(kernel='poly', C=1, gamma='scale'))
svm_model.fit(X_train_features, y_train_ids)

# Evaluate
y_pred = svm_model.predict(X_test_features)
print(f'Accuracy: {accuracy_score(y_test_ids, y_pred)}')
print(classification_report(
    y_test_ids,
    y_pred,
    # Listing every class id keeps target_names aligned even if a class is
    # absent from the test split.
    labels=np.arange(len(class_names)),
    target_names=class_names,
    zero_division=0,
))


Accuracy: 0.4405940594059406
              precision    recall  f1-score   support

           0       1.00      0.06      0.11        68
           1       0.70      0.12      0.21        56
           2       0.41      1.00      0.59        78

    accuracy                           0.44       202
   macro avg       0.70      0.39      0.30       202
weighted avg       0.69      0.44      0.32       202

