In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv('kolam_dataset_multilabels.csv')
inputs = data['input']

In [3]:
inputs


0                               a
1                               b
2                               c
3                               d
4                              ab
                  ...            
155                  dbac c cd dc
156          dbac c c cab b b bac
157          dca a a acdb b b bdc
158    bacd d dbac c cd d d d dba
159           dbac c cd d d d dba
Name: input, Length: 160, dtype: object

In [4]:
labels = data.drop(['input'], axis =1)
labels

Unnamed: 0,a,b,c,d
0,1,1,1,0
1,1,1,0,1
2,1,0,1,1
3,0,1,1,1
4,0,1,0,0
...,...,...,...,...
155,1,0,1,0
156,0,0,1,1
157,1,0,1,0
158,1,0,1,0


In [5]:
from gensim.models import Word2Vec
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize


# Assuming inputs is a list of text documents and labels is a list of corresponding labels
# Tokenize your input text
tokenized_inputs = []
for text in inputs:
    tokens = word_tokenize(text.lower())
    tokenized_inputs.append(tokens)

# Train the Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_inputs, vector_size=100, window=5, min_count=1, workers=4)

# Vectorize input text using Word2Vec embeddings
X = []
for tokens in tokenized_inputs:
    # Calculate the mean of word vectors for each document
    doc_vector = np.mean([word2vec_model.wv[token] for token in tokens if token in word2vec_model.wv], axis=0)
    X.append(doc_vector)

# Convert X to numpy array
X = np.array(X)

[nltk_data] Downloading package punkt to C:\Users\efm-
[nltk_data]     workstation\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
X

array([[ 8.6519940e-05,  3.3124264e-03, -6.7321127e-03, ...,
         4.1963515e-04,  8.2388557e-03, -6.9112903e-03],
       [-8.2448330e-03,  9.5326332e-03, -1.3123872e-04, ...,
        -7.5922576e-03, -2.5037262e-03, -5.4663518e-03],
       [-4.9189792e-04,  3.4509110e-04,  5.1286370e-03, ...,
        -7.1302252e-03,  9.0571831e-04,  6.4456766e-03],
       ...,
       [-2.5955513e-03,  5.2654194e-03,  1.7248106e-04, ...,
         1.0798306e-03,  2.6407545e-03, -2.4870511e-03],
       [-2.6335183e-03,  1.2275662e-03,  2.9670873e-03, ...,
        -1.1412036e-03, -3.0794302e-03,  3.4259344e-03],
       [-1.4178631e-03,  1.9794677e-03,  4.1776733e-03, ...,
        -1.9420154e-03, -3.9405595e-03,  3.2463970e-03]], dtype=float32)

In [6]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42, stratify = labels)

In [11]:
from sklearn.svm import SVC

# Initialize and train SVM model
svm_model = MultiOutputClassifier(SVC())
svm_model.fit(X_train, y_train)

# Predict labels for test data
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
svm_accuracy = svm_model.score(X_test, y_test)
print("SVM Accuracy:", svm_accuracy)


# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))

SVM Accuracy: 0.46875
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.82      0.80        17
           1       0.70      0.78      0.74        18
           2       0.77      0.59      0.67        17
           3       0.85      1.00      0.92        17

   micro avg       0.77      0.80      0.79        69
   macro avg       0.77      0.80      0.78        69
weighted avg       0.77      0.80      0.78        69
 samples avg       0.77      0.74      0.73        69



In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train KNN model
knn_model = MultiOutputClassifier(KNeighborsClassifier())
knn_model.fit(X_train, y_train)

# Predict labels for test data
y_pred_knn = knn_model.predict(X_test)


# Evaluate the model
knn_accuracy = knn_model.score(X_test, y_test)
print("KNN Accuracy:", knn_accuracy)


# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_knn))

KNN Accuracy: 0.21875
Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.59      0.54        17
           1       0.59      0.72      0.65        18
           2       0.59      0.59      0.59        17
           3       0.67      0.82      0.74        17

   micro avg       0.59      0.68      0.63        69
   macro avg       0.59      0.68      0.63        69
weighted avg       0.59      0.68      0.63        69
 samples avg       0.57      0.64      0.58        69



In [19]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train GBM model
gbm_model = MultiOutputClassifier(GradientBoostingClassifier())
gbm_model.fit(X_train, y_train)


# Predict labels for test data
y_pred_gbm = gbm_model.predict(X_test)

# Evaluate the model
gbm_accuracy = gbm_model.score(X_test, y_test)
print("GBM Accuracy:", gbm_accuracy)


# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_gbm))

GBM Accuracy: 0.375
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.76      0.72        17
           1       0.67      0.78      0.72        18
           2       0.73      0.65      0.69        17
           3       0.88      0.82      0.85        17

   micro avg       0.73      0.75      0.74        69
   macro avg       0.74      0.75      0.74        69
weighted avg       0.74      0.75      0.74        69
 samples avg       0.70      0.69      0.68        69



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
# Define and train the classifier
classifier_RF = MultiOutputClassifier(RandomForestClassifier())
classifier_RF.fit(X_train, y_train)

# Predict labels for test data
y_pred_RF = classifier_RF.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred_RF)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_RF))

Accuracy: 0.3125
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.71      0.73        17
           1       0.67      0.67      0.67        18
           2       0.69      0.53      0.60        17
           3       0.80      0.94      0.86        17

   micro avg       0.73      0.71      0.72        69
   macro avg       0.73      0.71      0.71        69
weighted avg       0.73      0.71      0.71        69
 samples avg       0.73      0.67      0.66        69



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
