## Importing the libraries

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Loading the dataset

In [2]:
data = pd.read_csv('kolam_dataset_multilabels.csv')
inputs = data['input']

In [3]:
inputs

0                               a
1                               b
2                               c
3                               d
4                              ab
                  ...            
155                  dbac c cd dc
156          dbac c c cab b b bac
157          dca a a acdb b b bdc
158    bacd d dbac c cd d d d dba
159           dbac c cd d d d dba
Name: input, Length: 160, dtype: object

In [4]:
labels = data.drop(['input'], axis =1)
labels

Unnamed: 0,a,b,c,d
0,1,1,1,0
1,1,1,0,1
2,1,0,1,1
3,0,1,1,1
4,0,1,0,0
...,...,...,...,...
155,1,0,1,0
156,0,0,1,1
157,1,0,1,0
158,1,0,1,0


## Converting inputs to TF-IDF vectors
## Splitting the data into train and test sets

In [5]:
# Convert text data to TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
X = tfidf_vectorizer.fit_transform(inputs)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42, stratify = labels)

## Applying Gradient Boosting Classifier model

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize and train GBM model
gbm_model = MultiOutputClassifier(GradientBoostingClassifier())
gbm_model.fit(X_train, y_train)


# Predict labels for test data
y_pred_gbm = gbm_model.predict(X_test)

# Evaluate the model
gbm_accuracy = gbm_model.score(X_test, y_test)
print("GBM Accuracy:", gbm_accuracy)


# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_gbm))

GBM Accuracy: 0.5625
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.94      0.84        17
           1       0.68      0.83      0.75        18
           2       0.87      0.76      0.81        17
           3       0.84      0.94      0.89        17

   micro avg       0.78      0.87      0.82        69
   macro avg       0.79      0.87      0.82        69
weighted avg       0.79      0.87      0.82        69
 samples avg       0.78      0.84      0.78        69



## Getting the parameters of the model with good accuracy

In [7]:
gbm_model.get_params()

{'estimator__ccp_alpha': 0.0,
 'estimator__criterion': 'friedman_mse',
 'estimator__init': None,
 'estimator__learning_rate': 0.1,
 'estimator__loss': 'log_loss',
 'estimator__max_depth': 3,
 'estimator__max_features': None,
 'estimator__max_leaf_nodes': None,
 'estimator__min_impurity_decrease': 0.0,
 'estimator__min_samples_leaf': 1,
 'estimator__min_samples_split': 2,
 'estimator__min_weight_fraction_leaf': 0.0,
 'estimator__n_estimators': 100,
 'estimator__n_iter_no_change': None,
 'estimator__random_state': None,
 'estimator__subsample': 1.0,
 'estimator__tol': 0.0001,
 'estimator__validation_fraction': 0.1,
 'estimator__verbose': 0,
 'estimator__warm_start': False,
 'estimator': GradientBoostingClassifier(),
 'n_jobs': None}

## Predicting the sequence of Kolam upto certain length (max_length)

In [18]:
input_sequence = ['ca']
# Define the maximum length for the sequence
max_length = 10

# Initialize an empty list to store the labels
predicted_labels = []

# Perform TF-IDF transformation and prediction until the sequence length reaches max_length
while len(input_sequence[0]) < max_length:
    # Perform TF-IDF transformation on the input sequence
    
    X_new_transf = tfidf_vectorizer.transform(input_sequence)
    # Predict using GBM model
    predicted_label_index = gbm_model.predict(X_new_transf)[0]
    # Predict probabilities for each class
    y_pred_proba = gbm_model.predict_proba(X_new_transf)
    
    class_1_probs = [arr[0][1] for arr in y_pred_proba]

# Find the index of the array with the highest probability in positive class
    max_prob_index = np.argmax(class_1_probs)

    label_names = ['a', 'b', 'c', 'd']  # The order of the labels to map with predicted index

# Get the label name corresponding to the index
    predicted_label = label_names[max_prob_index]

# Print the predicted label with the highest probability for class 1
    #print("Predicted label with highest probability:", predicted_label)
    
    if predicted_label == "a":
        if input_sequence[0][-1] == "a":
            input_sequence[0] += " " + "a"
        else:
            input_sequence[0] += "a"
    
    elif predicted_label == "b":
        if input_sequence[0][-1] == "b":
            input_sequence[0] += " " + "b"
        else:
            input_sequence[0] += "b"
    elif predicted_label == "c":
        if input_sequence[0][-1] == "c":
            input_sequence[0] += " " + "c"
        else:
            input_sequence[0] += "c"
    elif predicted_label == "d":
        if input_sequence[0][-1] == "d":
            input_sequence[0] += " " + "d"
        else:
            input_sequence[0] += "d"

input_sequence = [input_sequence[0]]
input_sequence

['cab b ba a']