## Multilabel Classification

Objective: How  well  can  we  classify  a  restaurant’s  cuisine  type  using  the content of their reviews as input?

### Imports

In [10]:
import numpy as np
import pandas as pd
import regex as re
import nltk
import warnings
warnings.filterwarnings('ignore')
import sys, os

#Preprocessing
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'utils')))
import pipeline as p
from pipeline import MainPipeline
import preproc

from nltk.tokenize import word_tokenize
from nltk.tokenize import PunktSentenceTokenizer
from textblob import TextBlob
sent_tokenizer = PunktSentenceTokenizer()

#Vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#Classification and Metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.pipeline import Pipeline

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from tqdm import tqdm

### Dataset preparation
Importing both datasets and applying the prepared preprocessing function to merge them.

In [11]:
reviews = pd.read_csv('C:/Users/msard/OneDrive/Desktop/Data Science/Fall 2024/Text Mining/Hyderabadi-Word-Soup/data_hyderabad/10k_reviews.csv')
restaurants = pd.read_csv('C:/Users/msard/OneDrive/Desktop/Data Science/Fall 2024/Text Mining/Hyderabadi-Word-Soup/data_hyderabad/105_restaurants.csv')

In [12]:
reviews = preproc.multilabel_preproc(reviews, restaurants)

In [13]:
reviews.head()

Unnamed: 0,Review,Cuisines
0,"The ambience was good, food was quite good . h...","[Chinese, Continental, Kebab, European, South ..."
1,Ambience is too good for a pleasant evening. S...,"[Chinese, Continental, Kebab, European, South ..."
2,A must try.. great food great ambience. Thnx f...,"[Chinese, Continental, Kebab, European, South ..."
3,Soumen das and Arun was a great guy. Only beca...,"[Chinese, Continental, Kebab, European, South ..."
4,Food is good.we ordered Kodi drumsticks and ba...,"[Chinese, Continental, Kebab, European, South ..."


### Preprocessing
Applying the general preprocessing pipeline and creating vectorizations.

#### Basic review preprocessing
Applying the basic preprocessing pipeline. This fixes the shortened versions of some verbs to be more readable. All other preprocessing options aren't viable to be applied this early, since not all encoding methods require the same level of context.

In [14]:
preprocessor = MainPipeline()
reviews['Preproc_Review'] =\
      reviews['Review'].map(lambda content: preprocessor.main_pipeline(content))

#### Vectorization
Creating new columns with the encoded reviews

Encodings researched:
- TF-IDF: Simple, interpretable, and effective for many text classification tasks, but does not capture semantic meaning or word order.
- Doc2Vec: Captures the context of entire documents, useful for tasks requiring document-level understanding (like classification).
- FastText: Captures subword information, handles out-of-vocabulary and mispelled words better, but requires a large corpus to train effectively.
- BERT: Provides contextualized embeddings, state-of-the-art performance on many NLP tasks. Very computationally expensive, requires fine-tuning.
- GloVe: Would not be able to capture domain-specific nuances, and since this dataset is so specific this would not result in good scores.

In [15]:
'''import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from transformers import BertTokenizer, BertModel

# Assuming reviews DataFrame with a 'Preproc_Review' column
reviews = pd.DataFrame({
    'Preproc_Review': ["This is a sample review.", "Another example review text."]
})

# TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf_vectorizer.fit_transform(reviews['Preproc_Review']).toarray()
reviews['tfidf_vector'] = list(tfidf_matrix)

# Doc2Vec
tokenized_reviews = [review.split() for review in reviews['Preproc_Review']]
tagged_reviews = [TaggedDocument(words=review, tags=[i]) for i, review in enumerate(tokenized_reviews)]
doc2vec_model = Doc2Vec(documents=tagged_reviews, vector_size=100, window=5, min_count=1, workers=4)
reviews['doc2vec_vector'] = [doc2vec_model.infer_vector(review) for review in tokenized_reviews]

# FastText
fasttext_model = FastText(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, workers=4)
reviews['fasttext_vector'] = [fasttext_model.wv[review].mean(axis=0) for review in tokenized_reviews]

# BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def bert_encode(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

reviews['bert_vector'] = reviews['Preproc_Review'].apply(bert_encode)

# GloVe
embedding_index = {}
with open('path/to/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

embedding_dim = 100
def glove_encode(text):
    words = text.split()
    word_vectors = [embedding_index.get(word, np.zeros(embedding_dim)) for word in words]
    return np.mean(word_vectors, axis=0)

reviews['glove_vector'] = reviews['Preproc_Review'].apply(glove_encode)

# Display the DataFrame with the new vectorized columns
print(reviews.head())'''

'import numpy as np\nimport pandas as pd\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom gensim.models import FastText, Doc2Vec\nfrom gensim.models.doc2vec import TaggedDocument\nfrom transformers import BertTokenizer, BertModel\n\n# Assuming reviews DataFrame with a \'Preproc_Review\' column\nreviews = pd.DataFrame({\n    \'Preproc_Review\': ["This is a sample review.", "Another example review text."]\n})\n\n# TF-IDF\ntfidf_vectorizer = TfidfVectorizer(max_features=100)\ntfidf_matrix = tfidf_vectorizer.fit_transform(reviews[\'Preproc_Review\']).toarray()\nreviews[\'tfidf_vector\'] = list(tfidf_matrix)\n\n# Doc2Vec\ntokenized_reviews = [review.split() for review in reviews[\'Preproc_Review\']]\ntagged_reviews = [TaggedDocument(words=review, tags=[i]) for i, review in enumerate(tokenized_reviews)]\ndoc2vec_model = Doc2Vec(documents=tagged_reviews, vector_size=100, window=5, min_count=1, workers=4)\nreviews[\'doc2vec_vector\'] = [doc2vec_model.infer_vector(review) for 

In [16]:
#TF-IDF
vectorizer = TfidfVectorizer()
reviews_tfidf_matrix = vectorizer.fit_transform(reviews['Preproc_Review'])


reviews['tfidf_vector'] = reviews_tfidf_matrix.toarray().tolist()
reviews_tfidf_matrix.shape #(documents, features)

(9955, 16148)

In [17]:
#Doc2Vec
# We need first to put the reviews in a tagged document object, with a (word, tag) shape
review_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(reviews['Preproc_Review'].apply(str.split))]

doc2vec = Doc2Vec(review_docs, vector_size=100, window=2, min_count=1, workers=4)
reviews_doc2vec_matrix = [doc2vec.infer_vector(doc.split()) for doc in reviews['Preproc_Review']]

reviews['doc2vec_vector'] = reviews_doc2vec_matrix

reviews_doc2vec_matrix = np.array(reviews_doc2vec_matrix)
#reviews_doc2vec_matrix.shape #(documents, embedding dimensionality)

Creating the labels matrix and adding it as a column

In [18]:
mlb = MultiLabelBinarizer()
binarised_labels = mlb.fit_transform(reviews["Cuisines"])
reviews["cuisine_binary"] = binarised_labels.tolist()

#binarised_labels.shape #(documents, labels)

### Model Testing
Checking if there are no errors, through running all classifiers once with non-optimized parameters.

Relevant classifiers found through research:
- Binary Relevance (Mulan2): Treats multilabel classification as multiple independent binary classification problems. For a dataset with n possible labels, it trains n separate binary classifiers—one for each label. Each classifier predicts whether an instance belongs to that specific label or not, ignoring the relationships between labels. It is a simple classifier that scales linearly with the number of classes in a multilabel classification dataset. (https://www.researchgate.net/profile/Antal-Van-Den-Bosch/publication/228075659_Using_Language_Models_for_Spam_Detection_in_Social_Bookmarking)

Previously used classifiers:
- Random Forest: An ensemble method that uses multiple decision trees to improve classification performance.
- Gradient Boosting: An ensemble technique that builds models sequentially, with each new model correcting errors made by previous models.
- Support Vector Machine (SVM): Effective for high-dimensional spaces and suitable for text classification.
- K-Nearest Neighbors (KNN): A simple, instance-based learning algorithm that classifies based on the majority label of the nearest neighbors.
- Simple Neural Networks (MLP): Deep learning models that can capture complex patterns in the data.

Classification Strategies:
- One vs Rest (OvR): Trains one classifier per label, with each classifier trained to predict the presence or absence of a single label
- Classifier Chain: Treats the problem as a chain of binary classification problems, where the prediction of each label depends on the predictions of previous labels in the chain.
- Label Powerset: Transforms the multilabel problem into a multiclass problem by treating each unique combination of labels as a single class.

In [19]:
X_train, X_test, y_train, y_test = train_test_split(reviews_doc2vec_matrix, binarised_labels,
                                                test_size=0.2, random_state=0, shuffle=True)

#### One vs Rest

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import ClassifierChain, MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Assuming reviews DataFrame with 'tfidf_vector' and 'cuisine_binary' columns
# Split the data
X_train, X_test, y_train, y_test = train_test_split(reviews['tfidf_vector'].tolist(), reviews['cuisine_binary'].tolist(), test_size=0.2, random_state=42)

# Convert lists to numpy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

# Define classifiers
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel='linear', probability=True, random_state=42),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Neural Network": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
}

# Initialize dictionary to store F1 scores
f1_scores = {
    "OneVsRest": {},
    "Binary Relevance": {},
    "Classifier Chain": {}
}

# Test OneVsRestClassifier with each classifier
for name, clf in classifiers.items():
    ovr_classifier = OneVsRestClassifier(clf)
    ovr_classifier.fit(X_train, y_train)
    y_pred_ovr = ovr_classifier.predict(X_test)
    f1_scores["OneVsRest"][name] = f1_score(y_test, y_pred_ovr, average='micro')

# Test Binary Relevance with each classifier
for name, clf in classifiers.items():
    br_classifier = MultiOutputClassifier(clf)
    br_classifier.fit(X_train, y_train)
    y_pred_br = br_classifier.predict(X_test)
    f1_scores["Binary Relevance"][name] = f1_score(y_test, y_pred_br, average='micro')

# Test Classifier Chain with each classifier
for name, clf in classifiers.items():
    chain_classifier = ClassifierChain(clf)
    chain_classifier.fit(X_train, y_train)
    y_pred_chain = chain_classifier.predict(X_test)
    f1_scores["Classifier Chain"][name] = f1_score(y_test, y_pred_chain, average='micro')

In [None]:
# Plotting the F1 scores
labels = list(classifiers.keys())
one_vs_rest_scores = [f1_scores["OneVsRest"].get(label, 0) for label in labels]
binary_relevance_scores = [f1_scores["Binary Relevance"].get(label, 0) for label in labels]
classifier_chain_scores = [f1_scores["Classifier Chain"].get(label, 0) for label in labels]

x = np.arange(len(labels))
width = 0.2

fig, ax = plt.subplots(figsize=(12, 6))
rects1 = ax.bar(x - width, one_vs_rest_scores, width, label='OneVsRest')
rects2 = ax.bar(x, binary_relevance_scores, width, label='Binary Relevance')
rects3 = ax.bar(x + width, classifier_chain_scores, width, label='Classifier Chain')

ax.set_xlabel('Classifiers')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Scores by Classifier and Strategy')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

fig.tight_layout()
plt.show()s

#### Classifier Chain

In [13]:
#Test the classifier Chain
cchain = ClassifierChain(SGDClassifier(loss="log_loss", random_state=0, class_weight="balanced"), order="random")
cchain.fit(X_train, y_train)

In [14]:
y_pred_cchain = cchain.predict(X_test)
# mlb.inverse_transform(y_pred_q1_cchain)
p.fold_score_calculator(y_pred_cchain, y_test, verbose=True)

Accuracy: 0.020090406830738324 
Precision: 0.3447235558376375 
Recall: 0.2855483656877385 
F1: 0.2637119846574813


(0.020090406830738324,
 0.3447235558376375,
 0.2855483656877385,
 0.2637119846574813)

### Model Optimization - Grid Search

In [11]:
X_train, X_test, y_train, y_test = train_test_split(reviews["Review"], binarised_labels,
                                                    test_size=0.2, random_state=0)


In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Assuming MainPipeline is defined in module p
class MainPipeline(BaseEstimator, TransformerMixin):
    def __init__(self, print_output=True, no_stopwords=True, custom_stopwords=[], convert_diacritics=True, 
                 lowercase=True, lemmatized=True, list_pos=["n","v","a","r","s"], pos_tags_list="no_pos", 
                 tokenized_output=False):
        self.print_output = print_output
        self.no_stopwords = no_stopwords
        self.custom_stopwords = custom_stopwords
        self.convert_diacritics = convert_diacritics
        self.lowercase = lowercase
        self.lemmatized = lemmatized
        self.list_pos = list_pos
        self.pos_tags_list = pos_tags_list
        self.tokenized_output = tokenized_output

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.apply(lambda x: p.main_pipeline(x, print_output=self.print_output, no_stopwords=self.no_stopwords,
                                               custom_stopwords=self.custom_stopwords, convert_diacritics=self.convert_diacritics,
                                               lowercase=self.lowercase, lemmatized=self.lemmatized, list_pos=self.list_pos,
                                               pos_tags_list=self.pos_tags_list, tokenized_output=self.tokenized_output))

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', MainPipeline()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', OneVsRestClassifier(LogisticRegression()))
])

# Define the parameter grid
param_grid = {
    'preprocessor__no_stopwords': [True, False],
    'preprocessor__convert_diacritics': [True, False],
    'preprocessor__lowercase': [True, False],
    'preprocessor__lemmatized': [True, False],
    'vectorizer__max_features': [5000, 10000],
    'classifier__estimator__solver': ['lbfgs'],
    'classifier__estimator__C': [1],
    'classifier__estimator__penalty': ['l2'],
    'classifier__estimator__class_weight': [None],
    'classifier__estimator__random_state': [1],
    'classifier__estimator__multi_class': ["ovr"]
}

# Define the scoring function
scorer = make_scorer(f1_score, average='micro')

# Perform the grid search
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=scorer, cv=5, verbose=2, n_jobs=-1)

# Fit the grid search to your data
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best F1 score: ", grid_search.best_score_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
good
['good']
With great music and ambience the excitement for the final IPL match was enhanced.
The food is great along with the exotic drinks.
Must try is the cock in the basket which is fried chicken breast.
['great', 'music', 'ambience', 'excitement', 'final', 'ipl', 'match', 'enhance', 'food', 'great', 'along', 'exotic', 'drink', 'must', 'try', 'cock', 'basket', 'fry', 'chicken', 'breast']
What a amazing experience, very delicious desserts
Staff is very friendly, hurry to visit to try more treats,
Very good ambience, 10/10 for everything
['amaze', 'experience', 'delicious', 'dessert', 'staff', 'friendly', 'hurry', 'visit', 'try', 'treat', 'good', 'ambience', '1010', 'everything']
Night life and Saturday weekend parties are fun after long busy schedule when it’s time for weekend I would suggest. My frnds to go to Sheraton gachubowli and party all night ambience is gud all the staff and bartenders are respectful and polit

In [26]:
'''# Vectorizers
bigram_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), token_pattern=r"(?u)\b\w+\b")
bigram_bow_vectorizer = CountVectorizer(ngram_range=(1, 1), token_pattern=r"(?u)\b\w+\b")

# Classifiers
class_ovr = OneVsRestClassifier(LogisticRegression())
class_cc = ClassifierChain(LogisticRegression())

# Pipelines
preprocessor = p.MainPipeline()
hermetic_classifier = p.HermeticClassifier(preprocessor, vectorizer, class_ovr)

# Simplified parameter grid
parameter_grid = {
    'preprocessor__no_stopwords': [True],
    'preprocessor__lemmatized': [True],
    'preprocessor__lowercase': [True],
    'vectorizer': [bigram_tfidf_vectorizer],
    'classifier': [class_ovr],
    'classifier__estimator__solver': ['lbfgs'],
    'classifier__estimator__C': [1],
    'classifier__estimator__penalty': ['l2'],
    'classifier__estimator__class_weight': [None],
    'classifier__estimator__random_state': [1],
    'classifier__estimator__multi_class': ["ovr"]
}

# Scoring metrics
scores = ["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"]

# Grid search
grid_search = GridSearchCV(hermetic_classifier, parameter_grid, scoring=scores, verbose=4, refit="f1_weighted")'''

In [25]:
'''# Testing hermetic_classifier
hermetic_classifier.set_params(
    preprocessor__no_stopwords=True,
    preprocessor__lemmatized=True,
    preprocessor__lowercase=True,
    vectorizer=bigram_tfidf_vectorizer,
    classifier=class_ovr,
    classifier__estimator__solver='lbfgs',
    classifier__estimator__C=1,
    classifier__estimator__penalty='l2',
    classifier__estimator__class_weight=None,
    classifier__estimator__random_state=1,
    classifier__estimator__multi_class="ovr"
)

hermetic_classifier.fit(reviews["Review"], binarised_labels)'''

In [27]:
'''# Fit the model
grid_result = grid_search.fit(reviews["Review"], binarised_labels)

# Check the results
print("Best parameters found: ", grid_result.best_params_)
print("Best score: ", grid_result.best_score_)'''

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5] END classifier=OneVsRestClassifier(estimator=LogisticRegression()), classifier__estimator__C=1, classifier__estimator__class_weight=None, classifier__estimator__multi_class=ovr, classifier__estimator__penalty=l2, classifier__estimator__random_state=1, classifier__estimator__solver=lbfgs, preprocessor__lemmatized=True, preprocessor__lowercase=True, preprocessor__no_stopwords=True, vectorizer=TfidfVectorizer(token_pattern='(?u)\\b\\w+\\b'); accuracy: (test=nan) f1_weighted: (test=nan) precision_weighted: (test=nan) recall_weighted: (test=nan) total time= 1.3min


KeyboardInterrupt: 

In [None]:
'''grid_result.best_params_'''

NameError: name 'grid_result' is not defined

In [None]:
'''## Best results
for score in scores:
    print("{} = {}".format(score,round(grid_result.cv_results_['mean_test_{}'.format(score)][grid_result.best_index_],3)))'''

### Model Optimization - Random

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, f1_score
from scipy.stats import uniform

# Define the pipeline
pipeline = Pipeline([
    ('preprocessor', MainPipeline()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', OneVsRestClassifier(LogisticRegression()))
])

# Define the parameter distribution
param_dist = {
    'preprocessor__no_stopwords': [True, False],
    'preprocessor__convert_diacritics': [True, False],
    'preprocessor__lowercase': [True, False],
    'preprocessor__lemmatized': [True, False],
    'vectorizer__max_features': [5000, 10000],
    'classifier__estimator__solver': ['lbfgs'],
    'classifier__estimator__C': uniform(0.1, 10),
    'classifier__estimator__penalty': ['l2'],
    'classifier__estimator__class_weight': [None],
    'classifier__estimator__random_state': [1],
    'classifier__estimator__multi_class': ["ovr"]
}

# Define the scoring function
scorer = make_scorer(f1_score, average='micro')

# Perform the randomized search
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist, scoring=scorer, cv=5, verbose=2, n_jobs=-1, n_iter=50)

# Fit the randomized search to your data
random_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best parameters found: ", random_search.best_params_)
print("Best F1 score: ", random_search.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
good
['good']
With great music and ambience the excitement for the final IPL match was enhanced.
The food is great along with the exotic drinks.
Must try is the cock in the basket which is fried chicken breast.
['great', 'music', 'ambience', 'excitement', 'final', 'ipl', 'match', 'enhance', 'food', 'great', 'along', 'exotic', 'drink', 'must', 'try', 'cock', 'basket', 'fry', 'chicken', 'breast']
What a amazing experience, very delicious desserts
Staff is very friendly, hurry to visit to try more treats,
Very good ambience, 10/10 for everything
['amaze', 'experience', 'delicious', 'dessert', 'staff', 'friendly', 'hurry', 'visit', 'try', 'treat', 'good', 'ambience', '1010', 'everything']
Night life and Saturday weekend parties are fun after long busy schedule when it’s time for weekend I would suggest. My frnds to go to Sheraton gachubowli and party all night ambience is gud all the staff and bartenders are respectful and polit

In [32]:
'''from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import uniform
from sklearn.metrics import classification_report

# Simplified parameter grid with distributions for RandomizedSearchCV
parameter_distributions = {
    'preprocessor__no_stopwords': [True, False],
    'preprocessor__lemmatized': [True, False],
    'preprocessor__lowercase': [True, False],
    'vectorizer': [bigram_tfidf_vectorizer, bigram_bow_vectorizer],
    'classifier': [class_ovr],
    'classifier__estimator__solver': ['lbfgs'],
    'classifier__estimator__C': uniform(0.1, 10),  # Uniform distribution between 0.1 and 10
    'classifier__estimator__penalty': ['l2'],
    'classifier__estimator__class_weight': [None, 'balanced'],
    'classifier__estimator__random_state': [1],
    'classifier__estimator__multi_class': ["ovr"]
}

# Scoring metrics
scores = ["accuracy", "precision_weighted", "recall_weighted", "f1_weighted"]

# Randomized search
random_search = RandomizedSearchCV(hermetic_classifier, parameter_distributions, n_iter=50, scoring=scores, verbose=4, refit="f1_weighted", random_state=1)

# Fit the model on the training data
random_result = random_search.fit(X_train, y_train)

# Check the results
print("Best parameters found: ", random_result.best_params_)
print("Best score: ", random_result.best_score_)

# Evaluate the best model on the test set
best_model = random_result.best_estimator_
y_pred = best_model.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred, target_names=mlb.classes_))'''

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END classifier=OneVsRestClassifier(estimator=LogisticRegression()), classifier__estimator__C=4.27022004702574, classifier__estimator__class_weight=None, classifier__estimator__multi_class=ovr, classifier__estimator__penalty=l2, classifier__estimator__random_state=1, classifier__estimator__solver=lbfgs, preprocessor__lemmatized=True, preprocessor__lowercase=False, preprocessor__no_stopwords=False, vectorizer=CountVectorizer(token_pattern='(?u)\\b\\w+\\b'); accuracy: (test=nan) f1_weighted: (test=nan) precision_weighted: (test=nan) recall_weighted: (test=nan) total time= 3.0min


KeyboardInterrupt: 

### Model Optimization - Optuna

In [18]:
import optuna
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import cross_val_score

# Assuming MainPipeline is defined in module p
class MainPipeline(BaseEstimator, TransformerMixin):
    def __init__(self, print_output=True, no_stopwords=True, custom_stopwords=[], convert_diacritics=True, 
                 lowercase=True, lemmatized=True, list_pos=["n","v","a","r","s"], pos_tags_list="no_pos", 
                 tokenized_output=False):
        self.print_output = print_output
        self.no_stopwords = no_stopwords
        self.custom_stopwords = custom_stopwords
        self.convert_diacritics = convert_diacritics
        self.lowercase = lowercase
        self.lemmatized = lemmatized
        self.list_pos = list_pos
        self.pos_tags_list = pos_tags_list
        self.tokenized_output = tokenized_output

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.apply(lambda x: p.main_pipeline(x, print_output=self.print_output, no_stopwords=self.no_stopwords,
                                               custom_stopwords=self.custom_stopwords, convert_diacritics=self.convert_diacritics,
                                               lowercase=self.lowercase, lemmatized=self.lemmatized, list_pos=self.list_pos,
                                               pos_tags_list=self.pos_tags_list, tokenized_output=self.tokenized_output))

# Define the objective function for Optuna
def objective(trial):
    pipeline = Pipeline([
        ('preprocessor', MainPipeline(
            no_stopwords=trial.suggest_categorical('preprocessor__no_stopwords', [True, False]),
            convert_diacritics=trial.suggest_categorical('preprocessor__convert_diacritics', [True, False]),
            lowercase=trial.suggest_categorical('preprocessor__lowercase', [True, False]),
            lemmatized=trial.suggest_categorical('preprocessor__lemmatized', [True, False])
        )),
        ('vectorizer', TfidfVectorizer(max_features=trial.suggest_categorical('vectorizer__max_features', [5000, 10000]))),
        ('classifier', OneVsRestClassifier(LogisticRegression(
            solver=trial.suggest_categorical('classifier__estimator__solver', ['lbfgs']),
            C=trial.suggest_float('classifier__estimator__C', 0.1, 10),
            penalty=trial.suggest_categorical('classifier__estimator__penalty', ['l2']),
            class_weight=trial.suggest_categorical('classifier__estimator__class_weight', [None]),
            random_state=1,
            multi_class="ovr"
        )))
    ])
    
    scorer = make_scorer(f1_score, average='micro')
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring=scorer).mean()
    return score

# Create the study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best parameters and best score
print("Best parameters found: ", study.best_params)
print("Best F1 score: ", study.best_value)

ImportError: cannot import name 'SklearnPipeline' from 'optuna.integration' (unknown location)

In [45]:
'''import optuna
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import classification_report

# Define the objective function
def objective(trial):
    # Define the hyperparameters to tune
    no_stopwords = trial.suggest_categorical('preprocessor__no_stopwords', [True, False])
    lemmatized = trial.suggest_categorical('preprocessor__lemmatized', [True, False])
    lowercase = trial.suggest_categorical('preprocessor__lowercase', [True, False])
    vectorizer = trial.suggest_categorical('vectorizer', [bigram_tfidf_vectorizer, bigram_bow_vectorizer])
    C = trial.suggest_loguniform('classifier__estimator__C', 0.1, 10)
    class_weight = trial.suggest_categorical('classifier__estimator__class_weight', [None, 'balanced'])
    
    # Set the parameters
    hermetic_classifier.set_params(
        preprocessor__no_stopwords=no_stopwords,
        preprocessor__lemmatized=lemmatized,
        preprocessor__lowercase=lowercase,
        vectorizer=vectorizer,
        classifier=class_ovr,
        classifier__estimator__solver='lbfgs',
        classifier__estimator__C=C,
        classifier__estimator__penalty='l2',
        classifier__estimator__class_weight=class_weight,
        classifier__estimator__random_state=1,
        classifier__estimator__multi_class="ovr"
    )
    
    # Perform cross-validation
    scores = cross_val_score(hermetic_classifier, X_train, y_train, cv=3, scoring='f1_weighted')
    print("Cross-Validation Scores:", scores)
    return scores.mean()

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Check the results
print("Best parameters found: ", study.best_params)
print("Best score: ", study.best_value)

# Set the best parameters to the classifier
hermetic_classifier.set_params(**study.best_params)

# Fit the model on the training data
hermetic_classifier.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred = hermetic_classifier.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred, target_names=mlb.classes_))'''

[I 2024-12-14 03:09:10,692] A new study created in memory with name: no-name-b98522d1-4a38-4817-aae7-e0df45f93f71
[W 2024-12-14 03:16:08,128] Trial 0 failed with parameters: {'preprocessor__no_stopwords': False, 'preprocessor__lemmatized': True, 'preprocessor__lowercase': True, 'vectorizer': CountVectorizer(token_pattern='(?u)\\b\\w+\\b'), 'classifier__estimator__C': 0.8344491386665716, 'classifier__estimator__class_weight': 'balanced'} because of the following error: The value nan is not acceptable.
[W 2024-12-14 03:16:08,140] Trial 0 failed with value nan.


Cross-Validation Scores: [nan nan nan]


### Comparing against Dummy

In [None]:
preprocessor = MainPipeline()
dummy_classifier = ClassifierChain(DummyClassifier())
bigram_bow_vectorizer = CountVectorizer(ngram_range=(1,1), token_pattern=r"(?u)\b\w+\b")
hermetic_classifier3 = HermeticClassifier(preprocessor, bigram_bow_vectorizer, dummy_classifier)

parameter_grid = {'preprocessor__no_stopwords':[True],
                  'preprocessor__lemmatized':[True],
                  'preprocessor__lowercase':[True],                  
                  'vectorizer':[bigram_bow_vectorizer],
                  'classifier':[dummy_classifier],
                  'classifier__base_estimator__strategy':["most_frequent", "prior", "stratified", "uniform", "constant"]}

grid_search2 = GridSearchCV(hermetic_classifier3, param_grid=parameter_grid, scoring=scores, verbose=4, refit="f1_weighted")

grid_result2 = grid_search2.fit(reviews["Review"], binarised_labels)

In [None]:
## Best results
for score in scores:
    print("{} = {}".format(score,round(grid_result2.cv_results_['mean_test_{}'.format(score)][grid_result2.best_index_],3)))

## Neural Networks

In this section, we attempt to use CNN and RNN, using pre-trained GLOvE embeddings.

In [None]:
# Embedding matrix

# Loading pre-trained GloVe embeddings
embedding_index = {}
with open('path/to/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Creating the embedding matrix
vocab_size = len(reviews['Preproc_Review'])
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word_index.items():  # word_index is a dictionary mapping words to their indices
    if i < vocab_size:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [24]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dropout, Dense, Flatten, Input
from tensorflow.keras.models import Model

# Parameters
max_sequence_length = 100
vocab_size = 20000
embedding_dim = 100
embedding_matrix = np.random.rand(vocab_size, embedding_dim)
all_cuisines = [cuisine for sublist in reviews['Cuisines'] for cuisine in sublist]
num_classes = len(all_cuisines)

model = tf.keras.Sequential()

model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(128, activation='relu'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [25]:
model.fit(X_train, y_train, epochs=10, batch_size=32)

ValueError: Invalid dtype: object

In [None]:
# Assuming y_test and y_pred are the true and predicted labels respectively

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Precision, Recall, F1-Score
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')

# Hamming Loss
hamming = hamming_loss(y_test, y_pred)

# Subset Accuracy
subset_accuracy = np.mean(np.all(y_test == y_pred, axis=1))

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Hamming Loss: {hamming}")
print(f"Subset Accuracy: {subset_accuracy}")