# Azure

In [None]:

import random

import numpy
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential

In [None]:
import os

# Read the Azure AI Language endpoint and key from the environment so the real
# secret never has to be pasted into (and saved with) the notebook. The
# original literals are kept as fallbacks, so nothing changes when the
# variables are not set.
endpoint = os.environ.get("LANGUAGE_ENDPOINT", "https://nlpnavadarucalin.cognitiveservices.azure.com/")
key = os.environ.get("LANGUAGE_KEY", "<template>")

# Shared client used by every sentiment-analysis cell below.
client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))

In [None]:
message1 = [
    "By choosing a bike over a car, I’m reducing my environmental footprint. Cycling promotes eco-friendly transportation, and I’m proud to be part of that movement.."]

# Smoke test: score one hand-written sentence with the Azure sentiment API.
result = client.analyze_sentiment(message1, show_opinion_mining=True)

# Keep the successful documents in `docs` and print their sentiment; report
# failed documents instead of dropping them silently.
docs = []
for doc in result:
    if doc.is_error:
        print(f'Sentiment analysis failed: {doc.error.message}')
        continue
    docs.append(doc)
    print(doc.sentiment)

In [None]:
import pandas as pd

# Load the review dataset; later cells read its 'Text' and 'Sentiment' columns.
data = pd.read_csv("data/reviews_mixed.csv")
# Bare expression so the notebook displays the frame as the cell output.
data

In [None]:
# Review texts and their raw sentiment labels as plain Python lists.
list_reviews = data['Text'].to_list()
sentiments_reviews = data['Sentiment'].to_list()
# pd.factorize returns a tuple; a later cell keeps only its first element
# (the integer codes).
# NOTE(review): factorize presumably numbers labels in order of first
# appearance, so which sentiment becomes 1 depends on the row order of the
# CSV. The Azure cell encodes "positive" as 1 — verify the two encodings agree.
ground_truth = pd.factorize(data['Sentiment'])

In [None]:
# Classify every review with the Azure sentiment API and encode the answer as
# 1 for "positive" and 0 for anything else.
predicted = []
for review in list_reviews:
    result_reviews = client.analyze_sentiment([review], show_opinion_mining=True)
    for review_doc in result_reviews:
        if review_doc.is_error:
            # Still record a value for failed documents: `predicted` must stay
            # the same length and order as the ground truth, otherwise the
            # accuracy computation compares misaligned (or unequal-length) lists.
            print(f'Review: {review} -> error: {review_doc.error.message}')
            predicted.append(0)
            continue
        print(f'Review: {review} -> {review_doc.sentiment}')
        predicted.append(1 if review_doc.sentiment == "positive" else 0)

In [None]:
print(predicted)
# ground_truth starts out as the tuple returned by pd.factorize; keep only its
# first element (the integer codes). The isinstance guard makes this cell safe
# to re-run: without it a second run would index into the codes array and
# collapse ground_truth to a single number.
if isinstance(ground_truth, tuple):
    ground_truth = ground_truth[0]
print(ground_truth)

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): reference labels go first.
print(f'Accuracy {accuracy_score(ground_truth, predicted)}')

# k-Means Library

In [None]:
from gensim.models import KeyedVectors

# Load the pretrained GoogleNews Word2Vec vectors (binary word2vec format).
word2vec_model = KeyedVectors.load_word2vec_format("data/GoogleNews-vectors-negative300.bin", binary=True)

# Sentence embedding of the sample message: sum the vectors of the known words
# longer than two characters, then divide by the total number of tokens.
msg = message1[0].split()
result = 0
for word in msg:
    # str.split() already removes surrounding whitespace. Test the cheap
    # length condition first and use the key_to_index dict for membership:
    # scanning the index_to_key list costs O(vocabulary) for every word.
    if len(word) > 2 and word in word2vec_model.key_to_index:
        result += word2vec_model[word]

result = result / len(msg)
print(result)

In [None]:
from random import shuffle

# Random 78/22 train/test split over review positions.
indexes = list(range(len(list_reviews)))
shuffle(indexes)
split_at = int(0.78 * len(indexes))
train_indexes = indexes[:split_at]
# The remainder of the shuffled list is exactly the set of positions not used
# for training; slicing avoids an O(n) `not in` list scan per element.
test_indexes = indexes[split_at:]
train_input = [list_reviews[i] for i in train_indexes]
test_input = [list_reviews[i] for i in test_indexes]
test_output = [ground_truth[i] for i in test_indexes]

In [None]:
import numpy as np


def train_feature_w2v(data_arg):
    """Turn each text into one scalar Word2Vec-based feature.

    For every text, the mean component of the embedding of each known word
    longer than two characters is summed, and the sum is divided by the total
    number of whitespace-separated tokens (unknown/short words count as 0).

    Args:
        data_arg: iterable of strings.

    Returns:
        A NumPy array of shape (len(data_arg), 1), one feature per text.
        Texts without any token (empty or whitespace-only) get 0.0 instead of
        raising ZeroDivisionError.
    """
    result_list = []
    for prop in data_arg:
        list_words = prop.split()
        if not list_words:
            # Nothing to average; avoid dividing by zero.
            result_list.append(0.0)
            continue

        feature = 0
        for word_arg in list_words:
            # Cheap length test first; key_to_index is a dict, so the lookup is
            # O(1) instead of a linear scan of the index_to_key list.
            if len(word_arg) > 2 and word_arg in word2vec_model.key_to_index:
                feature += np.mean(word2vec_model[word_arg])

        result_list.append(feature / len(list_words))

    # Column vector: the scikit-learn style estimators expect 2-D input.
    return np.array(result_list).reshape(-1, 1)

In [None]:
# Replace the raw texts with their scalar Word2Vec features (one column each).
train_input = train_feature_w2v(train_input)
test_input = train_feature_w2v(test_input)

In [None]:
from sklearn.cluster import KMeans

# Two clusters with a fixed random_state, fitted on the training features only.
unsupervisedClassifier = KMeans(n_clusters=2, random_state=0)
unsupervisedClassifier.fit(train_input)

In [None]:
predicted = unsupervisedClassifier.predict(test_input)
print(predicted)

from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): reference labels go first.
# NOTE(review): cluster ids are arbitrary, so cluster 0 need not correspond to
# label 0 — confirm the mapping before trusting this number.
print(f'Accuracy {accuracy_score(test_output, predicted)}')

# Caracteristici Text

#### Word2Vec

In [None]:
# Show the scalar Word2Vec feature computed for every review in the dataset.
print(train_feature_w2v(list_reviews))

#### Bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings; fitted in a later cell.
vectorizer = CountVectorizer()

In [None]:
# Random 78/22 train/test split over review positions.
indexes = list(range(len(list_reviews)))
shuffle(indexes)
split_at = int(0.78 * len(indexes))
train_indexes = indexes[:split_at]
train_input = [list_reviews[i] for i in train_indexes]
# The tail of the shuffled list is exactly the positions not used for
# training; slicing avoids an O(n) `not in` list scan per element.
test_input = [list_reviews[i] for i in indexes[split_at:]]

In [None]:
# Fit the vectorizer on the training texts only, then reuse that fitted
# vectorizer to transform the test texts.
train_input = vectorizer.fit_transform(train_input)
test_input = vectorizer.transform(test_input)

In [None]:
# Inspect the container type returned by fit_transform.
print(type(train_input))

In [None]:
# Summary of the fitted bag-of-words model: vocabulary size and matrix shape.
print("vocab size: ", len(vectorizer.vocabulary_), " words")
print("trainFeatures shape: ", train_input.shape)

# Last 20 vocabulary entries and the first 3 feature rows.
print('some words of the vocab: ', vectorizer.get_feature_names_out()[-20:])
print('some features: ', train_input.toarray()[:3])

#### TF-IDF

In [None]:
# Random 78/22 train/test split over review positions.
indexes = list(range(len(list_reviews)))
shuffle(indexes)
split_at = int(0.78 * len(indexes))
train_indexes = indexes[:split_at]
train_input = [list_reviews[i] for i in train_indexes]
# The tail of the shuffled list is exactly the positions not used for
# training; slicing avoids an O(n) `not in` list scan per element.
test_input = [list_reviews[i] for i in indexes[split_at:]]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer capped at 50 features (max_features=50).
vectorizer = TfidfVectorizer(max_features=50)

# Fit on the training texts only; the test texts reuse the fitted vectorizer.
trainFeatures = vectorizer.fit_transform(train_input)
testFeatures = vectorizer.transform(test_input)

# First 10 vocabulary entries and the first 3 feature rows.
print('vocab: ', vectorizer.get_feature_names_out()[:10])
print('features: ', trainFeatures.toarray()[:3])

## Extra 

#### CBOW

In [None]:
# Shuffle the review positions and keep the first 78% as the training texts
# (no test split is built in this cell).
indexes = [i for i in range(len(list_reviews))]
shuffle(indexes)
train_indexes = indexes[:int(0.78 * len(indexes))]
train_input = [list_reviews[i] for i in train_indexes]

In [None]:
from nltk import word_tokenize

# Tokenize every training review into a list of tokens.
train_input = [word_tokenize(review) for review in train_input]

In [None]:
from gensim.models import Word2Vec

# CBOW model (sg=0) over the tokenized reviews, window of 5, keeping every
# token (min_count=1).
cbow_model = Word2Vec(train_input, min_count=1, window=5, sg=0)

# NOTE(review): the constructor presumably already trains when it is given a
# corpus, so this call looks like a second training pass (100 more epochs) —
# confirm this is intended.
cbow_model.train(train_input, total_examples=len(train_input), epochs=100)
print(cbow_model.wv)

#### Word Mover's Distance

In [None]:
propozitia1 = "Eu sunt Calin."
propozitia2 = "Eu nu sunt Calin, sunt Marius."

# wmdistance expects each document as a list of word tokens. A raw string is
# iterated character by character, so the distance would be computed between
# bags of letters instead of bags of words. Split into words and drop the
# punctuation attached to them before comparing.
tokens1 = [word.strip(".,!?;:") for word in propozitia1.split()]
tokens2 = [word.strip(".,!?;:") for word in propozitia2.split()]

distance = word2vec_model.wmdistance(tokens1, tokens2)
print(distance)

# K-means manual

In [None]:
import math


def euclidian_distance(a, b):
    """Return the Euclidean distance between two sequences.

    Elements are paired with zip, so any surplus elements of the longer
    sequence are ignored. The accumulated squared differences are reduced with
    np.sum before taking the square root, which also covers elements that are
    themselves arrays.
    """
    squared_total = 0
    for left, right in zip(a, b):
        delta = left - right
        squared_total += delta ** 2
    return math.sqrt(np.sum(squared_total))

In [None]:
def stop(old_c, c, no_iteration):
    """Tell the k-means loop whether to terminate.

    Returns True once more than 10000 iterations have run; otherwise returns
    the result of ``old_c == c``, i.e. whether the cluster assignments are
    unchanged since the previous iteration.
    """
    if no_iteration > 10000:
        return True
    return old_c == c


class KMeans:
    """Minimal hand-written k-means clustering.

    Note: this class is named ``KMeans``, so defining it rebinds that name in
    the session; re-import any other ``KMeans`` before using it afterwards.

    Attributes:
        input_size: value passed to the constructor; stored but not used by
            fit or predict.
        number_of_classes: number of clusters (k).
        centroids: float array of shape (k, n_features) after fit, else None.
    """

    def __init__(self, input_size, number_of_classes):
        self.input_size = input_size
        self.centroids = None
        self.number_of_classes = number_of_classes

    @staticmethod
    def _as_points(data_arg):
        """Return data_arg as a 2-D float array (1-D input becomes a column)."""
        points = np.asarray(data_arg, dtype=float)
        if points.ndim == 1:
            points = points.reshape(-1, 1)
        return points

    @staticmethod
    def _nearest(centroids, point):
        """Return the index of the centroid closest to point (first on ties)."""
        distances = np.linalg.norm(centroids - point, axis=1)
        return int(np.argmin(distances))

    def fit(self, train_data_arg):
        """Cluster the training points.

        Centroids are seeded with ``number_of_classes`` distinct random rows;
        points are then reassigned and centroids recomputed until ``stop``
        reports that the assignments no longer change (or the iteration cap is
        hit). A cluster that ends up empty is reseeded with a random point.

        Args:
            train_data_arg: array-like of shape (n_samples, n_features), or a
                1-D sequence of scalars.

        Raises:
            ValueError: from random.sample when there are fewer samples than
                clusters.
        """
        points = self._as_points(train_data_arg)
        seed_rows = random.sample(range(len(points)), self.number_of_classes)
        # Float array of shape (k, n_features). The previous (k, 1, n_features)
        # layout broke the distance computation for more than one feature, and
        # an integer dtype would truncate the centroid means.
        self.centroids = points[seed_rows].copy()

        no_iteration = 0
        old_c = None
        c = []
        while not stop(old_c, c, no_iteration):
            old_c = c
            no_iteration += 1
            # Assignment step: plain list of ints so that stop() can compare
            # consecutive assignments with ==.
            c = [self._nearest(self.centroids, point) for point in points]

            # Update step.
            labels = np.asarray(c)
            for j in range(len(self.centroids)):
                members = points[labels == j]
                if len(members):
                    self.centroids[j] = members.mean(axis=0)
                else:
                    self.centroids[j] = points[random.randint(0, len(points) - 1)]

    def predict(self, test_data_arg):
        """Return, for each point, the index of its nearest centroid.

        Args:
            test_data_arg: array-like of shape (n_samples, n_features), or a
                1-D sequence of scalars.

        Returns:
            A list of int cluster indices, one per input point.

        Raises:
            RuntimeError: if called before the centroids have been set.
        """
        if self.centroids is None:
            raise RuntimeError("KMeans.predict called before fit")
        centroids = np.asarray(self.centroids, dtype=float)
        return [self._nearest(centroids, point) for point in self._as_points(test_data_arg)]

In [None]:
from random import shuffle

# Random 75/25 train/test split over review positions.
indexes = list(range(len(list_reviews)))
shuffle(indexes)
split_at = int(0.75 * len(indexes))
train_indexes = indexes[:split_at]
# The tail of the shuffled list is exactly the positions not used for
# training; slicing avoids an O(n) `not in` list scan per element.
test_indexes = indexes[split_at:]
train_input = [list_reviews[i] for i in train_indexes]
test_input = [list_reviews[i] for i in test_indexes]
test_output = [ground_truth[i] for i in test_indexes]

# Replace the raw texts with their scalar Word2Vec features.
train_input = train_feature_w2v(train_input)
test_input = train_feature_w2v(test_input)

In [None]:
# Hand-written k-means: 1 input feature, 2 clusters.
unsupervisedClassifier = KMeans(1, 2)
unsupervisedClassifier.fit(train_input)
predicted = unsupervisedClassifier.predict(test_input)

from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): reference labels go first.
# NOTE(review): cluster ids are arbitrary, so cluster 0 need not correspond to
# label 0 — confirm the mapping before trusting this number.
print(f'Accuracy {accuracy_score(test_output, predicted)}')

# Alternative la K-means

In [None]:
from random import shuffle

# Random 75/25 train/test split over review positions.
indexes = list(range(len(list_reviews)))
shuffle(indexes)
split_at = int(0.75 * len(indexes))
train_indexes = indexes[:split_at]
# The tail of the shuffled list is exactly the positions not used for
# training; slicing avoids an O(n) `not in` list scan per element.
test_indexes = indexes[split_at:]
train_input = [list_reviews[i] for i in train_indexes]
test_input = [list_reviews[i] for i in test_indexes]
test_output = [ground_truth[i] for i in test_indexes]

# Replace the raw texts with their scalar Word2Vec features.
train_input = train_feature_w2v(train_input)
test_input = train_feature_w2v(test_input)

In [None]:
from matplotlib import pyplot as plt
from sklearn.cluster import MeanShift

# Apply Mean Shift clustering with default settings to the training features
ms = MeanShift()
ms.fit(train_input)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

# Number of clusters = number of distinct labels assigned to the training data
n_clusters = len(np.unique(labels))

print("Number of estimated clusters:", n_clusters)
print(cluster_centers)
# Plot the clusters: the first feature column on the x axis, every point drawn
# at y = 0 and coloured by its cluster label; cluster centres are red crosses.
# NOTE(review): the y values are all zeros, so the 'Feature 2' axis label does
# not correspond to a real feature.
plt.figure(figsize=(8, 6))
plt.scatter(train_input[:, 0], np.zeros_like(train_input[:, 0]), c=labels, cmap='viridis')
plt.scatter(cluster_centers[:, 0], np.zeros_like(cluster_centers), marker='x', color='red', s=100, linewidths=4)
plt.title('Estimated number of clusters: {}'.format(n_clusters))
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

In [None]:
predicted = ms.predict(test_input)
from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): reference labels go first.
# NOTE(review): Mean Shift chooses its own number of clusters and their ids
# are arbitrary, so they need not line up with the 0/1 labels — confirm the
# mapping before trusting this number.
print(f'Accuracy {accuracy_score(test_output, predicted)}')

In [None]:
from random import shuffle

# Random 75/25 train/test split over review positions.
indexes = list(range(len(list_reviews)))
shuffle(indexes)
split_at = int(0.75 * len(indexes))
train_indexes = indexes[:split_at]
# The tail of the shuffled list is exactly the positions not used for
# training; slicing avoids an O(n) `not in` list scan per element.
test_indexes = indexes[split_at:]
train_input = [list_reviews[i] for i in train_indexes]
test_input = [list_reviews[i] for i in test_indexes]
test_output = [ground_truth[i] for i in test_indexes]

# Replace the raw texts with their scalar Word2Vec features.
train_input = train_feature_w2v(train_input)
test_input = train_feature_w2v(test_input)

In [None]:
from sklearn.cluster import SpectralClustering

# Two clusters, matching the two sentiment classes: with n_clusters=1 every
# sample lands in the same cluster and the accuracy below only measures the
# share of label 0.
spectral_clustering = SpectralClustering(n_clusters=2, affinity='nearest_neighbors', random_state=0)

# fit_predict refits the model on the data it is given, so a preceding
# fit(train_input) would be discarded; cluster the test features directly.
predicted = spectral_clustering.fit_predict(test_input)

In [None]:
from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): reference labels go first.
# NOTE(review): cluster ids are arbitrary, so cluster 0 need not correspond to
# label 0 — confirm the mapping before trusting this number.
print(f'Accuracy {accuracy_score(test_output, predicted)}')