<a href="https://colab.research.google.com/github/DavidGlezGmz/Natural-Language-Processing-NLP-/blob/main/Tweets_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Unsupervised NLP using the Google Universal Sentence Encoder (GUSE) and K-means

$ r_{i,k} = \frac{e^{-\beta \| x_i - m_k \|^2}}{\sum_{s=1}^{N} e^{-\beta \| x_s - m_k \|^2}} $

$ m_k = \frac{\sum_{i=1}^{N} r_{i,k} \, x_i}{\sum_{s=1}^{N} r_{s,k}} $

In [None]:
def Responsability(X,means,beta=1e-3,*args,**kwargs):
  """Negated soft-assignment responsibilities of every point for every mean.

  For each mean m_k the weight of point x_i is exp(-beta * ||x_i - m_k||^2),
  normalised by the sum of those weights over all points. The result is
  negated so that `np.argmin` over axis 1 selects the cluster with the
  highest responsibility, matching how `K_Means.Predict` consumes a
  distance function.

  Args:
    X: array of shape (n_points, n_features).
    means: iterable of cluster centres, each of shape (n_features,).
    beta: stiffness of the exponential weighting.
    *args, **kwargs: accepted and ignored.

  Returns:
    Array of shape (n_points, n_means) holding the negated responsibilities.
  """
  responsibilities=[]

  for mean in means:
    diff = X - mean
    dist = np.sum(diff ** 2, axis=1, keepdims=True)

    exponent = -(beta * dist)
    if exponent.size:
      # Shift so the largest exponent is 0. The ratio is mathematically
      # unchanged, but exp() can no longer underflow to 0 for every point,
      # which previously produced 0/0 = NaN for means far from all the data.
      exponent = exponent - np.max(exponent)

    numerator = np.exp(exponent)
    denominator = np.sum(numerator)
    # Negated on purpose: callers take argmin, so "smaller is better".
    responsibilities.append(-numerator / denominator)
  return np.hstack(responsibilities)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
import numpy as np
import numpy.linalg as lg
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import os
import re

In [None]:
# TF-Hub handle of the Universal Sentence Encoder (version 4) model.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load the model once at module level; `embed` calls this object.
embedder = hub.load(module_url)

In [None]:
def TextPreProcessing(sentence):
  """Normalise a tweet before embedding.

  Lower-cases the text, removes http/https URLs, strips ASCII punctuation
  and collapses runs of whitespace into single spaces.

  Args:
    sentence: the raw tweet text (str).

  Returns:
    The cleaned text as a str (may be empty).
  """
  text = sentence.lower()
  # URLs have to be removed BEFORE punctuation: once ':' and '/' are stripped
  # the pattern can no longer match and the URL would survive as a glued
  # token such as "httpstcoabc". Only the URL token itself is dropped, not
  # the remainder of the line.
  text = re.sub(r'https?://\S+', '', text)
  text = "".join(char for char in text if char not in string.punctuation)
  # Removing a URL leaves a gap; split/join also trims both ends.
  return " ".join(text.split())

In [None]:
def embed(sentence):
  """Run the module-level `embedder` on a one-element batch holding `sentence`.

  Returns whatever `embedder` returns for that single-item list.
  """
  return embedder([sentence])

In [None]:
def data_load(filename):
  """Read a CSV (ISO-8859-1 encoded, header on the first row) and return its `Tweet` column.

  Args:
    filename: path, URL or file-like object accepted by `pd.read_csv`.

  Returns:
    The `Tweet` column as a pandas Series.
  """
  return pd.read_csv(filename, encoding='iso-8859-1', header=0).Tweet

In [None]:
def LoadVectors(X):
  """Embed every sentence in X and keep it alongside its original text.

  Args:
    X: iterable of raw sentences (e.g. a pandas Series or a list).

  Returns:
    dict mapping the 0-based position of each sentence to
    {'sentence': <original text>, 'vector': <embed() output for the cleaned text>}.
  """
  Dict_of_Sentences = dict()
  # Iterate by position rather than X[i]: on a pandas Series X[i] is a
  # label lookup, which fails when the index is not 0..n-1 (for example
  # after rows were filtered out). Keys stay 0..n-1 either way.
  for position, sentence in enumerate(X):
    Dict_of_Sentences[position] = {
        'sentence': sentence,
        'vector': embed(TextPreProcessing(sentence)),
    }
  return Dict_of_Sentences

In [None]:
# Load the Tweet column from the CSV hosted on GitHub (requires network access).
X = data_load('https://raw.githubusercontent.com/DavidGlezGmz/K-Means/main/data_elonmusk2.csv')

In [None]:
# Display the loaded Tweet column.
X

In [None]:
# Shape of the loaded Tweet column.
X.shape

In [None]:
# Embed every tweet. LoadVectors calls the encoder once per tweet.
My_NLP_dict = LoadVectors(X)

In [None]:
# Displays the whole dict (every sentence together with its embedding), so the output can be very long.
My_NLP_dict

In [None]:
def Distance(X, means, *args, **kwargs):
  """Squared Euclidean distance from every row of X to every mean.

  The notebook uses `Distance` as the default metric of `K_Means` but no
  earlier cell defines it, so defining the class raised NameError on a
  fresh kernel. It is supplied here with the same call shape as
  `Responsability`.
  NOTE(review): squared Euclidean is assumed to be the intended default
  metric; confirm against the original helper.

  Args:
    X: array of shape (n_points, n_features).
    means: iterable of cluster centres, each of shape (n_features,).
    *args, **kwargs: accepted and ignored (K_Means passes `beta` positionally).

  Returns:
    Array of shape (n_points, n_means).
  """
  X = np.asarray(X)
  return np.hstack([
      np.sum((X - mean) ** 2, axis=1, keepdims=True) for mean in means
  ])


class K_Means:
  """K-means clustering with a pluggable point-to-mean scoring function.

  `distance_func(X, means, beta)` must return an (n_points, k) array in
  which a smaller value means a better match; `Predict` assigns each point
  to the argmin column. Means are always updated as the plain average of
  the points currently assigned to each cluster, whichever function is used.
  """

  def __init__(self, k=3, distance_func=Distance, beta=None):
    """Store the number of clusters, the scoring function and its `beta` argument."""
    self.k=k
    self.distance_func=distance_func
    self.beta=beta

  def _update_means(self, X, y_hat):
    """Return the (k, n_features) means of X grouped by the labels in y_hat.

    A cluster that received no points keeps its previous mean instead of
    becoming the NaN that `np.mean` of an empty slice would produce.
    """
    new_means = []
    for j in range(self.k):
      members = X[y_hat == j]
      if len(members):
        new_means.append(np.mean(members, axis=0))
      else:
        new_means.append(self.means[j])
    return np.vstack(new_means)

  def fit(self,X,iterations=5):
    """Fit the cluster means to X, plot the result and return the labels.

    Args:
      X: array of shape (n_points, n_features) with n_points >= k.
      iterations: maximum number of assign/update rounds.

    Returns:
      Array of shape (n_points,) with the final cluster index of each point.

    Side effects:
      Sets `self.means` and draws a matplotlib scatter of the first two
      feature columns coloured by cluster, with the means in black.
    """
    X = np.asarray(X)
    indices = np.arange(X.shape[0])
    # Initialise the means with k distinct, randomly chosen data points.
    sample_indices = np.random.choice(indices,size=self.k,replace=False)
    self.means = X[sample_indices]

    for _ in range(iterations):
      y_hat = self.Predict(X)
      new_means = self._update_means(X, y_hat)
      converged = np.array_equal(new_means, self.means)
      self.means = new_means
      if converged:
        # Identical means yield identical assignments, so further rounds
        # could not change anything.
        break

    y_hat=self.Predict(X)
    plt.figure(figsize=(10,7))
    plt.scatter(X[:,0],X[:,1],s=1,c=y_hat)
    plt.scatter(self.means[:,0],self.means[:,1], c='k',s=10)

    return y_hat

  def Predict(self,X):
    """Return, for each row of X, the index of the best-scoring current mean."""
    dist=self.distance_func(X,self.means,self.beta)
    y_hat = np.argmin(dist,axis=1)
    return y_hat

In [None]:
# Flatten each tweet's embedding to one row and stack the rows into a single matrix.
vector_tweets = np.vstack([
    np.hstack(My_NLP_dict[row]['vector'].numpy()) for row in range(len(X))
])

In [None]:
# Inspect the embedding row at index 1.
vector_tweets[1]

In [None]:
# Inspect embedding rows 1 and 2.
vector_tweets[1:3]

In [None]:
# Hard K-means with 12 clusters and the default distance function.
tweetcluster = K_Means(k=12)

In [None]:
# Fit on the embeddings (iterations=20). fit also draws a scatter of the first two
# embedding columns and returns the cluster labels, shown here as the cell output.
tweetcluster.fit(vector_tweets, iterations = 20)

In [None]:
# Cluster index of every tweet under the fitted means.
y_hat = tweetcluster.Predict(vector_tweets)

In [None]:
# Display the predicted cluster indices.
y_hat

In [None]:
# Print each tweet next to the cluster index it was assigned.
for row in range(len(X)):
  label = y_hat[row]
  tweet = My_NLP_dict[row]["sentence"]
  print(label, " ", tweet)

In [None]:
# Same clustering, scored with Responsability (beta=1.6) instead of the default distance.
# Responsability returns negated values, so the argmin in K_Means.Predict picks the
# cluster with the highest responsibility.
tweet_cluster_soft = K_Means(k=12, distance_func = Responsability, beta = 1.6)

In [None]:
# Labels from the responsibility-based model. y_hat2 is not referenced by the
# cells that follow; results_df below is built from y_hat.
y_hat2 = tweet_cluster_soft.fit(vector_tweets, iterations = 10)

In [None]:
# Collect the original tweet texts in row order.
sentences = [My_NLP_dict[row]['sentence'] for row in range(len(X))]

In [None]:
# One row per tweet: the tweet text is the index, the single column holds y_hat.
results_df = pd.DataFrame(data=y_hat, index=sentences)

In [None]:
# Display the tweet-to-cluster table.
results_df