###
Remember to upload the data to the folder before running the program
###

In [1]:
!pip install langdetect

Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |▍                               | 10kB 15.9MB/s eta 0:00:01[K     |▊                               | 20kB 23.0MB/s eta 0:00:01[K     |█                               | 30kB 12.4MB/s eta 0:00:01[K     |█▍                              | 40kB 11.0MB/s eta 0:00:01[K     |█▊                              | 51kB 10.5MB/s eta 0:00:01[K     |██                              | 61kB 10.7MB/s eta 0:00:01[K     |██▍                             | 71kB 10.1MB/s eta 0:00:01[K     |██▊                             | 81kB 10.0MB/s eta 0:00:01[K     |███                             | 92kB 9.5MB/s eta 0:00:01[K     |███▍                            | 102kB 9.7MB/s eta 0:00:01[K     |███▊                            | 112kB 9.7MB/s eta 0:00:01[K     |████                            | 122kB 9.7MB/s eta

In [None]:
import itertools
import numpy as np
import os
import re
import string
import tensorflow as tf
import tqdm
from langdetect import detect
import pandas as pd

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Activation, Dense, Dot, Embedding, Flatten, GlobalAveragePooling1D, Reshape

SEED = 42 
AUTOTUNE = tf.data.AUTOTUNE

def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')

europe_csv = pd.read_csv('/european_tweets.csv')
english_tweets = []
for content in europe_csv["Text"]:
    if detect(content) == 'en':
        english_tweets.append(content.encode('utf-8'))

tweet_ds = tf.data.Dataset.from_tensor_slices(english_tweets)

# Define the vocabulary size and number of words in a sequence.
vocab_size = 10000
sequence_length = 75

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

vectorize_layer.adapt(tweet_ds.batch(1024))

# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

def vectorize_text(text):
  text = tf.expand_dims(text, -1)
  return tf.squeeze(vectorize_layer(text))

# Vectorize the data in text_ds.
tweet_vector_ds = tweet_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()
sequences = list(tweet_vector_ds.as_numpy_iterator())
print(len(sequences))

for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

['', '[UNK]', 'the', 'to', 'and', 'of', 'you', 'coronavirus', 'a', 'in', 'covid19', 'for', 'we', 'this', 'can', 'is', 'on', 'our', 'if', 'have']
3536
[ 222    3   28    9  368  355   88  894    2  391   98  240  487    3
  422  107    4   50 1985  260 2586  193   25 8965    3 8315 7166    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0] => ['great', 'to', 'be', 'in', 'northern', 'ireland', 'today', 'seeing', 'the', 'incredible', 'work', 'being', 'done', 'to', 'tackle', 'covid', 'and', 'get', 'jabs', 'into', 'arms', '–', 'from', 'belfast', 'to', 'enniskillen', 'httpstcofq3zrvorge', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 

In [None]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence, 
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples 
    # with positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1, 
          num_sampled=num_ns, 
          unique=True, 
          range_max=vocab_size, 
          seed=SEED, 
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [None]:
targets, contexts, labels = generate_training_data(
    sequences=sequences, 
    window_size=2, 
    num_ns=4, 
    vocab_size=vocab_size, 
    seed=SEED)
print(len(targets), len(contexts), len(labels))

100%|██████████| 3536/3536 [00:08<00:00, 422.62it/s]

57072 57072 57072





In [None]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset shapes: (((1024,), (1024, 5, 1)), (1024, 5)), types: ((tf.int32, tf.int64), tf.int64)>


In [None]:
class Word2Vec(Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = Embedding(vocab_size, 
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding", )
    self.context_embedding = Embedding(vocab_size, 
                                       embedding_dim, 
                                       input_length=num_ns+1)
    self.dots = Dot(axes=(3,2))
    self.flatten = Flatten()

  def call(self, pair):
    target, context = pair
    we = self.target_embedding(target)
    ce = self.context_embedding(context)
    dots = self.dots([ce, we])
    return self.flatten(dots)

In [None]:
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [None]:
embedding_dim = 128
num_ns = 4
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f20b5824d50>

In [None]:
import io
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')
out_m.write("header" + "\n")

for index, word in enumerate(vocab):
  if  index == 0: continue # skip 0, it's padding.
  vec = weights[index] 
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

try:
  from google.colab import files
  files.download('vectors.tsv')
  files.download('metadata.tsv')
except Exception as e:
  pass

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

TFIDF code start here


In [2]:
!git clone https://github.com/Data-Mining-2021/project.git

Cloning into 'project'...
remote: Enumerating objects: 194, done.[K
remote: Counting objects: 100% (194/194), done.[K
remote: Compressing objects: 100% (132/132), done.[K
remote: Total 194 (delta 81), reused 116 (delta 43), pack-reused 0[K
Receiving objects: 100% (194/194), 19.98 MiB | 10.27 MiB/s, done.
Resolving deltas: 100% (81/81), done.


In [11]:
import pandas as pd
from langdetect import detect
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from langdetect import detect

europe_csv = pd.read_csv('/content/project/regions/Europe/europe_tweets.csv')

vectorizer = TfidfVectorizer(ngram_range=(2,2),stop_words={'english'})
X = vectorizer.fit_transform(europe_csv["Text"].values.astype('U'))
print(vectorizer.vocabulary_)
print(X.shape)
svd = TruncatedSVD(100)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X)
print(X)


explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
    int(explained_variance * 100)))
print()

(13980, 112553)
[[ 1.91208777e-02  8.78647199e-02  1.38549830e-02 ... -5.94216306e-02
   1.46601402e-01 -1.56147150e-01]
 [ 2.29458757e-02  3.40233636e-02  1.51057228e-02 ... -1.10469274e-01
  -7.99969915e-02  4.52872934e-02]
 [ 5.24402908e-03  1.00321662e-02  2.63923005e-02 ... -6.05785782e-02
  -2.48624935e-01 -2.67448312e-02]
 ...
 [ 4.54708118e-03  1.53097978e-02  1.63224522e-02 ... -5.26815354e-02
  -4.88399582e-02  7.86297011e-02]
 [ 1.22919940e-06  5.28240520e-06  9.31185636e-06 ...  9.68657015e-02
  -2.63409379e-01  3.36302571e-01]
 [-1.10533153e-07  1.25057793e-07  8.71798698e-07 ... -2.08314117e-01
   4.76582620e-02 -1.33347763e-01]]
Explained variance of the SVD step: 15%



In [14]:
import re

# https://stackoverflow.com/a/49146722/330558
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

european_csv = pd.read_csv('/content/project/regions/Europe/europe_tweets.csv')

european_english = []

# filter out non-english tweets - also checks for NaN and tweets only containing emojis
european_english = [content.encode('utf-8') for content in european_csv['Text'] 
                    if content == content and remove_emoji(content).strip() and detect(content) == 'en']





In [58]:
tweets = european_csv['Text'].tolist()
users = european_csv['Username'].tolist()
users_list = list(set(users))
vectorizer = TfidfVectorizer(ngram_range=(2,2),max_df = 0.8, stop_words={'english'})
users_tweet_vectorizer = {}

for user in users_list:
  users_tweet_vectorizer[user] = []

for index, tweet in enumerate(tweets):
  if tweet == tweet and remove_emoji(tweet).strip() and detect(tweet) == 'en':
    tweet = tweet.encode('utf-8')
    users_tweet_vectorizer[users[index]].append(tweet)


In [59]:
svd = TruncatedSVD(50)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

for user in users_tweet_vectorizer.keys():
  X = vectorizer.fit_transform(users_tweet_vectorizer[user])
  X = lsa.fit_transform(X)
  users_tweet_vectorizer[user] = X
  print(X.shape)

(2279, 50)
(933, 50)
(235, 50)
(4270, 50)
(100, 50)
(5343, 50)
(54, 50)


In [62]:
from sklearn.cluster import KMeans
import numpy as np

km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1,
                verbose=False)

tweets = np.vstack(users_tweet_vectorizer.values())

tweets = np.array(tweets)
print(tweets.shape)
km.fit(tweets)
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(5):
  print("Cluster %d:" % i, end='')
  for ind in order_centroids[i, :10]:
    print(' %s' % terms[ind], end='')
  print()

  import sys


(13214, 50)
Cluster 0: with the together with the new president joebiden ready to governments and the pandemic of our and eu_commission the us
Cluster 1: overcome the response to exchange of of views the health health response on the the european to overcome the pandemic
Cluster 2: of the to the response to full support overcome the with president on the the joint let keep the pandemic
Cluster 3: of the the eu for the in the at the and the on the heart of the heart to the
Cluster 4: for the to working forward to president joebiden the us the next of our better future on the eu and
