# Imports

In [121]:
import pandas as pd
import os
import json
import torch
from sklearn.decomposition import PCA
from tqdm import tqdm
import plotly.graph_objects as go
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Setup

In [122]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Input Parameters

In [123]:
PATH = "twitter-stream/2021/06/01/00/29.json"


In [124]:
tqdm.pandas()
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [125]:
list_tweet = []
with open(PATH) as f:
        for line in f.readlines():
            dic=json.loads(line)
            list_tweet.append(dic)

In [163]:
df_twitter = pd.DataFrame(list_tweet)
df_twitter = df_twitter.dropna(subset=["text"]).reset_index()
df_twitter = df_twitter.head(500)
df_twitter["text"] = df_twitter["text"].astype("str")
df_twitter = df_twitter[["id","text","source"]]
df_twitter

Unnamed: 0,id,text,source
0,1.399614e+18,MEU DEUS EU QJERO ME MATAT,"<a href=""http://twitter.com/download/android"" ..."
1,1.399614e+18,الواحد اذا هو مجبور يتعامل مع هالطاقات السلبية...,"<a href=""http://twitter.com/download/iphone"" r..."
2,1.399614e+18,RT @Hafizsaaid33: #تازه\nد آزادۍ او خپلواکۍ په...,"<a href=""http://twitter.com/download/android"" ..."
3,1.399614e+18,せんせー\n俺が一位になる,"<a href=""http://twittbot.net/"" rel=""nofollow"">..."
4,1.399614e+18,@Sssfreshx それは見てねーんじゃね？w,"<a href=""http://twitter.com/download/android"" ..."
...,...,...,...
495,1.399614e+18,RT @YnahGives: ₱ 200 GCASH\n\n— follow me &amp...,"<a href=""http://twitter.com/download/android"" ..."
496,1.399614e+18,Pernah gak sih elu capek update mulu,"<a href=""http://twitter.com/download/android"" ..."
497,1.399614e+18,RT @MissinCurfew: ⚠️ RT FOR A CHANCE TO WIN ⚠️...,"<a href=""https://mobile.twitter.com"" rel=""nofo..."
498,1.399614e+18,Next month no vex,"<a href=""http://twitter.com/download/android"" ..."


In [141]:
#list_embeddings = [embedding for embedding in map(get_embedding, df_twitter["text"].tolist())]
#list_embeddings

# Embedding

In [164]:
ids = torch.tensor(tokenizer.encode(df_twitter["text"].tolist(), add_special_tokens=False)).unsqueeze(0)
list_embeddings = model(ids)[0][0].tolist()

In [165]:
pca = PCA(n_components=2)
pca.fit(list_embeddings)
feature = pca.transform(list_embeddings)
feature

array([[-8.42431481e+01,  7.52021779e+00],
       [ 2.40221719e+02,  1.08051288e+02],
       [ 2.35159690e+02,  1.10351759e+02],
       [ 2.26596214e+02,  1.12776357e+02],
       [ 2.23018068e+02,  1.13750276e+02],
       [ 2.17301322e+02,  1.14882253e+02],
       [ 2.12485618e+02,  1.16060178e+02],
       [ 2.09236777e+02,  1.17076212e+02],
       [ 2.07544275e+02,  1.17843214e+02],
       [ 2.06862024e+02,  1.18363380e+02],
       [ 2.10979300e+02,  1.18080970e+02],
       [ 2.12570019e+02,  1.17970930e+02],
       [ 2.15716780e+02,  1.17325801e+02],
       [ 2.19511206e+02,  1.16722946e+02],
       [ 2.18588256e+02,  1.16351824e+02],
       [ 2.22457861e+02,  1.15011389e+02],
       [ 2.19622687e+02,  1.14682209e+02],
       [ 2.17697159e+02,  1.14294901e+02],
       [ 2.21392004e+02,  1.12661447e+02],
       [ 2.21451190e+02,  1.11422082e+02],
       [ 2.27120401e+02,  1.09179912e+02],
       [ 2.30348501e+02,  1.07097650e+02],
       [ 2.36426417e+02,  1.04224324e+02],
       [ 2.

In [166]:
df_embedding = pd.DataFrame(feature)
df_embedding.columns = ["X","Y"]
df_embedding

Unnamed: 0,X,Y
0,-84.243148,7.520218
1,240.221719,108.051288
2,235.159690,110.351759
3,226.596214,112.776357
4,223.018068,113.750276
...,...,...
495,-104.110080,7.898079
496,-108.623395,7.212427
497,-107.531080,6.297289
498,-112.779237,9.657824


# Plot Embeddings

In [167]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=df_embedding["X"], y=df_embedding["Y"], mode="markers", text=df_twitter["text"], textposition="top center"))
