In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest


In [3]:

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


In [4]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')



In [5]:


sentences = [ i for i in pd.read_csv("tweets.csv")['full_text'].values]



encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**encoded_input)

sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings)

Sentence embeddings:
tensor([[-0.0645,  0.0362,  0.0130,  ...,  0.0098, -0.1269,  0.0379],
        [-0.0820, -0.0504,  0.0285,  ..., -0.0773, -0.0445,  0.0358],
        [-0.1216, -0.0063, -0.0568,  ..., -0.1257, -0.0639,  0.0431],
        ...,
        [-0.0808,  0.0510,  0.0449,  ...,  0.0067,  0.0294, -0.0139],
        [-0.0393,  0.0346, -0.0354,  ..., -0.0456, -0.0700, -0.0284],
        [ 0.0138,  0.0756, -0.0354,  ..., -0.0495, -0.0546,  0.0173]])


In [6]:
np.shape(sentence_embeddings)

torch.Size([1564, 384])

In [7]:
len(sentences)

1564

In [8]:
sentences

['You have diverted our attention. Tumekubali. Lets now show you what we can do when we are diverted https://t.co/iLxvaBzVE5',
 'LIKE this post if you believe 2027 Morara will be our president. https://t.co/V4x3na6fwK',
 'Morara \u2066@MoraraKebasoSnr\u2069 is the real deal https://t.co/R7zCRomAJp',
 '#RutoMustGo\nNO ONE: Why did William Ruto arrest Morara Kebaso and arraign him in court with some stupid, unstickable bullshit charge?\n\nTHE TRUTH: https://t.co/gGXESYdirr',
 "I don't care if Gen Z and Morara's party is called INJECT or Suruali \n\nImmaterial. \n\nMwenye anaona haifai, aanzishe yake na wafuasi wake!\n\nODM is called Orange DEMOCRATIC Party but the D is silent \n\nHiyo huwaga personal Fiefdom ya Opoda \n\nAs long as Inject lives to its Vision,… https://t.co/xvymshoFyL",
 'Guys the venue has changed. Its Bomas of Kenya. Twende Kazi. Kenya inatuita\nMessage is simple: WAENDE WOTE',
 'Yeees. YES. YES.',
 'For the love of Morara Kebaso,just REPOST https://t.co/2kvr14tDdW',
 '

In [9]:
from scipy.spatial import distance

In [10]:
sent_df = pd.DataFrame(sentence_embeddings)

In [None]:
sent_df

In [None]:
mu = np.mean(sent_df, axis=0)

In [None]:
mu

In [None]:
sigma = np.cov(sent_df.T)

In [None]:
# sent_df['mahalanobis_distance'] = [distance.mahalanobis(sent_df.iloc[i], mu, np.linalg.inv(sigma)) for i in range(len(sent_df)) ]

In [None]:
# sent_df['mahalanobis_distance']

In [11]:
model = IsolationForest(contamination=0.1, random_state=42)

model.fit(sentence_embeddings)

sent_df['anomaly_score'] = model.predict(sentence_embeddings)

In [12]:
sent_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,anomaly_score
0,-0.064491,0.036154,0.013039,-0.005820,0.075423,-0.015837,0.042137,-0.017676,-0.037878,-0.061579,...,-0.015175,-0.043174,-0.028628,-0.029501,-0.000650,0.048148,0.009820,-0.126930,0.037897,1
1,-0.082021,-0.050372,0.028531,-0.023357,-0.095442,0.011380,-0.073425,-0.007924,0.046453,0.032234,...,0.087825,-0.071429,-0.000567,-0.019803,0.041592,-0.002150,-0.077341,-0.044507,0.035785,1
2,-0.121641,-0.006341,-0.056848,-0.017421,-0.078184,-0.014171,-0.006592,0.048153,0.116218,-0.008938,...,0.024707,-0.048200,0.007657,0.056465,0.050261,-0.046894,-0.125748,-0.063928,0.043113,1
3,-0.030637,0.062008,-0.055178,0.053755,0.001159,0.027639,0.051667,0.015196,-0.004344,0.024245,...,0.046470,0.038370,-0.000353,-0.026137,0.021446,0.038474,-0.007472,-0.021843,-0.009009,1
4,-0.070025,0.049080,-0.047494,-0.021787,-0.043071,-0.048617,0.077527,-0.032837,0.009821,-0.027530,...,-0.017851,-0.005247,0.048579,0.013573,0.031826,-0.014228,-0.024365,0.072619,-0.043643,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1559,-0.067280,0.010237,0.009814,-0.085341,-0.019217,-0.010918,0.043107,0.002892,0.033525,0.068969,...,-0.020112,-0.059690,0.077531,-0.019477,0.035012,0.027967,-0.067361,-0.014620,-0.055828,1
1560,-0.083171,0.112508,0.019847,-0.033636,-0.037080,-0.007475,0.043630,-0.060464,-0.008669,0.077407,...,-0.015676,0.002601,0.048231,0.006711,0.076537,0.084809,-0.023600,-0.130420,-0.013914,1
1561,-0.080761,0.051034,0.044911,-0.056523,-0.043520,0.031467,0.040238,-0.052106,-0.058910,0.085842,...,0.090339,0.004545,0.023931,-0.032631,0.014429,0.030322,0.006698,0.029448,-0.013912,1
1562,-0.039308,0.034598,-0.035357,-0.002735,-0.074493,-0.014571,0.076784,-0.045135,-0.037178,0.017972,...,-0.036090,-0.021220,0.034805,-0.053583,-0.000060,0.010456,-0.045649,-0.069972,-0.028390,1


In [13]:
sentences

['You have diverted our attention. Tumekubali. Lets now show you what we can do when we are diverted https://t.co/iLxvaBzVE5',
 'LIKE this post if you believe 2027 Morara will be our president. https://t.co/V4x3na6fwK',
 'Morara \u2066@MoraraKebasoSnr\u2069 is the real deal https://t.co/R7zCRomAJp',
 '#RutoMustGo\nNO ONE: Why did William Ruto arrest Morara Kebaso and arraign him in court with some stupid, unstickable bullshit charge?\n\nTHE TRUTH: https://t.co/gGXESYdirr',
 "I don't care if Gen Z and Morara's party is called INJECT or Suruali \n\nImmaterial. \n\nMwenye anaona haifai, aanzishe yake na wafuasi wake!\n\nODM is called Orange DEMOCRATIC Party but the D is silent \n\nHiyo huwaga personal Fiefdom ya Opoda \n\nAs long as Inject lives to its Vision,… https://t.co/xvymshoFyL",
 'Guys the venue has changed. Its Bomas of Kenya. Twende Kazi. Kenya inatuita\nMessage is simple: WAENDE WOTE',
 'Yeees. YES. YES.',
 'For the love of Morara Kebaso,just REPOST https://t.co/2kvr14tDdW',
 '

In [14]:
_ =sent_df.loc[sent_df['anomaly_score'] == -1]

In [15]:
_

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,anomaly_score
6,-0.066397,-0.000976,0.063291,0.008859,-0.043541,0.019035,0.126242,-0.066348,0.017261,-0.073609,...,-0.033215,-0.001738,0.022351,-0.043165,0.034363,0.032252,-0.025110,0.040339,-0.104809,-1
17,-0.011536,0.070418,-0.041236,0.023121,0.005816,-0.043866,0.040399,-0.041419,0.005238,0.047020,...,0.027324,-0.081155,0.017054,0.019800,0.014365,0.057534,-0.079817,0.021998,-0.047237,-1
19,0.041624,-0.051486,0.071880,-0.004310,0.045525,-0.014266,-0.035487,-0.035959,-0.060572,0.033572,...,0.005879,-0.057455,-0.053907,-0.019930,0.079279,0.064040,-0.092045,-0.053371,0.052285,-1
26,-0.053478,0.128055,-0.045938,-0.025384,0.030552,0.065126,-0.014215,-0.085315,0.046248,0.074831,...,0.042944,0.040932,0.055590,-0.054465,0.035705,0.097148,0.016274,-0.048076,-0.012900,-1
33,-0.016116,0.048043,0.026314,-0.042411,-0.020524,0.016107,0.092829,0.019183,-0.006024,0.136658,...,-0.026794,-0.062091,-0.042537,0.057309,0.061145,0.046529,-0.100924,0.068287,0.015826,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1484,0.009747,-0.012632,0.012876,-0.026694,0.099809,-0.028623,-0.013376,0.070085,0.002478,-0.053748,...,0.023419,0.007214,0.080020,-0.006935,0.058639,-0.006227,0.102726,0.066127,0.018266,-1
1502,-0.045499,0.136062,-0.011272,-0.025440,-0.038294,0.041978,0.068367,0.013380,0.064405,-0.046351,...,0.059081,-0.009417,0.020761,0.052438,-0.035138,0.028813,-0.000698,-0.085459,0.038360,-1
1510,-0.059977,0.031737,-0.030585,0.003299,0.015440,0.080075,0.086058,-0.004838,0.039727,0.096675,...,0.062674,-0.027721,0.012735,0.013194,0.032925,0.053532,-0.057081,-0.092461,-0.070495,-1
1526,-0.002203,0.086178,0.023867,-0.046581,0.043344,-0.010578,0.070445,0.029257,0.026756,-0.021212,...,0.004709,0.017693,-0.011040,-0.020739,-0.021894,0.037004,-0.010200,0.019413,-0.013482,-1


In [16]:
sent_df.iloc[_.index]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,anomaly_score
6,-0.066397,-0.000976,0.063291,0.008859,-0.043541,0.019035,0.126242,-0.066348,0.017261,-0.073609,...,-0.033215,-0.001738,0.022351,-0.043165,0.034363,0.032252,-0.025110,0.040339,-0.104809,-1
17,-0.011536,0.070418,-0.041236,0.023121,0.005816,-0.043866,0.040399,-0.041419,0.005238,0.047020,...,0.027324,-0.081155,0.017054,0.019800,0.014365,0.057534,-0.079817,0.021998,-0.047237,-1
19,0.041624,-0.051486,0.071880,-0.004310,0.045525,-0.014266,-0.035487,-0.035959,-0.060572,0.033572,...,0.005879,-0.057455,-0.053907,-0.019930,0.079279,0.064040,-0.092045,-0.053371,0.052285,-1
26,-0.053478,0.128055,-0.045938,-0.025384,0.030552,0.065126,-0.014215,-0.085315,0.046248,0.074831,...,0.042944,0.040932,0.055590,-0.054465,0.035705,0.097148,0.016274,-0.048076,-0.012900,-1
33,-0.016116,0.048043,0.026314,-0.042411,-0.020524,0.016107,0.092829,0.019183,-0.006024,0.136658,...,-0.026794,-0.062091,-0.042537,0.057309,0.061145,0.046529,-0.100924,0.068287,0.015826,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1484,0.009747,-0.012632,0.012876,-0.026694,0.099809,-0.028623,-0.013376,0.070085,0.002478,-0.053748,...,0.023419,0.007214,0.080020,-0.006935,0.058639,-0.006227,0.102726,0.066127,0.018266,-1
1502,-0.045499,0.136062,-0.011272,-0.025440,-0.038294,0.041978,0.068367,0.013380,0.064405,-0.046351,...,0.059081,-0.009417,0.020761,0.052438,-0.035138,0.028813,-0.000698,-0.085459,0.038360,-1
1510,-0.059977,0.031737,-0.030585,0.003299,0.015440,0.080075,0.086058,-0.004838,0.039727,0.096675,...,0.062674,-0.027721,0.012735,0.013194,0.032925,0.053532,-0.057081,-0.092461,-0.070495,-1
1526,-0.002203,0.086178,0.023867,-0.046581,0.043344,-0.010578,0.070445,0.029257,0.026756,-0.021212,...,0.004709,0.017693,-0.011040,-0.020739,-0.021894,0.037004,-0.010200,0.019413,-0.013482,-1


In [17]:
sentences[i for i in _.index]

SyntaxError: invalid syntax (4223557691.py, line 1)

In [18]:
sentences[0]

'You have diverted our attention. Tumekubali. Lets now show you what we can do when we are diverted https://t.co/iLxvaBzVE5'

In [19]:
_sent_df = pd.read_csv("tweets.csv")

In [23]:
outlier_df = _sent_df.iloc[_.index]

outlier_df.to_csv("outliers.csv")

In [27]:
proper = _sent_df.loc[~(_sent_df.index.isin(outlier_df.index))]

In [29]:
proper.to_csv("proper.csv")