In [None]:
# My solution uses a machine-learning anomaly-detection algorithm, specifically IsolationForest.
# I also counted how many True/False responses each annotator gave and used those counts as additional features.

In [51]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

In [40]:
# Load the annotation data; later cells read its 'Annotator ID', 'Answer'
# and 'Text' columns.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/challenging/nlp-challenge.csv',
)

In [47]:
# Count how often each annotator gave each distinct 'Answer' value:
# one row per annotator, one column per answer value, 0 where a value never occurs.
answers_by_annotator = df.groupby('Annotator ID')['Answer']
result = answers_by_annotator.value_counts().unstack().fillna(0)

# Rescale every count column independently into [-1, 1].
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_data = scaler.fit_transform(result)

# Re-attach the annotator index and answer-value column labels that the
# scaler's plain array output no longer carries.
scaled_df = pd.DataFrame(
    data=scaled_data,
    index=result.index,
    columns=result.columns,
)

In [5]:
# Signed encoding of the boolean answers: True -> +1, False -> -1.
dct = {True: 1, False: -1}

# Build one vote vector per annotator: rows are annotators, columns are texts,
# cells are the encoded answer (pivot_table averages duplicates by default).
# Texts an annotator never answered become 0, i.e. "no vote".
#
# The encoding is applied to a derived frame instead of overwriting
# df['Answer'], so this cell is idempotent. With the in-place version a second
# run mapped the already-encoded -1 values to NaN (while 1 still matched the
# True key, since True == 1), silently erasing every "False" vote.
annotator_vectors = (
    df.assign(Answer=df['Answer'].map(dct))
    .pivot_table(index='Annotator ID', columns='Text', values='Answer')
    .fillna(0)
)

In [61]:
# Feature matrix: the scaled per-annotator answer counts placed side by side
# with the per-text vote vectors, aligned on the annotator index.
# Column labels are converted to strings so that the answer-value labels and
# the text labels share one type (presumably required by the estimator
# fitted later - confirm).
full_df = pd.concat([scaled_df, annotator_vectors], axis='columns').pipe(
    lambda frame: frame.set_axis(frame.columns.astype(str), axis='columns')
)

In [83]:
# Fit an IsolationForest on the annotator feature matrix.
# contamination=0.1 sets the expected share of outliers to 10%;
# random_state pins the forest so repeated runs give the same result.
model = IsolationForest(contamination=0.1, random_state=42)

# A later cell stores the predictions in full_df['anomaly']. Drop that column
# if it is already there so that re-running this cell never feeds the previous
# labels back into the model as a feature. On a fresh run this is a no-op.
features = full_df.drop(columns='anomaly', errors='ignore')

# fit_predict labels each row: -1 = anomaly, 1 = normal.
clusters = model.fit_predict(features)

In [84]:
# Keep the IsolationForest labels next to the features they were computed from.
full_df['anomaly'] = clusters

# Project the features (label column excluded) onto the first two principal
# components so the annotators can be drawn on a 2-D scatter plot.
feature_matrix = full_df.drop(columns='anomaly')
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(feature_matrix)

In [85]:
# Scatter plot of the annotators in PCA space, coloured by the anomaly label.
fig = go.Figure(data=go.Scatter(
    x=X_2d[:, 0],
    y=X_2d[:, 1],
    mode='markers',
    # Label each point with the annotator ID taken from full_df, the frame the
    # PCA coordinates were computed from, so labels and points share one row order.
    text=full_df.index.values,
    marker=dict(
        size=8,
        # Red for anomalies (-1), blue for regular points.
        color=['red' if anomaly == -1 else 'blue' for anomaly in clusters],
    )
))

# NOTE(review): textposition presumably only matters when the trace mode
# includes 'text'; in 'markers' mode the IDs should show up as hover labels - confirm.
fig.update_traces(textposition='top center')

# Make the figure self-explanatory when the notebook is skimmed.
fig.update_layout(
    title='Annotators in PCA space (red = flagged by IsolationForest)',
    xaxis_title='Principal component 1',
    yaxis_title='Principal component 2',
)

fig.show()

# Annotators flagged as anomalies (bots): A1MG8KNVSVZ365, AQIP3DSYXEXX5, A3MV3PT4TOO69P, A3OCJJMRKAIJZA, A2LU259QPV1I4V, A3BISMR4GI02ZG