In [1]:
%%bash

pip install -q kaggle
mkdir -p ~/.kaggle
cp drive/MyDrive/kaggle.json ~/.kaggle/
chmod 600 /root/.kaggle/kaggle.json
kaggle datasets download -d shriyashjagtap/kaggle-bot-account-detection
unzip -qq kaggle-bot-account-detection.zip

Downloading kaggle-bot-account-detection.zip to /content



  0%|          | 0.00/55.5M [00:00<?, ?B/s] 16%|█▌        | 9.00M/55.5M [00:00<00:00, 60.6MB/s] 45%|████▌     | 25.0M/55.5M [00:00<00:00, 63.3MB/s] 74%|███████▍  | 41.0M/55.5M [00:00<00:00, 84.2MB/s]100%|██████████| 55.5M/55.5M [00:00<00:00, 95.9MB/s]


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from statistics import mode

In [2]:
import xgboost 
xgboost.__version__

'1.7.3'

In [33]:
df = pd.read_csv("kaggle_bot_accounts.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,NAME,GENDER,EMAIL_ID,IS_GLOGIN,FOLLOWER_COUNT,FOLLOWING_COUNT,DATASET_COUNT,CODE_COUNT,DISCUSSION_COUNT,AVG_NB_READ_TIME_MIN,REGISTRATION_IPV4,REGISTRATION_LOCATION,TOTAL_VOTES_GAVE_NB,TOTAL_VOTES_GAVE_DS,TOTAL_VOTES_GAVE_DC,ISBOT
0,0,Johnny KerrThomas,Male,jacksonalan@example.com,False,53.0,87.0,5.0,3.0,124.0,,81.88.75.170,Argentina,16.0,10.0,3.0,
1,1,Dwayne LarsenLara,Male,calvin80@example.com,True,16.0,67.0,5.0,,26.0,24.97,,New Zealand,14.0,5.0,2.0,
2,2,,Male,qbrown@example.net,True,44.0,81.0,4.0,17.0,125.0,7.75,159.202.103.178,Costa Rica,16.0,4.0,0.0,False
3,3,Russell SimmonsPhillips,Male,kimberlywagner@example.com,True,23.0,114.0,5.0,24.0,67.0,13.4,196.11.132.51,Italy,21.0,10.0,1.0,False
4,4,Jamie WilsonMartinez,Female,shaunbrooks@example.com,False,46.0,112.0,2.0,12.0,63.0,24.83,159.196.199.20,Belgium,10.0,6.0,2.0,False


In [34]:
df.drop(["Unnamed: 0", 'NAME', "EMAIL_ID"], axis=1, inplace=True)

In [35]:
df.drop("REGISTRATION_IPV4", axis=1, inplace=True)

In [36]:
df.isna().sum()

GENDER                   77879
IS_GLOGIN                77916
FOLLOWER_COUNT           77712
FOLLOWING_COUNT          78445
DATASET_COUNT            78567
CODE_COUNT               77926
DISCUSSION_COUNT         77722
AVG_NB_READ_TIME_MIN     78316
REGISTRATION_LOCATION    78290
TOTAL_VOTES_GAVE_NB      77705
TOTAL_VOTES_GAVE_DS      77934
TOTAL_VOTES_GAVE_DC      78030
ISBOT                    78500
dtype: int64

In [37]:
df["GENDER"].fillna(mode(df["GENDER"]), inplace=True)
df["IS_GLOGIN"].fillna(mode(df["IS_GLOGIN"]), inplace=True)
df["FOLLOWER_COUNT"].fillna(np.mean(df["FOLLOWER_COUNT"]), inplace=True)
df["FOLLOWING_COUNT"].fillna(np.mean(df["FOLLOWING_COUNT"]), inplace=True)
df["DATASET_COUNT"].fillna(np.mean(df["DATASET_COUNT"]), inplace=True) 
df["CODE_COUNT"].fillna(np.mean(df["CODE_COUNT"]), inplace=True)
df["DISCUSSION_COUNT"].fillna(np.mean(df["DISCUSSION_COUNT"]), inplace=True)
df["AVG_NB_READ_TIME_MIN"].fillna(np.mean(df["AVG_NB_READ_TIME_MIN"]), inplace=True)
df["REGISTRATION_LOCATION"].fillna(mode(df["REGISTRATION_LOCATION"]), inplace=True)
df["TOTAL_VOTES_GAVE_NB"].fillna(np.mean(df["TOTAL_VOTES_GAVE_NB"]), inplace=True)
df["TOTAL_VOTES_GAVE_DS"].fillna(np.mean(df["TOTAL_VOTES_GAVE_DS"]), inplace=True)
df["TOTAL_VOTES_GAVE_DC"].fillna(np.mean(df["TOTAL_VOTES_GAVE_DC"]), inplace=True)
df["ISBOT"].fillna(mode(df["ISBOT"]), inplace=True)

In [38]:
df.isna().sum()

GENDER                       0
IS_GLOGIN                    0
FOLLOWER_COUNT               0
FOLLOWING_COUNT              0
DATASET_COUNT                0
CODE_COUNT                   0
DISCUSSION_COUNT             0
AVG_NB_READ_TIME_MIN         0
REGISTRATION_LOCATION    78290
TOTAL_VOTES_GAVE_NB          0
TOTAL_VOTES_GAVE_DS          0
TOTAL_VOTES_GAVE_DC          0
ISBOT                        0
dtype: int64

In [39]:
mode(df["REGISTRATION_LOCATION"])

nan

In [40]:
labels = {}
for column in df.keys():
  if pd.api.types.is_object_dtype(df[column]):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    labels[column] = le.classes_

In [41]:
labels

{'GENDER': array(['Female', 'Male'], dtype=object),
 'REGISTRATION_LOCATION': array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
        'Angola', 'Anguilla',
        'Antarctica (the territory South of 60 deg S)',
        'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
        'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
        'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
        'Bermuda', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
        'Botswana', 'Bouvet Island (Bouvetoya)', 'Brazil',
        'British Indian Ocean Territory (Chagos Archipelago)',
        'British Virgin Islands', 'Brunei Darussalam', 'Bulgaria',
        'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada',
        'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad',
        'Chile', 'China', 'Christmas Island', 'Cocos (Keeling) Islands',
        'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica',
        "Cote

In [42]:
X = df.drop("ISBOT", axis=1)
y = df["ISBOT"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [43]:
rclf = RandomForestClassifier()
rclf.fit(X_train.values, y_train.values)

RandomForestClassifier()

In [44]:
rclf.score(X_test.values, y_test.values)

0.9842755394757756

In [45]:
from xgboost import XGBClassifier

xclf = XGBClassifier()

xclf.fit(X_train.values, y_train.values)
xclf.score(X_test.values, y_test.values)

0.9842868928768761