In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings, time
from sklearn import metrics
mpl.rcParams['patch.force_edgecolor'] = True
warnings.filterwarnings("ignore")

In [3]:
class social_media_bot_detector(object):
    @staticmethod
    def __init__(self):
        pass
    
    @staticmethod
    def split_data(df):
        msk = np.random.rand(len(df)) < 0.8
        train, test = df[msk], df[~msk]
        x_train, y_train = train, train.iloc[:,-1]
        x_test, y_test = test, test.iloc[:, -1]
        return x_train, y_train, x_test, y_test
    
    @staticmethod
    def predict(df):
        # Creating a copy of dataset
        train_df = df.copy()
        # Performing feature engineering
        # Converting id to int
        train_df['id'] = train_df.id.apply(lambda x: int(x))
        train_df['followers_count'] = train_df.followers_count.apply(lambda x: 0 if x=='None' else int(x))
        
        # Suspicious words
        # |bioRxiv|Game|game|botALLY|thricedotted
        suspicious_words = r'bot|b0t|cannabis|tweet me|mishear|follow me|updates every|gorilla|yes_ofc|forget' \
                    r'expos|kill|clit|bbb|butt|fuck|XXX|sex|truthe|fake|anony|free|virus|funky|RNA|kuck|jargon' \
                    r'nerd|swag|jack|bang|bonsai|chick|prison|paper|pokem|xx|freak|ffd|dunia|clone|genie|bbb' \
                    r'ffd|onlyman|emoji|joke|troll|droop|free|every|wow|cheese|yeah|bio|magic|wizard|face|bioRxiv|Game|game|botALLY|thricedotted'

        # Converting verified into vectors
        train_df['verified'] = train_df.verified.apply(lambda x: 1 if ((x == True) or x == 'TRUE') else 0)
        # Checking is name contains Suspicious words
        condition = ((train_df.name.str.contains(suspicious_words, case=False, na=False)) |
                     (train_df.description.str.contains(suspicious_words, case=False, na=False)) |
                     (train_df.screen_name.str.contains(suspicious_words, case=False, na=False)) |
                     (train_df.status.str.contains(suspicious_words, case=False, na=False)))
        # Bots
        predicted_df = train_df[condition]
        predicted_df.bot = 1
        predicted_df = predicted_df[['id', 'bot']]

        # Non bots
        verified_df = train_df[~condition]
        condition = (verified_df.verified == 1)
        predicted_df1 = verified_df[condition][['id', 'bot']]
        predicted_df1.bot = 0
        predicted_df = pd.concat([predicted_df, predicted_df1])
        
        # Checking followers and status count
        followers_following_df = verified_df[~condition]
        condition = ((followers_following_df.followers_count < 50) &
                     (followers_following_df.statuses_count > 1000)) 
        # Bots
        predicted_df1 = followers_following_df[condition][['id', 'bot']]
        predicted_df1.bot = 1
        predicted_df = pd.concat([predicted_df, predicted_df1])
        # Bots
        followers_retweet_df = followers_following_df[~condition]
        condition = ((followers_retweet_df.followers_count < 150) &
                     (followers_retweet_df.statuses_count > 10000))
        predicted_df1 = followers_retweet_df[condition][['id', 'bot']]
        predicted_df1.bot = 1
        predicted_df = pd.concat([predicted_df, predicted_df1])

        # Checking if listed_count>16000
        # Non bots
        listed_count_df = followers_retweet_df[~condition]
        listed_count_df.listed_count = listed_count_df.listed_count.apply(lambda x: 0 if x == 'None' else x)
        listed_count_df.listed_count = listed_count_df.listed_count.apply(lambda x: int(x))
        condition = (listed_count_df.listed_count > 16000)
        predicted_df1 = listed_count_df[condition][['id', 'bot']]
        predicted_df1.bot = 0
        predicted_df = pd.concat([predicted_df, predicted_df1])

        # Remaining
        # Non bots
        predicted_df1 = listed_count_df[~condition][['id', 'bot']]
        predicted_df1.bot = 0
        predicted_df = pd.concat([predicted_df, predicted_df1])
        return predicted_df
    
    @staticmethod
    def get_prediction_with_expected_values(features, target):
        y_pred, y_true = social_media_bot_detector.predict(features).bot.tolist(), target.tolist()
        return y_pred, y_true
    
    @staticmethod
    def get_accuracy(df):
        x_train, y_train, x_test, y_test = social_media_bot_detector.split_data(df)
        # Predictions on train data
        y_pred_train, y_true_train = social_media_bot_detector.get_prediction_with_expected_values(x_train, y_train)
        train_acc = metrics.accuracy_score(y_pred_train, y_true_train)
        # Predictions on test data
        y_pred_test, y_true_test = social_media_bot_detector.get_prediction_with_expected_values(x_test, y_test)
        test_acc = metrics.accuracy_score(y_pred_test, y_true_test)
        return train_acc, test_acc


if __name__ == '__main__':
    train_df = pd.read_csv('./dataset.csv')
    test_df = pd.read_csv('./test.csv', sep='\t')
    tma = 0
    tpa = 0
    for i in range(10):
        ma, pa = social_media_bot_detector.get_accuracy(train_df)
        tma = tma + ma
        tpa = tpa + pa
    print("Model accuracy: ", tma/10)
    print("Prediction accuracy: ", tpa/10)
    predicted_df = social_media_bot_detector.predict(test_df)
    predicted_df.to_csv('predicted.csv', index=False)
    print("Done!")

Model accuracy:  0.9685722672471087
Prediction accuracy:  0.968064873053083
Done!
