# Twitter Bot Detection (Final - Kaggle)

## Import necessary Libraries :

In [1]:
import csv
import pandas as pd
import numpy as np

from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

from sklearn.metrics import *
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

## Importing the Data :

In [2]:
#import train and test data sets
train = pd.read_csv("/Users/akshay/Desktop/GitHub/Twitter_Bot_Detection/Training_data.csv")
test = pd.read_csv("/Users/akshay/Desktop/GitHub/Twitter_Bot_Detection/Testing_data.csv")

### Check if 'Name', 'Screen_Name' or 'Description' contain  String 'bot' or 'Bot' :

In [3]:
### Check if the Screen_Name of the User has 'bot' in it
test['screen_name'] = test['screen_name'].astype(str)
test['name'] = test['name'].astype(str)
test['description'] = test['description'].astype(str)

#Check if 'description' attribute is null or not
train['description'] = train['description'].fillna('No Description')
test['description'] = test['description'].fillna('No Description')  

### Check if the SCREEN_Name of the User has 'bot' in it
train_sname_bot=[]
for row in train.screen_name:
    if ('bot' in row or 'Bot' in row):
        train_sname_bot.append(1)
    else:
        train_sname_bot.append(0)

test_sname_bot=[]
for row in test.screen_name:
    if ('bot' in row or 'Bot' in row):
        test_sname_bot.append(1)
    else:
        test_sname_bot.append(0)

### Check if the Name of the User has 'bot' in it
train_name_bot=[]
for row in train.name:
    if ('bot' in row or 'Bot' in row):
        train_name_bot.append(1)
    else:
        train_name_bot.append(0)

test_name_bot=[]
for row in test.name:
    if ('bot' in row or 'Bot' in row):
        test_name_bot.append(1)
    else:
        test_name_bot.append(0)

### Check if the Description of the User has 'bot' in it
train_des_bot=[]
for line in train['description']:
        if ('bot' in row and 'obot' not in row):
            train_des_bot.append(1)
        else:
            train_des_bot.append(0)

test_des_bot=[]
for row in test['description']:
        if ('bot' in row and 'obot' not in row):
            test_des_bot.append(1)
        else:
            test_des_bot.append(0)

### if Either of Name, Screen name or Description has 'bot' in it 
### Combine the Result as a Feature for Test Data:

In [4]:
print Counter(test_des_bot)
print Counter(test_sname_bot)
print Counter(test_name_bot)
for i in range(0,len(test_des_bot)):
    if(test_sname_bot[i]==1):
        test_des_bot[i]=1
    if(test_name_bot[i]==1):
        test_des_bot[i]=1
print Counter(test_des_bot)

Counter({0: 436, 1: 139})
Counter({0: 372, 1: 203})
Counter({0: 417, 1: 158})
Counter({0: 322, 1: 253})


### Combine the Result as a Feature for Training Data:

In [5]:
# if Either of Name, Screen name or Description has 'bot' in it 
print Counter(train_des_bot)
print Counter(train_name_bot)
print Counter(train_sname_bot)
for i in range(0,len(train_des_bot)):
    if(train_sname_bot[i]==1):
        train_des_bot[i]=1
    if(train_name_bot[i]==1):
        train_des_bot[i]=1
print Counter(train_des_bot)

Counter({0: 2797})
Counter({0: 2604, 1: 193})
Counter({0: 2572, 1: 225})
Counter({0: 2535, 1: 262})


### Adding the Feature to the Respective Dataframe:

In [6]:
test['name_bot']=test_des_bot
train['name_bot']=train_des_bot

### Correcting the Missclassified Data in Train:

In [7]:
print Counter(train['bot'])
print Counter(train_des_bot)
for i in range(0,train.shape[0]):
    if(train_des_bot[i]==1):
        train['bot'][i]=1
print Counter(train['bot'])

Counter({0: 1476, 1: 1321})
Counter({0: 2535, 1: 262})


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Counter({0: 1465, 1: 1332})


### DATA CLEANING AND FORMATTING (Test Data)

In [8]:
# Check if 'has_extended_profile' attribute is null or not
train['has_extended_profile'] = train['has_extended_profile'].fillna(False)
test['has_extended_profile'] = test['has_extended_profile'].fillna(False)

# Check if 'location' attribute is null or not
train['location']=pd.isnull(train.location).astype(int)
test['location']=pd.isnull(test.location).astype(int)

test['followers_count'].replace('None',0,inplace=True)
test['friends_count'].replace('None',0,inplace=True)
test['favorites_count'].replace('None',0,inplace=True)
test['statuses_count'].replace('None',0,inplace=True)

test['verified'].replace('None','FALSE',inplace=True)
test['verified'] = test['verified'].fillna(False)

test['default_profile'] = test['default_profile'].fillna(False)
test['default_profile'].replace('None','FALSE',inplace=True)

test['default_profile_image'] = test['default_profile_image'].fillna(False)
test['default_profile_image'].replace('None','FALSE',inplace=True)

test['has_extended_profile'] = test['has_extended_profile'].fillna(False)
test['has_extended_profile'].replace('None',False,inplace=True)

test.replace('FALSE',False,inplace=True)
test.replace('TRUE',True,inplace=True)

train['listedcount'].replace('None','-1',inplace=True)
test['listed_count'].replace('None','-1',inplace=True)

## Check How much every parameter contributes:

The 10 Features used to Train the models are:
1. Verified
2. default_profile
3. default_profile_image
4. has_extended_profile
5. name_bot
6. friends_count
7. followers_count
8. favorites_count
9. statuses_count
10. location

The code we used to find these features is in the Twitter_Bot_Detection(Midterm) Ipython Notebook

## Generating our X_train, Y_train and X_test

In [9]:
X_train=train[['followers_count','friends_count','favourites_count','statuses_count','verified','default_profile','default_profile_image','has_extended_profile','location','name_bot']].astype(int)
Y_train=train.bot

X_test=test[['followers_count','friends_count','favorites_count','statuses_count','verified','default_profile','default_profile_image','has_extended_profile','location','name_bot']].astype(int)
test.id = test.id.astype(np.int64)

### Using GBC to make a Prediciton... Our Final Kaggle Prediction

In [10]:
clf=GradientBoostingClassifier()
clf.fit(X_train,Y_train)
prediction=clf.predict(X_test)
# Here 'prediction' is our Best score on Kaggle, obtained using GBC modified by GridSearch CV.
#We are using it here to show the confusion matrix and accuracy of other models.

In [11]:
count=0
for i in range(0,prediction.shape[0]):
    if(test_name_bot[i]==1):
        count=count+1
        prediction[i]=1
    if(test_sname_bot[i]==1):
        count=count+1
        prediction[i]=1
    if(test_des_bot[i]==1):
        count=count+1
        prediction[i]=1
print count

614


In [12]:
### Creating the file to Submit on Kaggle:
z=pd.DataFrame()
z['id']=test.id.astype(np.int64)
z['bot']=prediction
#z.to_csv('FINAL.csv',index=False,encoding='utf-8')

## Predicting the Result with other Classifiers

In [13]:
#Classifiers Used are:
AdaBoostClassifier()
DecisionTreeClassifier()
GradientBoostingClassifier()
RandomForestClassifier()
SVC(kernel='rbf', gamma=1e-10, C=10)

clf1 = RandomForestClassifier()
clf2 = GradientBoostingClassifier()
clf3 = DecisionTreeClassifier()
clf4 = SVC(kernel='rbf', gamma=1e-10, C=10)
clf5 = AdaBoostClassifier()

idx=test.id.astype(np.int64)

## Training all the default classifiers:
clf1.fit(X_train, Y_train)
clf2.fit(X_train, Y_train)
clf3.fit(X_train, Y_train)
clf4.fit(X_train,Y_train)
clf5.fit(X_train,Y_train)

## Using our trained models to predict Results:
train_predictions1 = clf1.predict(X_test)
train_predictions2 = clf2.predict(X_test)
train_predictions3 = clf3.predict(X_test)
train_predictions4 = clf4.predict(X_test)
train_predictions5 = clf5.predict(X_test)

## Adding the Predictions to a Dataframe 
Predictions=pd.DataFrame()
Predictions['RF']=train_predictions1
Predictions['GBC']=train_predictions2
Predictions['DTC']=train_predictions3
Predictions['SVM']=train_predictions4
Predictions['ADB']=train_predictions5

## We use our best submission as a basis to show the accuracy of the other models we trained:
Predictions['X']=prediction

In [14]:
## Accuracy, Confusion Matrix for the model with respect to our best prediction:
print ("The results for RF")
print(cross_val_score(clf1, X_train, Y_train, cv=10, scoring='accuracy').mean())
print accuracy_score(Predictions['RF'],z.bot)
print confusion_matrix(Predictions['RF'],z.bot)
print Counter(Predictions['RF'])
print '\n'

print ("The results for GBC")
print(cross_val_score(clf2, X_train, Y_train, cv=10, scoring='accuracy').mean())
print accuracy_score(Predictions['GBC'],z.bot)
print confusion_matrix(Predictions['GBC'],z.bot)
print Counter(Predictions['GBC'])
print '\n'
# GBC works best here 

print ("The results for DTC")
print(cross_val_score(clf3, X_train, Y_train, cv=10, scoring='accuracy').mean())
print accuracy_score(Predictions['DTC'],z.bot)
print confusion_matrix(Predictions['DTC'],z.bot)
print Counter(Predictions['DTC'])
print '\n'

print ("The results for SVM")
print(cross_val_score(clf4, X_train, Y_train, cv=10, scoring='accuracy').mean())
print accuracy_score(Predictions['SVM'],z.bot)
print confusion_matrix(Predictions['SVM'],z.bot)
print Counter(Predictions['SVM'])
print '\n'

print ("The results for ADB")
print(cross_val_score(clf5, X_train, Y_train, cv=10, scoring='accuracy').mean())
print accuracy_score(Predictions['ADB'],z.bot)
print confusion_matrix(Predictions['ADB'],z.bot)
print Counter(Predictions['ADB'])
print '\n'

The results for RF
0.892379162088
0.986086956522
[[301   5]
 [  3 266]]
Counter({0: 306, 1: 269})


The results for GBC
0.904542546097
0.998260869565
[[304   1]
 [  0 270]]
Counter({0: 305, 1: 270})


The results for DTC
0.864507701447
0.961739130435
[[290   8]
 [ 14 263]]
Counter({0: 298, 1: 277})


The results for SVM
0.727174854636
0.833043478261
[[215   7]
 [ 89 264]]
Counter({1: 353, 0: 222})


The results for ADB
0.887384282416
0.984347826087
[[296   1]
 [  8 270]]
Counter({0: 297, 1: 278})




## We use GridSearch CV to identify the best possible configuration to Train our model

In [None]:
features=list(X_train)

### GridSearch for Random Forest:

In [None]:
rfc = RandomForestClassifier(n_jobs=-1,max_features='sqrt',n_estimators=10, oob_score = True) 

param_grid = { 
    'n_estimators': [100, 2000],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc,scoring='accuracy',param_grid=param_grid, cv=5)
CV_rfc.fit(X_train,Y_train)
print CV_rfc.best_params_
print '\n',CV_rfc.best_estimator_

### GridSearch for Gradient Boost:

In [None]:
gbc= GradientBoostingClassifier()

gb_grid_params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [2,4, 6, 8],
              'min_samples_leaf': [20, 50,100,150],
              'max_features': [10, 3, 1],
              'n_estimators':[100,500]
              }

CV_gbc = GridSearchCV(estimator=gbc,scoring='accuracy',param_grid=gb_grid_params, cv=5)
CV_gbc.fit(X_train,Y_train)
print CV_gbc.best_params_
print '\n',CV_gbc.best_estimator_

### GridSearch for Decision Tree:

In [None]:
dtc=DecisionTreeClassifier()
n_estimators = range(50, 400, 50)

parameters={
    "min_samples_split": [2, 10, 20],
    "max_depth": [None, 2, 5, 10],
    "min_samples_leaf": [1, 5, 10],
    "max_leaf_nodes": [None, 5, 10, 20]
}

CV_dtc=GridSearchCV(estimator=dtc,scoring='accuracy',param_grid=parameters,cv=5)
CV_dtc.fit(X_train,Y_train)
print CV_dtc.best_params_
print CV_dtc.best_estimator_

### GridSearch for AdaBoost:

In [None]:
params = {'base_estimator__max_depth':[1,50],
          'base_estimator':[DecisionTreeClassifier(max_features=2), 
                            DecisionTreeClassifier(max_features=10)]}
gs = GridSearchCV(AdaBoostClassifier(base_estimator=DecisionTreeClassifier()), params)
gs.fit(X_train,Y_train)
print(gs.best_estimator_)

## We now Individually train the best found configuration for each model and compare the results:

In [15]:
##Random Forest 
rf=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=-1, oob_score=True,
            random_state=None, verbose=0, warm_start=False)
rf.fit(X_train,Y_train)
predRF=rf.predict(X_test)
print ("The results for modified Random Forest")
print Counter(predRF)
print ("Cross Validation Score")
print(cross_val_score(rf, X_train, Y_train, cv=10, scoring='accuracy').mean())
print ("Accuarcy with respect to the final prediction")
print accuracy_score(predRF,Predictions['X'])
print ("Confusion Matrix")
print confusion_matrix(predRF,Predictions['X'])
print '\n'




## Gradient Boost 
gbc=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=2,
              max_features=10, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=20,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=500, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
gbc.fit(X_train,Y_train)
predGBC=gbc.predict(X_test)
print ("The results for modified GBC")
print Counter(predGBC)
print ("Cross Validation Score")
print(cross_val_score(gbc, X_train, Y_train, cv=10, scoring='accuracy').mean())
print ("Accuarcy with respect to the final prediction")
print accuracy_score(predGBC,Predictions['X'])
print ("Confusion Matrix")
print confusion_matrix(predGBC,Predictions['X'])
print '\n'




## Decision Tree:
dtc=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=20, min_impurity_split=1e-07,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
dtc.fit(X_train,Y_train)
predDTC=dtc.predict(X_test)
print ("The results for modified Decision Tree")
print Counter(predDTC)
print ("Cross Validation Score")
print(cross_val_score(dtc, X_train, Y_train, cv=10, scoring='accuracy').mean())
print ("Accuarcy with respect to the final prediction")
print accuracy_score(predDTC,Predictions['X'])
print ("Confusion Matrix")
print confusion_matrix(predDTC,Predictions['X'])
print '\n'




## SVM 
svm=SVC(C=10, kernel='rbf',gamma=1e-10)
svm.fit(X_train,Y_train)
predSVM=svm.predict(X_test)
print ("The results for modified SVM")
print Counter(predSVM)
print ("Cross Validation Score")
print(cross_val_score(svm, X_train, Y_train, cv=10, scoring='accuracy').mean())
print ("Accuarcy with respect to the final prediction")
print accuracy_score(predSVM,Predictions['X'])
print ("Confusion Matrix")
print confusion_matrix(predSVM,Predictions['X'])
print '\n'




#AdaBoost Classifier:
adb=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=2, max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=None)
adb.fit(X_train,Y_train)
predADB=adb.predict(X_test)
print ("The results for Modified AdaBoost ")
print Counter(predADB)
print ("Cross Validation Score")
print(cross_val_score(adb, X_train, Y_train, cv=10, scoring='accuracy').mean())
print ("Accuarcy with respect to the final prediction")
print accuracy_score(predADB,Predictions['X'])
print ("Confusion Matrix")
print confusion_matrix(predADB,Predictions['X'])

The results for modified Random Forest
Counter({0: 302, 1: 273})
Cross Validation Score
0.90025167504
Accuarcy with respect to the final prediction
0.993043478261
Confusion Matrix
[[301   1]
 [  3 270]]


The results for modified GBC
Counter({0: 301, 1: 274})
Cross Validation Score
0.901680273801
Accuarcy with respect to the final prediction
0.991304347826
Confusion Matrix
[[300   1]
 [  4 270]]


The results for modified Decision Tree
Counter({0: 310, 1: 265})
Cross Validation Score
0.872358589669
Accuarcy with respect to the final prediction
0.986086956522
Confusion Matrix
[[303   7]
 [  1 264]]


The results for modified SVM
Counter({1: 353, 0: 222})
Cross Validation Score
0.727174854636
Accuarcy with respect to the final prediction
0.833043478261
Confusion Matrix
[[215   7]
 [ 89 264]]


The results for Modified AdaBoost 
Counter({0: 299, 1: 276})
Cross Validation Score
0.882002763337
Accuarcy with respect to the final prediction
0.966956521739
Confusion Matrix
[[292   7]
 [ 12 264

### As GBC gives us the Best Cros_val_score, we use GBC for our FINAL prediction: