In [20]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder 
import seaborn as sns
from pandas.plotting import scatter_matrix
sns.set_style("whitegrid")
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.feature_selection import ColumnSelector
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

# Load Data

In [23]:
accts = pd.read_csv('data/Cumulative_raw_data.csv')
pd.set_option('display.max_colwidth', None) # prevents descriptions from being cut off
accts['description'] = accts['description'].values.astype('U') #formating as unicode for Tfidf
#accts = accts[:10000] #shortening for exploratory data work to make easier on computer

In [58]:
#

In [168]:
# Train test split

X_train,X_test,y_train,y_test = train_test_split(accts,
                                                accts['bot'], #CHANGED FROM TARGET
                                                train_size=0.75,
                                                shuffle=True,
                                                stratify=accts['bot'], #CHANGED FROM TARGET
                                                random_state=855)

In [161]:
y = accts[['bot']].to_numpy()
X = accts[['description']]
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.25, random_state=1400)

In [162]:
target = accts['species'] == 'bot'
source = accts['source']

## Random Forest

The train-test split stratifies by target right now, and not by source. We should stratify by both. (Note to Chris: Think of stratifying by a source-target ordered pair.)

In [29]:
# This is an array of ordered pairs. Each pair combines the bot/human value of
# an account with the source dataset from which the account came. We need this
# so that train_test_split can stratify by bot/human value AND data source.

# This codeblock takes ten seconds to run on my PC.

stratify_guide = np.asarray([(target[acct],source[acct]) for acct in accts.index]).reshape(-1,1)

In [163]:
#NEW: One hot encoding bot
accts['bot'] = 0
accts.loc[accts.species == "bot",'bot'] = 1

I tried hyperparameter tuning of `rf_model` but I couldn't really get the accuracy to budge. Adding more attributes to the training data would probably help.

In [75]:
rf_model = RandomForestClassifier(n_estimators = 100,
                                  max_features = 'auto',
                                  criterion='entropy')
rf_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## Test model

y is True for bots, False for humans

In [76]:
pred = rf_model.predict(X_test)
print('overall accuracy:', np.sum(pred == y_test)/len(y_test))
print()

for src in set(source.values):
    print(src)
    print('accuracy:', np.sum( (pred == y_test) & (source == src) )/source.value_counts()[src])
    print()

overall accuracy: 0.7233832630729213

vendor-purchased-2019
accuracy: 0.16022099447513813

verified-2019
accuracy: 0.1691842900302115

botwiki-2019
accuracy: 0.18794835007173602

cresci-rtbust-2019
accuracy: 0.12572254335260116

celebrity-2019
accuracy: 0.11441608923440932

cresci-stock-2018
accuracy: 0.14847457627118643

pronbots-2019
accuracy: 0.23896873776634417

botometer-feedback-2019
accuracy: 0.1276595744680851

political-bots-2019
accuracy: 0.22950819672131148

gilani-2017
accuracy: 0.15787354007249296



## Constructing Pipes

### Basic KNN (TfidfVectorization, Hashing, CountVectorization) & Random Forests

In [169]:
# Train test split

X_train,X_test,y_train,y_test = train_test_split(accts,
                                                accts['bot'], #CHANGED FROM TARGET
                                                train_size=0.75,
                                                shuffle=True,
                                                stratify=accts['bot'], #CHANGED FROM TARGET
                                                random_state=855)

In [98]:
pipe_t = Pipeline(steps=[#('transform', TfidfVectorizer()), 
       # ('get_col', ColumnSelector(cols=(20))),
        ('tfidf', TfidfVectorizer()),
        ('knn', KNeighborsClassifier(n_neighbors = 10))] #20
                )

pipe_c = Pipeline(steps=[#('transform', TfidfVectorizer()), 
       # ('get_col', ColumnSelector(cols=(20))),
        ('count', CountVectorizer()),
        ('knn', KNeighborsClassifier(n_neighbors = 10))]
                )

pipe_h = Pipeline(steps=[#('transform', TfidfVectorizer()), 
       # ('get_col', ColumnSelector(cols=(20))),
        ('hashing', HashingVectorizer(n_features = 20)),
        ('knn', KNeighborsClassifier(n_neighbors = 10))])
#pipe_r = Pipeline(steps=[
#                ('get_col', ColumnSelector(cols=(32))),
#                ('rf_model', RandomForestClassifier(n_estimators = 100, 
#                                                  max_features = 'auto',
 #                                                 criterion='entropy'))]) #32

In [151]:
pipe_t = make_pipeline(#('transform', TfidfVectorizer()), 
        ColumnSelector(cols=(1,20)),
        TfidfVectorizer(),
        KNeighborsClassifier(n_neighbors = 10) #20
                )
pipe_c = make_pipeline(#('transform', TfidfVectorizer()), 
        ColumnSelector(cols=(1,20)),
        CountVectorizer(),
        KNeighborsClassifier(n_neighbors = 10) #20
                )
pipe_h = make_pipeline(#('transform', TfidfVectorizer()), 
        ColumnSelector(cols=(1,20)),
        HashingVectorizer(n_features = 20),
        KNeighborsClassifier(n_neighbors = 10) #20
                )
#pipe_r = make_pipeline(
#                ('get_col', ColumnSelector(cols=(32))),
#                ('rf_model', RandomForestClassifier(n_estimators = 100, 
#                                                  max_features = 'auto',
#                                                 criterion='entropy')))

In [171]:
#ERROR HERE
#error is it needs to handle a (n,) shape numpy array
#the data is currently in a dataframe 
#code works when column selector is removed and data is loaded as accts['description']  for x

vote = EnsembleVoteClassifier(clfs=[pipe_t, pipe_c, pipe_h, 
                                   # pipe_r
                                   ], 
                              #voting='soft'
                             )
X_train = X_train.to_numpy()
y_train = y_train.to_numpy()
vote = vote.fit(X_train, y_train.ravel()) #['description'] 
#eclf.fit(X, y)

AttributeError: 'numpy.ndarray' object has no attribute 'to_numpy'

In [72]:
y_predict = vote.predict(X_test['description'])

In [73]:
print("Our model has a ",
      np.round(sum(y_predict == y_test.ravel())/len(y_test)*100,2),
      "% accuracy on the testing set")

Our model has a  80.66 % accuracy on the testing set


In [115]:
#for col in accts.columns: 
#    print(col) 
#print(accts.iloc[:,45])

In [120]:
col_selector = ColumnSelector(cols=(1,20))
col_selector.transform(X_train)

array([[False,
        'Do you like it gently? 💫 Come in! 😍 https://t.co/CGFnsmj94b'],
       [False, 'nan'],
       [False, 'Do you like fast? Come in! 😻😻😻 https://t.co/Za2tNtmsBt'],
       ...,
       [False,
        'Explore the stars with @rantzien and @AlaynaMCole of @HornedLlama.'],
       [False, 'Cp manager : Mas Fafa - @fafa8888'],
       [False, 'Rapper, Hustla, Taurus.']], dtype=object)

In [139]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
print(X)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 [5.  3.5 1.3 0.3]
 [4.5 2.3 1.3 0.3]
 [4.4 3.2 1.3 0.2]
 [5.  3.5 1.6 0.6]
 [5.1 3.8 1.9 0.4]
 [4.8 3.  1.4 0.3]
 [5.1 3.8 1.6 0.2]
 [4.6 3.2 1.4 0.2]
 [5.3 3.7 1.5 0.2]
 [5.  3.3 1.4 0.2]
 [7.  3.2 4.7 1.4]
 [6.4 3.2 4.5 1.5]
 [6.9 3.1 4.

In [None]:
#print(X_train.reshape)
#print(y_train.shape)

X_train = X_train.reshape(-1,)
X_train.shape