In [39]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
import numpy as np

In [21]:
# Load the Twitter accounts dataset, using the CSV's first column as the row index.
df = pd.read_csv(
    './Data/TwitterDataset.csv',
    index_col=0,
)
df.head()

Unnamed: 0,created_at,default_profile,default_profile_image,description,favourites_count,followers_count,friends_count,geo_enabled,id,lang,location,profile_background_image_url,profile_image_url,screen_name,statuses_count,verified,average_tweets_per_day,account_age_days,account_type
0,15/10/16 21:32,False,False,"Blame @xaiax, Inspired by @MakingInvisible, us...",4,1589,4,False,7.87406e+17,en,unknown,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/7874121826...,best_in_dumbest,11041,False,7.87,1403,bot
1,09/11/16 5:01,False,False,Photographing the American West since 1980. I ...,536,860,880,False,7.96216e+17,en,Estados Unidos,http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/8023296328...,CJRubinPhoto,252,False,0.183,1379,human
2,17/06/17 5:34,False,False,Scruffy looking nerf herder and @twitch broadc...,3307,172,594,True,8.7595e+17,en,"Los Angeles, CA",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/1278890453...,SVGEGENT,1001,False,0.864,1159,human
3,21/07/16 13:32,True,False,Wife.Godmother.Friend.Feline Fanatic! Assistan...,8433,517,633,True,7.5612e+17,en,"Birmingham, AL",,http://pbs.twimg.com/profile_images/1284884924...,TinkerVHELPK5,1324,False,0.889,1489,human
4,15/01/12 16:32,False,False,Loan coach at @mancity & Aspiring DJ,88,753678,116,True,464781300.0,en,"England, United Kingdom",http://abs.twimg.com/images/themes/theme1/bg.png,http://pbs.twimg.com/profile_images/9952566258...,JoleonLescott,4202,True,1.339,3138,human



## Dropping columns that are not used as features

In [47]:
# Columns removed from the frame before modelling.
remove = [
    'location',
    'screen_name',
    'created_at',
    'profile_image_url',
    'profile_background_image_url',
    'id',
    'description',
]
# NOTE: the drop mutates `df` in place, so running this cell a second time
# raises KeyError because the columns are already gone.
df.drop(columns=remove, inplace=True)
df.head()

Unnamed: 0,default_profile,default_profile_image,favourites_count,followers_count,friends_count,geo_enabled,lang,statuses_count,verified,average_tweets_per_day,account_age_days,account_type
0,False,False,4,1589,4,False,9,11041,False,7.87,1403,0
1,False,False,536,860,880,False,9,252,False,0.183,1379,1
2,False,False,3307,172,594,True,9,1001,False,0.864,1159,1
3,True,False,8433,517,633,True,9,1324,False,0.889,1489,1
4,False,False,88,753678,116,True,9,4202,True,1.339,3138,1


In [48]:
# Integer-encode the categorical columns; each column gets its own freshly fitted encoder.
for column in ('lang', 'account_type'):
    df[column] = LabelEncoder().fit_transform(df[column])
# account_type after encoding: 0 - Bot 1 - Human
df.head()

Unnamed: 0,default_profile,default_profile_image,favourites_count,followers_count,friends_count,geo_enabled,lang,statuses_count,verified,average_tweets_per_day,account_age_days,account_type
0,False,False,4,1589,4,False,9,11041,False,7.87,1403,0
1,False,False,536,860,880,False,9,252,False,0.183,1379,1
2,False,False,3307,172,594,True,9,1001,False,0.864,1159,1
3,True,False,8433,517,633,True,9,1324,False,0.889,1489,1
4,False,False,88,753678,116,True,9,4202,True,1.339,3138,1


## Splitting the Twitter DataFrame into training and testing sets

In [49]:
# Draw a reproducible random 67% of the rows for training (fixed random_state) ...
training_data = df.sample(frac=0.67, random_state=25)
# ... and keep every row whose index label was not sampled as the test set.
testing_data = df.drop(index=training_data.index)
print(f"No. of training examples: {training_data.shape[0]}")
print(f"No. of testing examples: {testing_data.shape[0]}")

No. of training examples: 25083
No. of testing examples: 12355


In [50]:
def load_train_data():
    """Split the module-level ``training_data`` frame into features and target.

    Returns:
        tuple: ``(X_train, y_train)`` where ``X_train`` is ``training_data``
        without the ``account_type`` column and ``y_train`` is that column.
    """
    target = 'account_type'
    features = training_data.drop(columns=target)
    labels = training_data[target]
    return features, labels

def load_test_data():
    """Split the module-level ``testing_data`` frame into features and target.

    The target column is ``account_type``, the same column ``load_train_data``
    uses. The previous version referenced a column named ``fake``, which this
    dataset does not contain, so every call raised ``KeyError``.

    Returns:
        tuple: ``(X_test, y_test)`` where ``X_test`` is ``testing_data``
        without the ``account_type`` column and ``y_test`` is that column.
    """
    X_test = testing_data.drop(columns='account_type')
    y_test = testing_data['account_type']

    return X_test, y_test

In [51]:
def get_classifier_cv_score(model, X, y, scoring='accuracy', cv=7):
    """Cross-validate ``model`` and return its mean train and validation scores.

    Args:
        model: Estimator handed to ``cross_validate``.
        X: Feature data.
        y: Target values.
        scoring: Scoring spec forwarded to ``cross_validate`` (default ``'accuracy'``).
        cv: Cross-validation spec forwarded to ``cross_validate`` (default ``7``).

    Returns:
        tuple: ``(train_mean, val_mean)``, the averages of the per-fold
        ``'train_score'`` and ``'test_score'`` entries respectively.
    """
    cv_results = cross_validate(
        model, X, y,
        scoring=scoring,
        cv=cv,
        return_train_score=True,  # needed so 'train_score' is present in the result
    )
    mean_train, mean_val = (
        np.mean(cv_results[key]) for key in ('train_score', 'test_score')
    )
    return mean_train, mean_val

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Candidate classifiers; each one is scored with the same cross-validation helper.
model_list = [LogisticRegression(max_iter=600),
              SVC(),
              GaussianNB(),
              RandomForestClassifier(random_state=55),
              GradientBoostingClassifier(random_state=56)]

train_scores = []
val_scores = []

X_train, y_train = load_train_data()
for model in model_list:
    # Scored with 'average_precision' rather than the helper's default 'accuracy'.
    # NOTE(review): this scorer presumably treats label 1 as the positive class - confirm.
    train, val = get_classifier_cv_score(model, X_train, y_train, 'average_precision')
    train_scores.append(train)
    val_scores.append(val)

# Rank best-first by validation score, using the train score as tie-breaker.
# The explicit key keeps the estimator objects out of the comparison: without it,
# two models tying on both scores would make sorted() compare the estimators
# themselves, which raises TypeError.
models_score = sorted(zip(val_scores, train_scores, model_list),
                      key=lambda entry: entry[:2],
                      reverse=True)

print("-------------------------------------")
for val, train, model in models_score:
    print("Model: {} ".format(model.__class__.__name__))

    print("train_score: {:.3f}".format(train))

    print("validation_score: {:.3f}".format(val))

    print("-------------------------------------")

-------------------------------------
Model: RandomForestClassifier 
train_score: 1.000
validation_score: 0.960
-------------------------------------
Model: GradientBoostingClassifier 
train_score: 0.960
validation_score: 0.955
-------------------------------------
Model: SVC 
train_score: 0.890
validation_score: 0.889
-------------------------------------
Model: LogisticRegression 
train_score: 0.871
validation_score: 0.871
-------------------------------------
Model: GaussianNB 
train_score: 0.848
validation_score: 0.848
-------------------------------------
