# Import Libraries

In [2]:
import pandas as pd
import pickle

In [3]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *

Load the tweets dataset into a pandas DataFrame and shuffle its rows.

In [5]:
# Read the tweets CSV; the file is decoded as latin-1.
df = pd.read_csv('data/data.csv', encoding="latin-1")
# Shuffle every row. The fixed seed keeps the shuffle (and therefore the
# train/test split and all reported scores) reproducible across runs.
df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,value,tweet
9878,0,@Freedom_speech2 If #Palmyra fall they will c...
20947,1,"Will The Chip Work? Standards, tools and secur..."
16549,0,Poem by Ibn Kalakis
19041,1,"@OllieBeavis shame on the boyfriends, brothers..."
33502,1,@NolteNC right #GOPe and #dem polls


Split the tweets into a training set (the first 250 shuffled rows) and a test set (all remaining rows).

In [7]:
# Rows before the split position are used for training; every row from the
# split position onwards is held out for testing.
split_at = 250
train_data = df.iloc[:split_at]
test_data = df.iloc[split_at:]
df.head()

Unnamed: 0,value,tweet
9878,0,@Freedom_speech2 If #Palmyra fall they will c...
20947,1,"Will The Chip Work? Standards, tools and secur..."
16549,0,Poem by Ibn Kalakis
19041,1,"@OllieBeavis shame on the boyfriends, brothers..."
33502,1,@NolteNC right #GOPe and #dem polls


In [8]:
def perform(classifiers, vectorizers, train_data, test_data):
    """Score every classifier/vectorizer pairing and report the best one.

    For each pairing, the vectorizer is fitted on ``train_data.tweet``, the
    classifier is fitted on the resulting features with ``train_data.value``
    as the target, and the fitted pair is scored on ``test_data``. One line is
    printed per pairing, followed by a banner naming the pairing with the
    highest score (the first one encountered wins ties).

    Args:
        classifiers: collection of estimators exposing ``fit(X, y)`` and
            ``score(X, y)``. Each instance is refitted in place once per
            vectorizer.
        vectorizers: re-iterable collection (it is looped over once per
            classifier) of objects exposing ``fit_transform(texts)`` and
            ``transform(texts)``.
        train_data: object with ``tweet`` and ``value`` attributes, used for
            fitting.
        test_data: object with ``tweet`` and ``value`` attributes, used for
            scoring.

    Returns:
        None. All results are written to standard output.
    """
    max_score = 0
    max_name = 0  # placeholder; printed as-is if no pairing scores above 0
    for classifier in classifiers:
        for vectorizer in vectorizers:

            # train
            vectorize_text = vectorizer.fit_transform(train_data.tweet)
            classifier.fit(vectorize_text, train_data.value)

            # score
            vectorize_text = vectorizer.transform(test_data.tweet)
            score = classifier.score(vectorize_text, test_data.value)
            name = classifier.__class__.__name__ + ' with ' + vectorizer.__class__.__name__
            print(name, score)

            # Track the best pairing inside the inner loop so that every
            # pairing is considered, not only the last vectorizer tried for
            # each classifier.
            if score > max_score:
                max_score = score
                max_name = name

    separator = '==========================================='
    print(separator)
    print(separator)
    print(max_name, max_score)
    print(separator)
    print(separator)

List of the classifiers we are going to compare.

In [9]:
# Candidate estimators, each with library-default settings unless arguments
# are given. The order below is the order in which results are printed.
classifiers = [
    # naive Bayes
    BernoulliNB(),
    # ensemble methods
    RandomForestClassifier(n_estimators=100, n_jobs=-1),
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    # single decision tree
    DecisionTreeClassifier(),
    # calibration wrapper and dummy classifier
    CalibratedClassifierCV(),
    DummyClassifier(),
    # linear models
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    # one-vs-rest wrappers
    OneVsRestClassifier(SVC(kernel='linear')),
    OneVsRestClassifier(LogisticRegression()),
    # nearest neighbours
    KNeighborsClassifier(),
]

List of the text vectorizers we are going to compare.

In [10]:
# Text-to-feature transformers, all with default settings; every classifier is
# paired with each of them in this order.
vectorizers = [
    CountVectorizer(),
    TfidfVectorizer(),
    HashingVectorizer(),
]

Run every classifier/vectorizer combination, print the test-set score of each, and print the best-scoring combination at the end.

In [11]:
# Fit on the training split and score on the test split for every pairing.
perform(classifiers, vectorizers, train_data, test_data)

BernoulliNB with CountVectorizer 0.7538564901908089
BernoulliNB with TfidfVectorizer 0.7538564901908089
BernoulliNB with HashingVectorizer 0.5348562214458479
RandomForestClassifier with CountVectorizer 0.8778016662187584
RandomForestClassifier with TfidfVectorizer 0.8663531308787961
RandomForestClassifier with HashingVectorizer 0.8401236226820747
AdaBoostClassifier with CountVectorizer 0.8816715936576189
AdaBoostClassifier with TfidfVectorizer 0.8227358237033056
AdaBoostClassifier with HashingVectorizer 0.844638538027412
BaggingClassifier with CountVectorizer 0.8720236495565709
BaggingClassifier with TfidfVectorizer 0.7860521365224402
BaggingClassifier with HashingVectorizer 0.8699811878527277
ExtraTreesClassifier with CountVectorizer 0.8911045417898414
ExtraTreesClassifier with TfidfVectorizer 0.8743348562214458
ExtraTreesClassifier with HashingVectorizer 0.8492340768610589
GradientBoostingClassifier with CountVectorizer 0.8806772373018006
GradientBoostingClassifier with TfidfVectoriz

In [14]:
# Train the final model: a one-vs-rest linear SVC (with probability estimates
# enabled) on TF-IDF features of the training tweets.
# NOTE(review): presumably chosen as the best pairing from the comparison
# above; the visible comparison output is truncated, so confirm.
Classifier = OneVsRestClassifier(SVC(kernel='linear', probability=True))
Vectorizer = TfidfVectorizer()
vectorize_text = Vectorizer.fit_transform(train_data.tweet)
Classifier.fit(vectorize_text, train_data.value)

# Export the fitted classifier and vectorizer to pickle files. The `with`
# blocks guarantee each file is flushed and closed, even if dumping fails.
with open('model/classifier.pkl', 'wb') as classifier_file:
    pickle.dump(Classifier, classifier_file)
with open('model/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(Vectorizer, vectorizer_file)

In [None]:
# Classify one example sentence with the fitted vectorizer and classifier.
Tweet = 'And her woe began to run afresh,'
vectorize_message = Vectorizer.transform([Tweet])
predicted_labels = Classifier.predict(vectorize_message)
predict = predicted_labels[0]  # a single text was passed, so keep its label

In [None]:
# Print a readable name for the predicted label: 0 is reported as 'Terror',
# anything else as 'Normal'.
# NOTE(review): confirm this mapping against the dataset's `value` encoding.
if predict == 0:
    print('Terror')
else:
    print('Normal')

Normal
