In [18]:
import pandas as pd
import numpy as np

In [31]:
# Modelling Algorithms
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron

# Modelling Helpers
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

from sklearn.feature_extraction.text import HashingVectorizer

# Computations
import itertools

# Visualization
import matplotlib.pyplot as plt

In [7]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [8]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [9]:
train.shape,test.shape

((20800, 5), (5200, 4))

In [10]:
test=test.fillna(' ')
train=train.fillna(' ')

In [11]:
train_data = (train['title'] + ' ' + train['author'] + ' ' + train['text']).values
test_data = (test['title'] + ' ' + test['author'] + ' ' + test['text']).values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train.label, test_size=0.20, random_state=0)

# **Vectorize**

In [19]:
# Initialize the `count_vectorizer | Bag of words` 
count_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english') 
# Fit and transform the training data.
count_train = count_vectorizer.fit_transform(X_train)
# Transform the test set 
count_test = count_vectorizer.transform(X_test)

In [16]:
#Initialize the `tfidf_vectorizer` 
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
#Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
#Transform the test set 
tfidf_test = tfidf_vectorizer.transform(X_test)

# **Model**

**MultinomialNB**

In [20]:
for alpha in np.arange(0,1,.05):
    nb_classifier_tune = MultinomialNB(alpha=alpha)
    nb_classifier_tune.fit(count_train, y_train)
    pred_tune = nb_classifier_tune.predict(count_test)
    score = metrics.accuracy_score(y_test, pred_tune)
    print("Alpha: {:.2f} Score: {:.5f}".format(alpha, score))

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha: 0.00 Score: 0.91106
Alpha: 0.05 Score: 0.94183
Alpha: 0.10 Score: 0.94255
Alpha: 0.15 Score: 0.94279
Alpha: 0.20 Score: 0.94231
Alpha: 0.25 Score: 0.94111
Alpha: 0.30 Score: 0.93990
Alpha: 0.35 Score: 0.93822
Alpha: 0.40 Score: 0.93750
Alpha: 0.45 Score: 0.93630
Alpha: 0.50 Score: 0.93510
Alpha: 0.55 Score: 0.93486
Alpha: 0.60 Score: 0.93341
Alpha: 0.65 Score: 0.93149
Alpha: 0.70 Score: 0.93029
Alpha: 0.75 Score: 0.92909
Alpha: 0.80 Score: 0.92788
Alpha: 0.85 Score: 0.92644
Alpha: 0.90 Score: 0.92428
Alpha: 0.95 Score: 0.92308


In [21]:
for alpha in np.arange(0,0.1,.01):
    nb_classifier_tune = MultinomialNB(alpha=alpha)
    nb_classifier_tune.fit(tfidf_train, y_train)
    pred_tune = nb_classifier_tune.predict(tfidf_test)
    score = metrics.accuracy_score(y_test, pred_tune)
    print("Alpha: {:.2f}  Score: {:.5f}".format(alpha, score))

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha: 0.00  Score: 0.90649
Alpha: 0.01  Score: 0.94375
Alpha: 0.02  Score: 0.94375
Alpha: 0.03  Score: 0.94159
Alpha: 0.04  Score: 0.93966
Alpha: 0.05  Score: 0.93822
Alpha: 0.06  Score: 0.93462
Alpha: 0.07  Score: 0.93317
Alpha: 0.08  Score: 0.93005
Alpha: 0.09  Score: 0.92861


**PassiveAggressiveClassifier**

In [24]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_classifier = PassiveAggressiveClassifier(max_iter=10)
linear_classifier.fit(count_train, y_train)
pred_linear_count = linear_classifier.predict(count_test)
acc_linear_count = metrics.accuracy_score(y_test, pred_linear_count)
print(acc_linear_count)


0.9668269230769231




In [25]:
linear_classifier = PassiveAggressiveClassifier(max_iter=10)
linear_classifier.fit(tfidf_train, y_train)
pred_linear_tfidf = linear_classifier.predict(tfidf_test)
acc_linear_tfidf = metrics.accuracy_score(y_test, pred_linear_tfidf)
print(acc_linear_tfidf)

0.98125


**SGDClassifier**

In [36]:
sgd_classifier = SGDClassifier(max_iter=10)
sgd_classifier.fit(count_train, y_train)
pred_sgd_cv = sgd_classifier.predict(count_test)
acc_sgd_cv= metrics.accuracy_score(y_test, pred_sgd_cv)
print(acc_sgd_cv)

0.9704326923076924




In [35]:
sgd_classifier = SGDClassifier(max_iter=10)
sgd_classifier.fit(tfidf_train, y_train)
pred_sgd_tfidf = sgd_classifier.predict(tfidf_test)
acc_sgd_tfidf = metrics.accuracy_score(y_test, pred_sgd_tfidf)
print(acc_sgd_tfidf)

0.9764423076923077




**Perceptron**

In [34]:
per_classifier = Perceptron(max_iter=10)
per_classifier.fit(count_train, y_train)
pred_per_cv = per_classifier.predict(count_test)
acc_per_cv = metrics.accuracy_score(y_test, pred_per_cv)
print(acc_per_cv)

0.9617788461538461




In [33]:
per_classifier = Perceptron(max_iter=10)
per_classifier.fit(tfidf_train, y_train)
pred_per_tfidf = per_classifier.predict(tfidf_test)
acc_per_tfidf = metrics.accuracy_score(y_test, pred_per_tfidf)
print(acc_per_tfidf)

0.9774038461538461
