# Imports

In [29]:
import matplotlib
%matplotlib inline
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import metrics
import numpy as np
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate

# Dataset

In [31]:
df = pd.read_csv('~/Documents/wagon_data/data.csv')

In [32]:
del df['Unnamed: 0']

In [36]:
df.head()

Unnamed: 0,title,text,label
0,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,0
1,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",0
2,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,0
3,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,0
4,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",0


In [34]:
df_sample = df.sample(frac=0.3, random_state=3) 

In [37]:
df_sample.shape

(19489, 3)

In [38]:
df_sample['label'].value_counts()

0    10944
1     8545
Name: label, dtype: int64

In [39]:
df_sample = df_sample.reset_index(drop=True)

# X & y 

In [40]:
y = df_sample['label']

In [41]:
X = df_sample.drop('label',axis=1)

# Preprocessing

In [42]:
def clean(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ')  # Remove Punctuation
    lowercased = text.lower()  # Lower Case
    tokenized = word_tokenize(lowercased)  # Tokenize
    words_only = [word for word in tokenized if word.isalpha()
                  ]  # Remove numbers
    stop_words = set(stopwords.words('english'))  # Make stopword list
    # Remove Stop Words
    without_stopwords = [word for word in words_only if not word in stop_words]
    lemma = WordNetLemmatizer()  # Initiate Lemmatizer
    lemmatized = [lemma.lemmatize(word)
                  for word in without_stopwords]  # Lemmatize
    return lemmatized

In [43]:
df_sample_text = df_sample['text'].apply(lambda text: clean(text))

In [None]:
df_sample_text_joined = df_sample_text.apply(lambda x: " ".join(x))

In [None]:
df_sample_title = df_sample['title'].apply(lambda text: clean(text))

In [None]:
df_sample_title_joined = df_sample_title.apply(lambda x: " ".join(x))

In [None]:
df_smaple_title_joined

# Vectorizer

In [20]:
## TFidf Vectorizer
tfidf_vec=TfidfVectorizer(max_features=10000,ngram_range=(1,3))

In [21]:
df_text = tfidf_vec.fit_transform(df_sample_text_joined).toarray()

In [22]:
df_title = tfidf_vec.fit_transform(df_sample_title_joined).toarray()

In [23]:
X_tfidf = np.hstack(( df_title, df_text))

In [20]:
X_tfidf.shape

(19489, 20000)

# Train Test Set

In [44]:
## Divide the dataset into Train and Test TFidf Vectorizer
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=0)

# Model

In [22]:
#PassiveAgressiveClassifier

In [46]:
linear_clf = PassiveAggressiveClassifier(max_iter=50)

In [47]:
linear_clf.fit(X_train, y_train)
pred_test = linear_clf.predict(X_test)
pred_train = linear_clf.predict(X_train)

In [51]:
linear_clf.score(X_test, y_test)

0.9500598597571405

In [25]:
pred_train

array([0, 0, 0, ..., 0, 0, 1])

In [26]:
X_train[2]

array([0., 0., 0., ..., 0., 0., 0.])

In [27]:
confusion_matrix(y_test, pred_test)

array([[3128,  170],
       [ 140, 2409]])

In [28]:
confusion_matrix(y_train, pred_train)

array([[7646,    0],
       [   0, 5996]])

In [29]:
score_pac = metrics.accuracy_score(y_test, pred_test)
print("accuracy:   %0.3f" % score_pac)

accuracy:   0.947


In [30]:
from sklearn.metrics import classification_report

In [31]:
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7646
           1       1.00      1.00      1.00      5996

   micro avg       1.00      1.00      1.00     13642
   macro avg       1.00      1.00      1.00     13642
weighted avg       1.00      1.00      1.00     13642



In [32]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95      3298
           1       0.93      0.95      0.94      2549

   micro avg       0.95      0.95      0.95      5847
   macro avg       0.95      0.95      0.95      5847
weighted avg       0.95      0.95      0.95      5847



# Example test 

In [131]:
df['title'][30000]

' Democratic Leadership BLASTS Trump And Republicans For Targeting Medicare'

In [132]:
df['text'][30000]



In [133]:
df['label'][30000]

1

In [134]:
text_1 = clean(df['text'][30000])

In [135]:
text_joined_1 = " ".join(text_1)

In [136]:
title_1 = clean(df['title'][30000])

In [137]:
title_joined_1 = " ".join(title_1)

In [138]:
df_text_1 = tfidf_vec.transform([text_joined_1]).toarray() 

In [139]:
df_title_1 = tfidf_vec.transform([title_joined_1]).toarray()

In [140]:
X_1 = np.hstack((df_title_1, df_text_1))

In [141]:
X_1

array([[0., 0., 0., ..., 0., 0., 0.]])

In [142]:
linear_clf.predict(X_1)

array([1])