In [21]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd


In [22]:
# Workaround that keeps the kernel from stopping while XGBoost is running.
# NOTE(review): presumably this tolerates a second copy of the OpenMP runtime
# being loaded into the process — confirm it is still needed on this machine.
import os

os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

<h3>Load in DF</h3>

In [23]:
# Load the article dataset (including the gensim summaries) from the working directory.
csv_path = 'df_with_gensim_summaries.csv'
df = pd.read_csv(csv_path)

In [24]:
# Drop the two 'Unnamed' columns (presumably leftover index columns from earlier
# CSV exports). errors='ignore' keeps this cell safe to re-run: once the columns
# are gone, a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1.1'], errors='ignore')

In [52]:
# Preview the first five rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,title,content,category,gensim_summary,first_100,sent_tokenized
0,Agent Cooper in Twin Peaks is the audience: on...,And never more so than in Showtime’s new...,Longform,"In the second season finale, back in 1991, the...",And never more so than in Showtime’s new serie...,[' And never more so than in Showtime’s n...
1,"AI, the humanity!",AlphaGo’s victory isn’t a defeat for hum...,Longform,When speaking to DeepMind and Google developer...,AlphaGo’s victory isn’t a defeat for humans — ...,[' AlphaGo’s victory isn’t a defeat for h...
2,Massive attack,How a weapon against war became a weapon...,Longform,International visitors for the event are commo...,How a weapon against war became a weapon again...,[' How a weapon against war became a weap...
3,Brain drain,Genius quietly laid off a bunch of its e...,Longform,"In a post on the Genius blog at the time, co-f...",Genius quietly laid off a bunch of its enginee...,[' Genius quietly laid off a bunch of its...
4,Facebook takes flight,Inside the test flight of Facebook’s fir...,Longform,But if your goal is to stay in the air for a l...,Inside the test flight of Facebook’s first int...,[' Inside the test flight of Facebook’s f...


<h3>Check for null values &amp; drop rows</h3>

In [108]:
# Inspect the first article's text. Use positional .iloc[0] rather than the
# label 0: rows are dropped from this frame, so its index has gaps and a
# label-based lookup only works for labels that happen to survive.
df['content'].iloc[0]

'      And never more so than in Showtime’s new series revival Some spoilers ahead through episode 4 of season 3 of Twin Peaks. On May 21st, Showtime brought back David Lynch’s groundbreaking TV series Twin Peaks, and fulfilled a prophecy in the process. In the second season finale, back in 1991, the spirit of series-defining murder victim Laura Palmer told FBI special agent and series protagonist Dale Cooper, “I’ll see you again in 25 years.” That clip plays again in the first episode of Lynch’s Twin Peaks revival, as a reminder that decades have in fact gone by, Laura’s promise has been carried out, and a series canceled mid-story is back on the air.A lot has changed in 25 years. The original cast members, who are mostly back on board, have all aged heavily and visibly. Many of the characters have moved on in life, getting new jobs, forming families, or taking up new obsessions. But in the opening episode, Dale Cooper was still where the show left him in 1991: trapped in the spirit d

In [26]:
# Count missing values per column.
df.isnull().sum()

title             0
content           0
category          0
gensim_summary    0
first_100         0
sent_tokenized    0
dtype: int64

In [25]:
# Keep only the rows that carry a category label.
has_category = df['category'].notnull()
df = df.loc[has_category]

In [27]:
# (rows, columns) of the frame used for modelling.
df.shape

(84965, 6)

<h3>Label Encoding</h3>

In [28]:
# Create a label encoder object; it is fitted on df.category further down and
# its sorted class list is used as the display names in the classification reports.
le = preprocessing.LabelEncoder()

In [29]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84965 entries, 0 to 97063
Data columns (total 6 columns):
title             84965 non-null object
content           84965 non-null object
category          84965 non-null object
gensim_summary    84965 non-null object
first_100         84965 non-null object
sent_tokenized    84965 non-null object
dtypes: object(6)
memory usage: 4.5+ MB


In [30]:
# Learn the set of category labels from the full dataframe column.
le.fit(df['category'])

LabelEncoder()

In [31]:
# Class names in the encoder's order; reused as target_names in the reports below.
targets = le.classes_.tolist()
print(targets)

['Book Reviews', 'Exclusive', 'Longform', 'Reports', 'broadcast', 'business', 'general', 'newspaper', 'radio', 'wire']


In [32]:
# Apply the fitted encoder to the pandas column.
# Display only: the encoded array is not assigned anywhere, and the models
# below are trained on the original string labels in df.category.
le.transform(df['category']) 

array([2, 2, 2, ..., 7, 7, 7])

<h3>Train Test Split</h3>

In [33]:
# Features are the raw article text; the target is the string category label.
X, y = df['content'], df['category']

# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

<h3>Naive Bayes Classifier</h3>

In [39]:
# Bag-of-words counts -> tf-idf weighting -> multinomial Naive Bayes.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
nb = Pipeline(steps=nb_steps)
nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [44]:
%%time

y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names = targets))

accuracy 0.5535896429972538


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

Book Reviews       0.00      0.00      0.00         3
   Exclusive       0.00      0.00      0.00        22
    Longform       0.00      0.00      0.00         3
     Reports       0.00      0.00      0.00       194
   broadcast       0.00      0.00      0.00      1590
    business       0.00      0.00      0.00       193
     general       0.97      0.02      0.04      5618
   newspaper       0.54      1.00      0.70     13533
       radio       0.00      0.00      0.00      1681
        wire       1.00      0.18      0.30      2653

   micro avg       0.55      0.55      0.55     25490
   macro avg       0.25      0.12      0.10     25490
weighted avg       0.61      0.55      0.41     25490

CPU times: user 30.4 s, sys: 748 ms, total: 31.1 s
Wall time: 31.2 s


<h3>Logistic Regression Classifier</h3>

In [45]:
from sklearn.linear_model import LogisticRegression

# Same counts -> tf-idf front end as the Naive Bayes pipeline, with logistic
# regression on top. C is the inverse regularization strength, so C=1e5 fits
# the model with almost no regularization.
logreg_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(n_jobs=1, C=1e5)),
]
logreg = Pipeline(steps=logreg_steps)
logreg.fit(X_train, y_train)

UsageError: Line magic function `%%time` not found.


In [46]:
%%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=targets))

accuracy 0.8925853275794429
              precision    recall  f1-score   support

Book Reviews       0.00      0.00      0.00         3
   Exclusive       0.00      0.00      0.00        22
    Longform       0.00      0.00      0.00         3
     Reports       0.77      0.53      0.63       194
   broadcast       0.85      0.73      0.78      1590
    business       0.84      0.51      0.63       193
     general       0.85      0.87      0.86      5618
   newspaper       0.91      0.94      0.92     13533
       radio       0.79      0.67      0.73      1681
        wire       1.00      1.00      1.00      2653

   micro avg       0.89      0.89      0.89     25490
   macro avg       0.60      0.52      0.56     25490
weighted avg       0.89      0.89      0.89     25490

CPU times: user 32.4 s, sys: 811 ms, total: 33.2 s
Wall time: 33.7 s


<h3>Random Forest Classifier</h3>

In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Earlier standalone attempt, kept for reference only.
# NOTE(review): X_train holds raw article text, so this would presumably need
# vectorized features before it could run — confirm or delete.
# text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)  
# text_classifier.fit(X_train, y_train)

In [60]:
# Set up the parameter dictionary for the grid search.
# 3 * 4 * 5 * 5 = 300 combinations; with 3-fold CV that is 900 forest fits,
# so expect a very long run on the full tf-idf matrix.
param_grid = {
    'n_estimators': [100, 200, 300],
    # 'sqrt' is what 'auto' meant for classifiers; newer scikit-learn releases
    # no longer accept 'auto', so spell it out.
    'max_features': ['sqrt', 0.25, 0.33, 0.5],
    'max_depth': [None, 5, 6, 7, 8],
    # floats are fractions of the training samples, ints are absolute counts
    'min_samples_leaf': [0.03, 0.04, 0.05, 1, 2],
}

In [61]:
# Base estimator for the grid search. Seeded so repeated runs build the same
# forests (same seed as the train/test split).
rfc = RandomForestClassifier(random_state=42)

In [62]:
# Grid search over param_grid with 3-fold cross-validation, run in parallel
# (n_jobs=-1).
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)

In [63]:
# Counts -> tf-idf -> grid-searched random forest. The grid search is the final
# pipeline step, so it is fitted on the tf-idf features of the training text.
rf_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', CV_rfc),
]
rf = Pipeline(steps=rf_steps)
rf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Check the best parameters found by the grid search.
CV_rfc.best_params_

In [None]:
# f1_score is not part of the notebook's top-level metric imports, so import it here.
from sklearn.metrics import f1_score

# Use the fitted pipeline to predict on the test set. X_test holds raw text, so
# it has to pass through the pipeline's vectorizer and tf-idf steps; the bare
# best_estimator_ only accepts the already-transformed feature matrix.
rfc_pred = rf.predict(X_test)
# check accuracy on test set
print('Test Accuracy score: ', accuracy_score(y_test, rfc_pred))
# check F1 of test set; the target is multiclass, so an averaging mode must be
# given (the default 'binary' raises for more than two classes)
print('Test F1 score: ', f1_score(y_test, rfc_pred, average='weighted'))

In [None]:
# Print the confusion matrix of the random forest predictions on the test set.
# NOTE(review): the variable name mentions SMOTE, but no oversampling step is
# visible in this notebook — the name looks carried over from elsewhere; confirm.
cnf_matrix_rf_smote_CV = confusion_matrix(y_test, rfc_pred)
print('Confusion Matrix:\n',cnf_matrix_rf_smote_CV)

In [None]:
# save the fitted pipeline (vectorizer, tf-idf and grid-searched forest) to disk
import pickle  # needed here: the notebook's own `import pickle` cell comes later

filename = 'rfc_model.sav'
# the context manager closes the file handle even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

<h3>XGBoost Classifier</h3>

In [15]:
from xgboost import XGBClassifier

In [46]:
# Counts -> tf-idf -> gradient-boosted trees with XGBoost's default settings.
xgb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', XGBClassifier()),
]
xgb = Pipeline(steps=xgb_steps)
xgb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [47]:
%%time

y_pred = xgb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=targets))

accuracy 0.8858768144370341


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

Book Reviews       0.00      0.00      0.00         3
   Exclusive       0.00      0.00      0.00        22
    Longform       0.00      0.00      0.00         3
     Reports       0.82      0.48      0.61       194
   broadcast       0.86      0.91      0.88      1590
    business       0.92      0.64      0.76       193
     general       0.94      0.74      0.83      5618
   newspaper       0.86      0.97      0.91     13533
       radio       0.82      0.63      0.71      1681
        wire       1.00      1.00      1.00      2653

   micro avg       0.89      0.89      0.89     25490
   macro avg       0.62      0.54      0.57     25490
weighted avg       0.89      0.89      0.88     25490

CPU times: user 37.9 s, sys: 1.64 s, total: 39.6 s
Wall time: 41.6 s


In [48]:
import pickle

In [50]:
# save the fitted XGBoost pipeline to disk
filename = 'xgboost_model.sav'
# the context manager guarantees the file is flushed and closed, even on error
with open(filename, 'wb') as model_file:
    pickle.dump(xgb, model_file)

In [51]:
# Display the fitted pipeline's steps and their parameters.
xgb

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

<h3>Keras Neural Network</h3>

In [64]:
import itertools
import os
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

Using TensorFlow backend.


In [87]:
# Fresh train/test split for the Keras model: 30% held out, seeded with 40.
X, y = df['content'], df['category']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=40
)

In [88]:
# vocabulary size: the tokenizer is capped at this many words
max_words = 1000

# word-level (not character-level) tokenizer limited to `max_words`
tokenize = text.Tokenizer(char_level=False, num_words=max_words)

# build the word index from the training texts only, so nothing from the test
# split influences the vocabulary
tokenize.fit_on_texts(X_train)

In [89]:
# Turn each document into a fixed-width row vector (one column per vocabulary
# slot) that can be fed to the dense network; train first, then test.
x_train, x_test = (tokenize.texts_to_matrix(docs) for docs in (X_train, X_test))

In [90]:
# Encode the string labels as integers.
# Fit on the full label column `y` rather than y_train alone: some categories
# are rare enough that a label can end up only in the test split, and
# transform() raises on labels the encoder has not seen. Labels are not model
# inputs, so fitting on all of them leaks nothing into training.
encoder = LabelEncoder()
encoder.fit(y)

# transform labels
y_train = encoder.transform(y_train)
y_test = encoder.transform(y_test)

In [91]:
# Take the class count from the fitted encoder instead of the largest label
# value seen in y_train, so the one-hot width always matches the encoder's
# label set.
num_classes = len(encoder.classes_)
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [101]:
# Training hyperparameters: batch_size is used by model.fit and model.evaluate,
# epochs by model.fit.
batch_size = 32
epochs = 10

In [102]:
# Build the model: one 512-unit ReLU hidden layer with 50% dropout, followed by
# a softmax output with one unit per class.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

In [103]:
# Categorical cross-entropy pairs with the one-hot labels produced by
# to_categorical; accuracy is tracked as the reported metric.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [104]:
# Train the network, setting aside 10% of the training data for validation.
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_split=0.1,
)
history = model.fit(x_train, y_train, **fit_options)

Train on 53527 samples, validate on 5948 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [105]:
# Evaluate on the held-out test matrix. `score` is [loss, accuracy]: the loss
# comes first, followed by the single metric requested in compile().
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])

Test accuracy: 0.9176539819396772


In [107]:
# [test loss, test accuracy] as returned by model.evaluate.
score

[0.2788812780559133, 0.9176539819396772]

In [106]:
# Save the trained network to disk.
# `history` is only the training log returned by fit(), not the model, so
# pickling it does not produce a reloadable classifier. Use Keras' own
# serializer, which stores the architecture and weights in an HDF5 file.
# NOTE(review): the fitted `tokenize` and `encoder` objects are not saved here
# and would be needed to score new text — persist them separately if required.
filename = 'keras_model.h5'
model.save(filename)