In [21]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd


In [22]:
#This code prevents the kernel from stopping when XGBoost is running
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

<h3>Load in DF

In [23]:
df = pd.read_csv('df_with_gensim_summaries.csv')

In [24]:
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1.1'], axis = 1)

<h3>Check for null values drop rows

In [25]:
df = df.dropna(subset = ['category'])

In [26]:
df.isnull().sum()

title             0
content           0
category          0
gensim_summary    0
first_100         0
sent_tokenized    0
dtype: int64

In [27]:
df.shape

(84965, 6)

<h3>Label Encoding

In [28]:
# Create a label encoder object
le = preprocessing.LabelEncoder()

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 84965 entries, 0 to 97063
Data columns (total 6 columns):
title             84965 non-null object
content           84965 non-null object
category          84965 non-null object
gensim_summary    84965 non-null object
first_100         84965 non-null object
sent_tokenized    84965 non-null object
dtypes: object(6)
memory usage: 4.5+ MB


In [30]:
# Fit the encoder to the pandas column
le.fit(df.category)

LabelEncoder()

In [31]:
targets = list(le.classes_)
print(targets)

['Book Reviews', 'Exclusive', 'Longform', 'Reports', 'broadcast', 'business', 'general', 'newspaper', 'radio', 'wire']


In [32]:
# Apply the fitted encoder to the pandas column
le.transform(df['category']) 

array([2, 2, 2, ..., 7, 7, 7])

<h3>Train Test Split

In [33]:
X = df.content
y = df.category

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)


<h3>Naive Bayes Classifier

In [39]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [44]:
%%time

y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names = targets))

accuracy 0.5535896429972538


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

Book Reviews       0.00      0.00      0.00         3
   Exclusive       0.00      0.00      0.00        22
    Longform       0.00      0.00      0.00         3
     Reports       0.00      0.00      0.00       194
   broadcast       0.00      0.00      0.00      1590
    business       0.00      0.00      0.00       193
     general       0.97      0.02      0.04      5618
   newspaper       0.54      1.00      0.70     13533
       radio       0.00      0.00      0.00      1681
        wire       1.00      0.18      0.30      2653

   micro avg       0.55      0.55      0.55     25490
   macro avg       0.25      0.12      0.10     25490
weighted avg       0.61      0.55      0.41     25490

CPU times: user 30.4 s, sys: 748 ms, total: 31.1 s
Wall time: 31.2 s


<h3>Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)

UsageError: Line magic function `%%time` not found.


In [46]:
%%time

y_pred = logreg.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=targets))

accuracy 0.8925853275794429
              precision    recall  f1-score   support

Book Reviews       0.00      0.00      0.00         3
   Exclusive       0.00      0.00      0.00        22
    Longform       0.00      0.00      0.00         3
     Reports       0.77      0.53      0.63       194
   broadcast       0.85      0.73      0.78      1590
    business       0.84      0.51      0.63       193
     general       0.85      0.87      0.86      5618
   newspaper       0.91      0.94      0.92     13533
       radio       0.79      0.67      0.73      1681
        wire       1.00      1.00      1.00      2653

   micro avg       0.89      0.89      0.89     25490
   macro avg       0.60      0.52      0.56     25490
weighted avg       0.89      0.89      0.89     25490

CPU times: user 32.4 s, sys: 811 ms, total: 33.2 s
Wall time: 33.7 s


<h3>XG Boost

In [15]:
from xgboost import XGBClassifier

In [46]:
xgb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', XGBClassifier()),
              ])
xgb.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [47]:
%%time

y_pred = xgb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=targets))

accuracy 0.8858768144370341


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

Book Reviews       0.00      0.00      0.00         3
   Exclusive       0.00      0.00      0.00        22
    Longform       0.00      0.00      0.00         3
     Reports       0.82      0.48      0.61       194
   broadcast       0.86      0.91      0.88      1590
    business       0.92      0.64      0.76       193
     general       0.94      0.74      0.83      5618
   newspaper       0.86      0.97      0.91     13533
       radio       0.82      0.63      0.71      1681
        wire       1.00      1.00      1.00      2653

   micro avg       0.89      0.89      0.89     25490
   macro avg       0.62      0.54      0.57     25490
weighted avg       0.89      0.89      0.88     25490

CPU times: user 37.9 s, sys: 1.64 s, total: 39.6 s
Wall time: 41.6 s


In [48]:
import pickle

In [50]:
# save the model to disk
filename = 'xgboost_model.sav'
pickle.dump(xgb, open(filename, 'wb'))

In [50]:
X_train.shape

(59475,)

In [51]:
y_train.shape

(59475,)