In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
data_stem = pickle.load(open('stemmed.pkl', "rb"))

In [7]:
data_stem[0]

'car wonder enlighten car saw dai door sport car look late earli call bricklin door small addit bumper separ rest bodi know tellm model engin spec year product car histori info funki look car mail thank'

In [3]:
df = pd.DataFrame(data_stem, columns = ['preprocessed'])
df.head()

Unnamed: 0,preprocessed
0,car wonder enlighten car saw dai door sport ca...
1,clock poll final final clock report acceler cl...
2,question folk mac plu final gave ghost weekend...
3,weitek robert kyanko rob rjck uucp wrote abrax...
4,shuttl launch question articl cowcb world std ...


In [4]:
news_group = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

In [30]:
news_group.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [5]:
df['target'] = news_group['target']

In [6]:
df['target_names'] = news_group['target_names']

In [7]:
df.head()

Unnamed: 0,preprocessed,target,target_names
0,car wonder enlighten car saw dai door sport ca...,7,rec.autos
1,clock poll final final clock report acceler cl...,4,comp.sys.mac.hardware
2,question folk mac plu final gave ghost weekend...,4,comp.sys.mac.hardware
3,weitek robert kyanko rob rjck uucp wrote abrax...,1,comp.graphics
4,shuttl launch question articl cowcb world std ...,14,sci.space


In [8]:
filtered = df[df['target_names'].isin(['soc.religion.christian', 'rec.sport.hockey', 'talk.politics.mideast', 'rec.motorcycles'])]

In [14]:
filtered.reset_index()

Unnamed: 0,index,preprocessed,target,target_names
0,10,recommend duc worth ducati gt line ducati gt m...,8,rec.motorcycles
1,21,nhl team captain articl apr samba oit unc edu ...,10,rec.sport.hockey
2,28,pantheism environment articl apr atho rutger e...,15,soc.religion.christian
3,33,isra expans lust articl spam math adelaid edu ...,17,talk.politics.mideast
4,35,goali mask articl netnew upenn edu kkeller mai...,10,rec.sport.hockey
...,...,...,...,...
2356,11298,nhl team milwauke read report possibl nhl move...,10,rec.sport.hockey
2357,11299,turkei cypru bosnia serbia greec armenia azeri...,17,talk.politics.mideast
2358,11300,arrog christian previou articl phsd vaxc monas...,15,soc.religion.christian
2359,11305,hezbollah apr yuma acn colost edu repli long l...,17,talk.politics.mideast


In [27]:
import numpy as np
np.array(filtered.target)

array([ 8, 10, 15, ..., 15, 17,  8], dtype=int64)

In [10]:
vectorizer = TfidfVectorizer(max_df=0.7, min_df=0.1, smooth_idf=False) # TF_IDF2

In [11]:
data_stem2 = pd.DataFrame(vectorizer.fit_transform(filtered['preprocessed']).toarray())

In [12]:
data_stem2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.677363,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.280832,0.0,0.000000,0.00000,0.000000,0.000000
1,0.166164,0.133056,0.000000,0.0,0.220015,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.117268,0.000000
2,0.090753,0.072670,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.136545,0.000000,...,0.000000,0.000000,0.133939,0.000000,0.000000,0.0,0.000000,0.00000,0.064048,0.000000
3,0.174473,0.209564,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.120475,0.000000,0.000000,0.000000,0.209820,0.0,0.000000,0.00000,0.184698,0.000000
4,0.000000,0.136935,0.000000,0.0,0.000000,0.260950,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.120687,0.196256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2356,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000
2357,0.169755,0.135931,0.082243,0.0,0.074923,0.000000,0.087828,0.000000,0.000000,0.000000,...,0.234435,0.000000,0.000000,0.194958,0.000000,0.0,0.000000,0.07963,0.079868,0.000000
2358,0.029449,0.047163,0.128408,0.0,0.194965,0.044938,0.000000,0.000000,0.000000,0.045318,...,0.040670,0.043904,0.217314,0.000000,0.035415,0.0,0.038141,0.00000,0.020783,0.000000
2359,0.183103,0.048873,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.093924,...,0.000000,0.000000,0.000000,0.070096,0.000000,0.0,0.000000,0.00000,0.043074,0.140091


In [28]:
data_stem2['target2'] = np.array(filtered.target)

In [32]:
data_stem2.drop('target', axis =1, inplace=True)

In [34]:
data_stem2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,target2
0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.677363,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.280832,0.0,0.000000,0.00000,0.000000,0.000000,8
1,0.166164,0.133056,0.000000,0.0,0.220015,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.117268,0.000000,10
2,0.090753,0.072670,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.136545,0.000000,...,0.000000,0.133939,0.000000,0.000000,0.0,0.000000,0.00000,0.064048,0.000000,15
3,0.174473,0.209564,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.209820,0.0,0.000000,0.00000,0.184698,0.000000,17
4,0.000000,0.136935,0.000000,0.0,0.000000,0.260950,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.120687,0.196256,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2356,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,10
2357,0.169755,0.135931,0.082243,0.0,0.074923,0.000000,0.087828,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.194958,0.000000,0.0,0.000000,0.07963,0.079868,0.000000,17
2358,0.029449,0.047163,0.128408,0.0,0.194965,0.044938,0.000000,0.000000,0.000000,0.045318,...,0.043904,0.217314,0.000000,0.035415,0.0,0.038141,0.00000,0.020783,0.000000,15
2359,0.183103,0.048873,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.093924,...,0.000000,0.000000,0.070096,0.000000,0.0,0.000000,0.00000,0.043074,0.140091,17


In [37]:
def text_train(df):
 return train_test_split(df.iloc[:, :-2], df.target2,test_size = 0.20, random_state = 12)

In [38]:
docs_train_s, docs_test_s, y_train_s, y_test_s = text_train(data_stem2)

In [39]:
clf = MultinomialNB()
clf.fit(docs_train_s, y_train_s)

MultinomialNB()

In [40]:
 y_pred_s = clf.predict(docs_test_s)

In [41]:
clf.score(docs_train_s, y_train_s)


0.868114406779661

In [42]:
accuracy_score(y_pred_s, y_test_s)


0.8372093023255814

In [44]:
print(classification_report(y_pred_s, y_test_s))

              precision    recall  f1-score   support

           8       0.87      0.82      0.84       143
          10       0.87      0.92      0.90       112
          15       0.87      0.79      0.83       116
          17       0.73      0.82      0.77       102

    accuracy                           0.84       473
   macro avg       0.84      0.84      0.84       473
weighted avg       0.84      0.84      0.84       473



In [45]:
clf2 = RandomForestClassifier(random_state = 42)
clf2.fit(docs_train_s, y_train_s)


RandomForestClassifier(random_state=42)

In [46]:
clf2.score(docs_train_s, y_train_s)

0.9936440677966102

In [58]:
y_pred_s2 = clf2.predict(docs_test_s)

In [49]:
accuracy_score(y_pred_s2, y_test_s)

0.86892177589852

In [51]:
print(classification_report(y_pred_s2, y_test_s))

              precision    recall  f1-score   support

           8       0.89      0.88      0.88       135
          10       0.91      0.86      0.88       124
          15       0.84      0.86      0.85       103
          17       0.83      0.86      0.85       111

    accuracy                           0.87       473
   macro avg       0.87      0.87      0.87       473
weighted avg       0.87      0.87      0.87       473



In [52]:
param_grid = {'min_samples_leaf': [5,10], 'n_estimators': [3,5]}
rf = RandomForestClassifier(random_state = 42)
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 10)
grid_search.fit(docs_train_s, y_train_s)

GridSearchCV(cv=10, estimator=RandomForestClassifier(random_state=42),
             param_grid={'min_samples_leaf': [5, 10], 'n_estimators': [3, 5]})

In [53]:
grid_search.best_params_

{'min_samples_leaf': 5, 'n_estimators': 5}

In [62]:
clf3 = grid_search.best_estimator_

In [63]:
y_pred_s3 = clf3.predict(docs_test_s)

In [64]:
accuracy_score(y_pred_s3, y_test_s)

0.8245243128964059