In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Dataset for NLP
from sklearn.datasets import fetch_20newsgroups

In [2]:
train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers'))

In [3]:
train_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
df = pd.DataFrame(
    columns=['news', 'target']
)

In [5]:
df['news'] = train_data['data']
df['target'] = train_data['target']

In [6]:
df['news'].head()

0    I was wondering if anyone out there could enli...
1    A fair number of brave souls who upgraded thei...
2    well folks, my mac plus finally gave up the gh...
3    Robert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> a...
4    From article <C5owCB.n3p@world.std.com>, by to...
Name: news, dtype: object

In [7]:
df.head()

Unnamed: 0,news,target
0,I was wondering if anyone out there could enli...,7
1,A fair number of brave souls who upgraded thei...,4
2,"well folks, my mac plus finally gave up the gh...",4
3,Robert J.C. Kyanko (rob@rjck.UUCP) wrote:\n> a...,1
4,"From article <C5owCB.n3p@world.std.com>, by to...",14


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
words = [
    "In order to feed predictive or clustering models with the text data",
    "one first need to turn the text into vectors of numerical values suitable for statistical analysis",
    "Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given",
    "Remove accents and perform other character normalization during the preprocessing step",
    "is a fast method that only works on characters that have",
    "It is easy for a classifier to overfit on particular things that appear in the 20 Newsgroups data, such as newsgroup headers",
    "Many classifiers achieve very high F-scores, but their results would not generalize to other documents that aren’t from this window of time",
    "For example, let’s look at the results of a multinomial Naive Bayes classifier, which is fast to train and achieves a decent F-score"
]

In [18]:
words

['In order to feed predictive or clustering models with the text data',
 'one first need to turn the text into vectors of numerical values suitable for statistical analysis',
 'Instruction on what to do if a byte sequence is given to analyze that contains characters not of the given',
 'Remove accents and perform other character normalization during the preprocessing step',
 'is a fast method that only works on characters that have',
 'It is easy for a classifier to overfit on particular things that appear in the 20 Newsgroups data, such as newsgroup headers',
 'Many classifiers achieve very high F-scores, but their results would not generalize to other documents that aren’t from this window of time',
 'For example, let’s look at the results of a multinomial Naive Bayes classifier, which is fast to train and achieves a decent F-score']

In [19]:
tfidf = TfidfVectorizer()


In [28]:
tfidf_tran = tfidf.fit_transform(words)

In [21]:
tfidf.idf_

array([2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 ,
       2.5040774 , 2.09861229, 2.5040774 , 2.5040774 , 2.5040774 ,
       2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 ,
       2.09861229, 2.09861229, 2.5040774 , 2.5040774 , 2.5040774 ,
       2.09861229, 2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 ,
       2.5040774 , 2.5040774 , 2.09861229, 2.5040774 , 2.5040774 ,
       1.81093022, 2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 ,
       2.5040774 , 2.5040774 , 2.5040774 , 2.09861229, 2.5040774 ,
       2.5040774 , 1.58778666, 2.5040774 , 2.5040774 , 2.5040774 ,
       2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 ,
       2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 , 2.09861229,
       2.5040774 , 1.58778666, 1.81093022, 2.5040774 , 2.5040774 ,
       2.5040774 , 2.5040774 , 2.09861229, 2.5040774 , 2.5040774 ,
       2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 , 2.09861229,
       2.5040774 , 2.5040774 , 2.5040774 , 2.5040774 , 2.50407

In [22]:
tfidf.vocabulary_

{'in': 38,
 'order': 61,
 'to': 84,
 'feed': 28,
 'predictive': 66,
 'or': 60,
 'clustering': 18,
 'models': 47,
 'with': 93,
 'the': 79,
 'text': 77,
 'data': 20,
 'one': 58,
 'first': 29,
 'need': 50,
 'turn': 86,
 'into': 40,
 'vectors': 88,
 'of': 56,
 'numerical': 55,
 'values': 87,
 'suitable': 76,
 'for': 30,
 'statistical': 73,
 'analysis': 4,
 'instruction': 39,
 'on': 57,
 'what': 90,
 'do': 22,
 'if': 37,
 'byte': 13,
 'sequence': 72,
 'is': 41,
 'given': 33,
 'analyze': 5,
 'that': 78,
 'contains': 19,
 'characters': 15,
 'not': 54,
 'remove': 68,
 'accents': 1,
 'and': 6,
 'perform': 65,
 'other': 62,
 'character': 14,
 'normalization': 53,
 'during': 24,
 'preprocessing': 67,
 'step': 74,
 'fast': 27,
 'method': 46,
 'only': 59,
 'works': 94,
 'have': 34,
 'it': 42,
 'easy': 25,
 'classifier': 16,
 'overfit': 63,
 'particular': 64,
 'things': 81,
 'appear': 7,
 '20': 0,
 'newsgroups': 52,
 'such': 75,
 'as': 9,
 'newsgroup': 51,
 'headers': 35,
 'many': 45,
 'classifiers'

In [23]:
tfidf.get_feature_names()

['20',
 'accents',
 'achieve',
 'achieves',
 'analysis',
 'analyze',
 'and',
 'appear',
 'aren',
 'as',
 'at',
 'bayes',
 'but',
 'byte',
 'character',
 'characters',
 'classifier',
 'classifiers',
 'clustering',
 'contains',
 'data',
 'decent',
 'do',
 'documents',
 'during',
 'easy',
 'example',
 'fast',
 'feed',
 'first',
 'for',
 'from',
 'generalize',
 'given',
 'have',
 'headers',
 'high',
 'if',
 'in',
 'instruction',
 'into',
 'is',
 'it',
 'let',
 'look',
 'many',
 'method',
 'models',
 'multinomial',
 'naive',
 'need',
 'newsgroup',
 'newsgroups',
 'normalization',
 'not',
 'numerical',
 'of',
 'on',
 'one',
 'only',
 'or',
 'order',
 'other',
 'overfit',
 'particular',
 'perform',
 'predictive',
 'preprocessing',
 'remove',
 'results',
 'score',
 'scores',
 'sequence',
 'statistical',
 'step',
 'such',
 'suitable',
 'text',
 'that',
 'the',
 'their',
 'things',
 'this',
 'time',
 'to',
 'train',
 'turn',
 'values',
 'vectors',
 'very',
 'what',
 'which',
 'window',
 'with',


In [24]:
tfidf.vocabulary()

TypeError: 'NoneType' object is not callable

In [26]:
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [29]:
print(tfidf_tran)

  (0, 38)	0.27039635158906633
  (0, 61)	0.3226386292699499
  (0, 84)	0.16122599582824834
  (0, 28)	0.3226386292699499
  (0, 66)	0.3226386292699499
  (0, 60)	0.3226386292699499
  (0, 18)	0.3226386292699499
  (0, 47)	0.3226386292699499
  (0, 93)	0.3226386292699499
  (0, 79)	0.16122599582824834
  (0, 77)	0.27039635158906633
  (0, 20)	0.27039635158906633
  (1, 84)	0.13792349143640176
  (1, 79)	0.13792349143640176
  (1, 77)	0.23131510952214943
  (1, 58)	0.27600664516019446
  (1, 29)	0.27600664516019446
  (1, 50)	0.27600664516019446
  (1, 86)	0.27600664516019446
  (1, 40)	0.27600664516019446
  (1, 88)	0.27600664516019446
  (1, 56)	0.17501043345302084
  (1, 55)	0.27600664516019446
  (1, 87)	0.27600664516019446
  (1, 76)	0.27600664516019446
  :	:
  (6, 31)	0.22845015903937335
  (6, 82)	0.22845015903937335
  (6, 92)	0.22845015903937335
  (6, 83)	0.22845015903937335
  (7, 84)	0.12251624165002666
  (7, 79)	0.12251624165002666
  (7, 56)	0.15546025070060823
  (7, 30)	0.17730824400874548
  (7, 41)	0

In [32]:
print(tfidf.get_stop_words())

None


In [35]:
vect_new = TfidfVectorizer()

In [36]:
vect_new_fit = vect_new.fit_transform(df['news'])

In [38]:
vect_new_dense = vect_new_fit.todense()

In [39]:
new_df = pd.DataFrametaFrame(
    data=vect_new_dense
)

AttributeError: module 'pandas' has no attribute 'DataFrametaFrame'

In [40]:
new_df = pd.DataFrame(
    data=vect_new_dense
)

In [41]:
new_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,114741,114742,114743,114744,114745,114746,114747,114748,114749,114750
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [44]:
rfc = RandomForestClassifier(n_jobs=-1, verbose=2)

In [47]:
rfc.fit(new_df.iloc[:2000, :], df.iloc[:2000, -1])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 10building tree 2 of 10building tree 3 of 10building tree 4 of 10



building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.9s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=2,
                       warm_start=False)

In [None]:
rfc.fit(new_df, df['target'])