# IMPORT PRE-PROCESSED BBC NEWS DATASET

In [1]:
import pandas as pd

In [2]:
pd.set_option('display.max_colwidth', 150)
dataset = pd.read_csv("clean_bbc_news_dataset", index_col=0)
dataset.head()

Unnamed: 0,category,clean_text
0,tech,"['tv', 'futur', 'hand', 'viewer', 'home', 'theatr', 'system', 'plasma', 'highdefinit', 'tv', 'digit', 'video', 'record', 'move', 'live', 'room', '..."
1,business,"['worldcom', 'boss', 'left', 'book', 'alon', 'former', 'worldcom', 'boss', 'berni', 'ebber', 'accus', 'overse', '11bn', '£58bn', 'fraud', 'never',..."
2,sport,"['tiger', 'wari', 'farrel', 'gambl', 'leicest', 'say', 'rush', 'make', 'bid', 'andi', 'farrel', 'great', 'britain', 'rugbi', 'leagu', 'captain', '..."
3,sport,"['yead', 'face', 'newcastl', 'fa', 'cup', 'premiership', 'side', 'newcastl', 'unit', 'face', 'trip', 'ryman', 'premier', 'leagu', 'leader', 'yead'..."
4,entertainment,"['ocean', 'twelv', 'raid', 'box', 'offic', 'ocean', 'twelv', 'crime', 'caper', 'sequel', 'star', 'georg', 'clooney', 'brad', 'pitt', 'julia', 'rob..."


# spliting of dataset into training and testing 

In [3]:
from sklearn import model_selection 

X = dataset['clean_text']
y = dataset['category']

train_X, test_X, train_y, test_y = model_selection.train_test_split(X, y, test_size=.20)
print(f'{len(train_X)} Training Sample  AND  {len(test_y)} testing Sample')

1780 Training Sample  AND  445 testing Sample


# VECTORIZATION USING COUNT VECTORIZER

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000)
#  learn vocabulary and tranform to vector representation
X_f = cv.fit(train_X)
# print (X.get_feature_names())
X_train_cv = cv.fit_transform(train_X)
print(X_train_cv.shape)

# WE ONLY TRANFROM THUS CONVERT TEST DATA TO VECTOR WITHOUT LEARNING IS VOCABULARY
X_test_cv = cv.transform(test_X)

# print(cv_X.vocabulary_)
print('No of Feature learned...' , len(X_f.get_feature_names()))
# print(X_train_cv.toarray())


(1780, 5000)
No of Feature learned... 5000




In [5]:
vector_rep = pd.DataFrame(X_train_cv.toarray(), columns=X_f.get_feature_names())
vector_rep.head()

Unnamed: 0,000,000m,01,02,03,05,07,10,100,100m,...,yuan,yugansk,yuganskneftega,yuko,yushchenko,zealand,zero,zombi,zone,zurich
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,3,12,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# VECTORIZATION USING TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
tf = TfidfVectorizer(max_features=5000)
#  learn vocabulary and tranform to vector representation
X = tf.fit(train_X)
# print (X.get_feature_names())
print(X.vocabulary_)
X_train_tf = tf.fit_transform(train_X)
print(X_train_tf.shape)

# WE ONLY TRANFROM THUS CONVERT TEST DATA TO VECTOR WITHOUT LEARNING IS VOCABULARY
X_test_tf = tf.transform(test_X)

print(len(X.vocabulary_))
print('No of Feature learned...' , len(X.get_feature_names()))
# X_t    rain_tf[2]


{'moya': 3004, 'fight': 1790, 'back': 535, 'indian': 2332, 'titl': 4539, 'carlo': 840, 'becam': 589, 'first': 1816, 'man': 2792, 'success': 4345, 'defend': 1301, 'open': 3191, 'beat': 584, 'finalist': 1802, '36': 128, '64': 175, '76': 190, '75': 188, 'spaniard': 4182, 'donat': 1442, '28': 109, '000': 0, 'prize': 3504, 'money': 2969, 'relief': 3709, 'effort': 1545, 'victim': 4779, 'asian': 469, 'tsunami': 4637, 'final': 1801, 'close': 992, 'throughout': 4519, 'thai': 4488, 'second': 3960, 'seed': 3971, '2003': 76, 'winner': 4918, 'set': 4007, 'took': 4556, 'bounc': 707, '52': 159, 'decid': 1288, 'forc': 1858, 'win': 4912, 'tiebreak': 4528, 'confirm': 1089, 'afterward': 297, 'tournament': 4573, 'hope': 2236, 'make': 2784, 'differ': 1379, 'live': 2697, 'contribut': 1135, 'follow': 1851, 'pledg': 3397, 'four': 1885, 'player': 3392, 'went': 4884, 'ahead': 311, '26': 105, 'decemb': 1285, 'disast': 1397, 'far': 1741, 'claim': 964, 'least': 2624, '150': 29, 'said': 3888, 'would': 4956, 'amount

(1780, 5000)
5000
No of Feature learned... 5000


<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 95 stored elements in Compressed Sparse Row format>

In [8]:
vector_rep = pd.DataFrame(X_train_tf.toarray(), columns=X.get_feature_names())
vector_rep.head()

Unnamed: 0,000,000m,01,02,03,05,07,10,100,100m,...,yuan,yugansk,yuganskneftega,yuko,yushchenko,zealand,zero,zombi,zone,zurich
0,0.197834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.027505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.055031,0.170534,0.585169,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.060937,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# VECTORIZATION USING WORD EMBEDDING 

In [9]:
train_X

25      ['moya', 'fight', 'back', 'indian', 'titl', 'carlo', 'moya', 'becam', 'first', 'man', 'success', 'defend', 'chennai', 'open', 'titl', 'beat', 'fou...
463     ['dozen', 'held', 'id', 'fraud', 'site', 'twentyeight', 'peopl', 'includ', 'briton', 'arrest', 'global', 'oper', 'websit', 'allegedli', 'involv', ...
1731    ['crossrail', 'link', 'get', 'goahead', '£10bn', 'crossrail', 'transport', 'plan', 'back', 'busi', 'group', 'get', 'goahead', 'month', 'accord', '...
76      ['yuko', 'sue', 'four', 'firm', '20bn', 'russian', 'oil', 'firm', 'yuko', 'su', 'four', 'compani', 'role', 'last', 'year', 'forc', 'state', 'aucti...
2022    ['us', 'data', 'spark', 'inflat', 'worri', 'wholesal', 'price', 'us', 'rose', 'fastest', 'rate', 'six', 'year', 'januari', 'accord', 'govern', 'da...
                                                                                ...                                                                          
924     ['india', 'open', 'sky', 'competit', 'india'