In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import joblib

In [3]:
import spacy

In [4]:
 df = pd.read_csv('learn-ai-bbc/BBC News Train.csv')

In [5]:
df.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [46]:
pd.set_option('display.max_colwidth', None)

In [47]:
df['Text'][df.ArticleId==1582]

5    howard  truanted to play snooker  conservative leader michael howard has admitted he used to play truant to spend time with his school friends at a snooker hall.  mr howard said his time at jack s snooker hall in llanelli in the 1950s had not done him  any lasting damage . but he told the times educational supplement that truancy was  very bad  and said  firm action  was needed. mr howard also called for a return to o-levels and more classroom discipline.  mr howard eventually left llanelli grammar school - and the snooker hall - to go to cambridge university. he said:  i don t think it s done me any lasting damage. nor has it made me a snooker world champion.  there might have been some occasions when we left early of an afternoon.   i m just being honest. i think truancy is a very bad thing and that firm action should be taken to deal with it.  another player who has failed to win snooker s world championship - jimmy  the whirlwind   white - has previously admitted missing lesso

In [6]:
df.Category.value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [7]:
df['Cat']=df['Category'].apply(lambda x: 1 if x=='sport' else 2 if x=='business' else 3 if x=='politics' else 4 if x=='entertainment' else 5 )

In [8]:
df.head()

Unnamed: 0,ArticleId,Text,Category,Cat
0,1833,worldcom ex-boss launches defence lawyers defe...,business,2
1,154,german business confidence slides german busin...,business,2
2,1101,bbc poll indicates economic gloom citizens in ...,business,2
3,1976,lifestyle governs mobile choice faster bett...,tech,5
4,917,enron bosses in $168m payout eighteen former e...,business,2


In [9]:
df.drop(['Category'],axis=1)

Unnamed: 0,ArticleId,Text,Cat
0,1833,worldcom ex-boss launches defence lawyers defe...,2
1,154,german business confidence slides german busin...,2
2,1101,bbc poll indicates economic gloom citizens in ...,2
3,1976,lifestyle governs mobile choice faster bett...,5
4,917,enron bosses in $168m payout eighteen former e...,2
...,...,...,...
1485,857,double eviction from big brother model caprice...,4
1486,325,dj double act revamp chart show dj duo jk and ...,4
1487,1590,weak dollar hits reuters revenues at media gro...,2
1488,1587,apple ipod family expands market apple has exp...,5


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(df.Text, df.Cat, test_size=0.3)

In [12]:
X_train.shape

(1043,)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
v = CountVectorizer()

cv=v.fit_transform(X_train.values)

In [41]:
joblib.dump(v, 'pre_fitted_vectorizer.pkl')

['pre_fitted_vectorizer.pkl']

In [15]:
X_train

387     moya fights back for indian title carlos moya ...
1192    parker s saxophone heads auction a saxophone b...
1291    text message record smashed uk mobile owners c...
1289    eminem secret gig venue revealed rapper eminem...
491     what now for kelly holmes  last april  kelly h...
                              ...                        
781     comic morris returns with sitcom comedian chri...
1477    web logs aid disaster recovery some of the mos...
942     microsoft sets sights on spyware windows users...
1137    us peer-to-peer pirates convicted the first co...
207     soderling wins tense milan final fifth seed ro...
Name: Text, Length: 1043, dtype: object

In [16]:
cv

<1043x21312 sparse matrix of type '<class 'numpy.int64'>'
	with 210923 stored elements in Compressed Sparse Row format>

In [17]:
cv.toarray()

array([[0, 5, 0, ..., 0, 0, 0],
       [0, 3, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 2, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
cv.shape

(1043, 21312)

In [19]:
v.get_feature_names_out()[1300:1329]

array(['advises', 'advising', 'advisor', 'advisory', 'advocate',
       'advocated', 'advocates', 'adware', 'ae', 'aer', 'aeroplane',
       'aerospace', 'aesthetics', 'affable', 'affair', 'affairs',
       'affect', 'affected', 'affecting', 'affection', 'affectionately',
       'affects', 'affiliate', 'affiliates', 'afflicting', 'afford',
       'affordable', 'afforded', 'afghanistan'], dtype=object)

In [20]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(cv, Y_train)

MultinomialNB()

In [39]:
joblib.dump(model,'model1.pkl')

['model1.pkl']

In [21]:
X_test_cv = v.transform(X_test)

In [22]:
from sklearn.metrics import classification_report

In [23]:
y_pred = model.predict(X_test_cv)

In [24]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           1       1.00      0.99      1.00       105
           2       0.96      0.96      0.96       101
           3       0.96      0.96      0.96        80
           4       0.99      0.99      0.99        81
           5       0.98      0.99      0.98        80

    accuracy                           0.98       447
   macro avg       0.98      0.98      0.98       447
weighted avg       0.98      0.98      0.98       447



In [25]:
ch = ["I love to watch and play "]
ch_cv = v.transform(ch)

In [26]:
model.predict(ch_cv)

array([4], dtype=int64)

In [27]:
test = pd.read_csv('learn-ai-bbc/BBC News Test.csv')

In [28]:
test_cv = v.transform(test.Text)

In [29]:
sol = model.predict(test_cv)

In [30]:
sol

array([1, 5, 1, 2, 1, 1, 3, 3, 4, 2, 2, 5, 3, 5, 4, 1, 3, 5, 4, 4, 2, 3,
       1, 2, 3, 1, 2, 1, 1, 2, 3, 5, 2, 2, 1, 1, 1, 2, 4, 4, 5, 3, 4, 5,
       1, 5, 4, 2, 3, 2, 3, 2, 2, 2, 5, 3, 5, 5, 1, 5, 1, 4, 5, 3, 2, 4,
       1, 5, 1, 1, 5, 1, 2, 3, 5, 1, 5, 5, 5, 4, 3, 1, 4, 4, 2, 4, 2, 4,
       2, 5, 3, 3, 1, 5, 1, 1, 1, 1, 1, 1, 3, 1, 3, 4, 2, 1, 3, 1, 3, 4,
       1, 2, 4, 1, 3, 1, 3, 1, 3, 2, 4, 2, 4, 4, 5, 1, 2, 4, 2, 4, 2, 3,
       3, 5, 2, 2, 3, 5, 4, 1, 2, 5, 1, 4, 3, 1, 1, 4, 4, 5, 2, 5, 3, 4,
       1, 1, 1, 1, 4, 5, 2, 5, 2, 5, 2, 5, 5, 5, 5, 3, 2, 3, 2, 2, 4, 3,
       5, 2, 2, 5, 1, 3, 1, 3, 5, 5, 3, 2, 3, 5, 3, 2, 4, 1, 5, 5, 2, 5,
       3, 2, 1, 3, 2, 4, 2, 2, 1, 5, 2, 1, 4, 4, 1, 4, 1, 5, 3, 4, 1, 4,
       1, 4, 3, 2, 5, 4, 2, 3, 2, 5, 2, 1, 3, 3, 3, 3, 1, 2, 5, 3, 1, 3,
       2, 1, 5, 2, 3, 2, 3, 2, 2, 1, 5, 3, 4, 5, 4, 5, 1, 1, 5, 1, 1, 1,
       4, 1, 3, 5, 5, 1, 2, 1, 2, 1, 4, 2, 2, 4, 3, 2, 1, 1, 5, 1, 1, 4,
       2, 1, 5, 3, 4, 2, 2, 3, 1, 4, 3, 2, 1, 1, 5,

In [31]:
len(sol)

735

In [32]:
len(test)

735

In [33]:
submit = pd.read_csv('learn-ai-bbc/BBC News Sample Solution.csv')

In [34]:
submit['Category'] = sol

In [35]:
submit

Unnamed: 0,ArticleId,Category
0,1018,1
1,1319,5
2,1138,1
3,459,2
4,1020,1
...,...,...
730,1923,2
731,373,4
732,1704,3
733,206,2


In [36]:
submit['Category']=submit['Category'].apply(lambda x: 'sport' if x==1 else 'business' if x==2 else 'politics' if x==3 else 'entertainment' if x==4 else 'tech')

In [37]:
submit

Unnamed: 0,ArticleId,Category
0,1018,sport
1,1319,tech
2,1138,sport
3,459,business
4,1020,sport
...,...,...
730,1923,business
731,373,entertainment
732,1704,politics
733,206,business


In [38]:
submit.to_csv('submit.csv',index=False)