In [1]:
import pandas as pd

In [2]:
def get_data(path, data_dir='./dataset'):
    """Load one CSV file from the dataset directory into a DataFrame.

    Parameters
    ----------
    path : str
        File name without the ``.csv`` extension (e.g. ``'BBC News Train'``).
    data_dir : str or os.PathLike, default ``'./dataset'``
        Directory that contains the CSV files. The default keeps the
        original behaviour of reading from ``./dataset/``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``<data_dir>/<path>.csv``.
    """
    # Local import keeps this cell self-contained; pathlib joins the parts
    # with the right separator instead of relying on string concatenation.
    from pathlib import Path

    return pd.read_csv(Path(data_dir) / f'{path}.csv')

# Read the test split first, then the training split, via get_data.
test_df, train_df = (
    get_data(dataset_name)
    for dataset_name in ('BBC News Test', 'BBC News Train')
)

In [3]:
def diagnose_data(df):
    """Print a quick structural overview of ``df`` and return its first rows.

    The printed report contains, in order: the shape, the sorted column
    names, the count of columns per dtype, the number of unique values per
    column, the number of nulls per column, and the output of ``df.info()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        ``df.head()``, so a notebook cell ending in this call shows a preview.
    """

    def report_per_column(statistic):
        # One "<column>: <value>" line per column, in the frame's own order.
        for name in df.columns:
            print(f'{name}: {statistic(df[name])}')

    print(f'shape:\n{df.shape}\n')
    print(f'columns:\n{sorted(df.columns)}\n')
    print(f'n dtypes:\n{df.dtypes.value_counts()}\n')

    print('n uniques:')
    report_per_column(lambda column: column.nunique())

    print('\nnull values:')
    report_per_column(lambda column: column.isnull().sum())

    print('\ninfo:')
    df.info()

    preview = df.head()
    return preview

In [4]:
# Summarise the training split; the returned head() is displayed as the cell result.
diagnose_data(train_df)

shape:
(1490, 3)

columns:
['ArticleId', 'Category', 'Text']

n dtypes:
object    2
int64     1
dtype: int64

n uniques:
ArticleId: 1490
Text: 1440
Category: 5

null values:
ArticleId: 0
Text: 0
Category: 0

info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [5]:
# Summarise the test split; the returned head() is displayed as the cell result.
diagnose_data(test_df)

shape:
(735, 2)

columns:
['ArticleId', 'Text']

n dtypes:
int64     1
object    1
dtype: int64

n uniques:
ArticleId: 735
Text: 722

null values:
ArticleId: 0
Text: 0

info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 735 entries, 0 to 734
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  735 non-null    int64 
 1   Text       735 non-null    object
dtypes: int64(1), object(1)
memory usage: 11.6+ KB


Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...


In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [7]:
# Show the NLTK English stop-word list that `stemming` filters tokens against.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [8]:
port_stem = PorterStemmer()

# Built once and kept as a set. The previous version rebuilt the whole
# stop-word list for every single token and searched it linearly.
_STOP_WORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Clean one document: keep letters only, drop stop words, stem the rest.

    Parameters
    ----------
    content : str
        Raw text of a single article.

    Returns
    -------
    str
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    # Every character that is not an ASCII letter (either case) becomes a
    # space, then the text is split on whitespace.
    tokens = re.sub(r'[^a-z]', ' ', content, flags=re.I).split()
    return ' '.join(
        port_stem.stem(token)
        for token in tokens
        # The stop-word list is lower-case, so compare case-insensitively;
        # otherwise capitalised stop words such as "The" are kept.
        if token.lower() not in _STOP_WORDS
    )

In [9]:
# Replace the raw training text with its cleaned, stemmed form.
# NOTE: the column is overwritten in place, so re-running this cell feeds
# already-stemmed text through `stemming` again.
train_df['Text'] = train_df['Text'].apply(stemming)

In [10]:
# Preview the preprocessed training text.
train_df['Text']

0       worldcom ex boss launch defenc lawyer defend f...
1       german busi confid slide german busi confid fe...
2       bbc poll indic econom gloom citizen major nati...
3       lifestyl govern mobil choic faster better funk...
4       enron boss payout eighteen former enron direct...
                              ...                        
1485    doubl evict big brother model capric holbi cit...
1486    dj doubl act revamp chart show dj duo jk joel ...
1487    weak dollar hit reuter revenu media group reut...
1488    appl ipod famili expand market appl expand ipo...
1489    santi worm make unwelcom visit thousand websit...
Name: Text, Length: 1490, dtype: object

In [11]:
# Features are the (stemmed) article text; targets are the category names.
X_train, y_train = train_df['Text'], train_df['Category']

In [12]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the category names, then map them to integer codes.
# `le` is kept so later cells can encode/decode with the same mapping.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)

In [13]:
# Show the integer-encoded training labels.
y_train

array([0, 0, 0, ..., 0, 4, 4])

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the training text and vectorise it in one step.
# `vectorizer` is reused later to transform the test text.
# NOTE: `X_train` is rebound here from the text Series to the TF-IDF matrix.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)

In [15]:
# Print the training TF-IDF matrix as "(row, column)  weight" entries.
print(X_train)

  (0, 15107)	0.03697623741705603
  (0, 3525)	0.030432635628283116
  (0, 2854)	0.03641750094875029
  (0, 15180)	0.06455322016132493
  (0, 2038)	0.06611751284175665
  (0, 259)	0.03452213347614023
  (0, 15489)	0.02370469329763093
  (0, 7996)	0.019317667186145658
  (0, 8883)	0.11993311690263511
  (0, 7818)	0.034983477267053205
  (0, 11149)	0.03641750094875029
  (0, 1101)	0.04686588897877218
  (0, 4487)	0.03921675168989014
  (0, 3060)	0.04978523361437186
  (0, 15888)	0.014385655149312605
  (0, 12580)	0.048818043480629916
  (0, 7355)	0.04613199223774085
  (0, 4878)	0.05629628238448262
  (0, 3147)	0.01953846519092076
  (0, 2846)	0.040901138264781686
  (0, 888)	0.056164238403013315
  (0, 14775)	0.06796767838820315
  (0, 622)	0.04062309852585978
  (0, 8995)	0.05085441270352502
  (0, 6695)	0.027191896034190928
  :	:
  (1489, 13561)	0.03641921708266678
  (1489, 12799)	0.024844224156539065
  (1489, 6416)	0.02605069054730865
  (1489, 1394)	0.03122416035395891
  (1489, 118)	0.035102432083059854
  (1

In [16]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier (default settings) on the TF-IDF
# features and encoded labels. The bare `fit` call is the cell's last
# expression, so the fitted estimator's repr is displayed.
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression()

In [17]:
# Apply the same cleaning/stemming to the test text as was used for training.
# NOTE: the column is overwritten in place, so re-running this cell feeds
# already-stemmed text through `stemming` again.
test_df['Text'] = test_df['Text'].apply(stemming)

In [18]:
# Preview the preprocessed test text.
test_df['Text'] 

0      qpr keeper day head preston queen park ranger ...
1      softwar watch work softwar monitor everi keyst...
2      arci injuri add ireland woe gordon arci rule i...
3      india relianc famili feud heat ongo public spa...
4      boro suffer morrison injuri blow middlesbrough...
                             ...                        
730    eu probe alitalia state aid european commiss o...
731    u play grammi award show irish rock band u pla...
732    sport bet rule spotlight group mp peer call ti...
733    alfa romeo get gm engin fiat stop make six cyl...
734    citizenship event tout citizenship ceremoni co...
Name: Text, Length: 735, dtype: object

In [19]:
# Vectorise the test text with the vectorizer fitted on the training text
# (transform only, no refitting).
X_test = vectorizer.transform(test_df['Text'])

In [20]:
# Print the test TF-IDF matrix as "(row, column)  weight" entries.
print(X_test)

  (0, 15883)	0.10549159739907664
  (0, 14367)	0.09966621261885593
  (0, 13747)	0.0744754267042108
  (0, 13271)	0.10141936890990971
  (0, 12885)	0.17800337504577274
  (0, 12857)	0.12554744523618794
  (0, 12616)	0.04486999031421658
  (0, 12510)	0.049415784978612314
  (0, 12344)	0.03920435856588304
  (0, 12319)	0.07077381406772251
  (0, 12225)	0.022423941780162417
  (0, 12113)	0.27783968178129415
  (0, 11972)	0.08118779491681792
  (0, 11951)	0.07425002728808766
  (0, 11521)	0.0525622962053647
  (0, 11516)	0.1931523607990937
  (0, 11412)	0.22132285628521478
  (0, 11283)	0.0988460279722021
  (0, 11258)	0.27783968178129415
  (0, 11026)	0.24518239831091965
  (0, 10569)	0.10549159739907664
  (0, 10369)	0.08548809033272044
  (0, 10130)	0.071151697359118
  (0, 9629)	0.046927561920405855
  (0, 9326)	0.13287973833758918
  :	:
  (734, 2089)	0.025202575892139104
  (734, 1843)	0.05983147425195198
  (734, 1841)	0.09387057671867739
  (734, 1682)	0.03644275058148937
  (734, 1488)	0.1661538742823068
  (7

In [21]:
# Predict encoded category labels for the test articles with the
# logistic-regression model.
y_pred = model.predict(X_test)

In [22]:
# Load the sample-solution file and use its 'Category' column as reference labels.
# NOTE(review): once encoded, these labels repeat the fixed pattern
# 3, 4, 0, 1, 2, ... — this looks like a submission-format placeholder rather
# than ground truth, so metrics computed against it are probably meaningless.
# Confirm before trusting any score that uses `y_test`.
sub_df = get_data('BBC News Sample Solution')
y_test = sub_df['Category']

In [23]:
# Encode with the encoder fitted on the training labels so the integer codes
# line up with those the models were trained on.
y_test = le.transform(y_test)

In [24]:
# Show the encoded reference labels.
y_test

array([3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
       0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1,
       2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3,
       4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
       1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2,
       3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
       0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1,
       2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3,
       4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
       1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2,
       3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4,
       0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1,
       2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3,
       4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3,

In [25]:
# Show the logistic-regression predictions (encoded labels).
y_pred

array([3, 4, 3, 0, 3, 3, 2, 2, 1, 0, 0, 4, 2, 4, 1, 3, 2, 4, 1, 1, 0, 2,
       3, 0, 2, 3, 0, 3, 3, 0, 2, 4, 0, 0, 3, 3, 3, 0, 1, 1, 4, 2, 1, 4,
       3, 4, 1, 0, 2, 0, 2, 0, 0, 0, 4, 2, 4, 1, 3, 4, 3, 1, 4, 2, 1, 1,
       3, 4, 3, 3, 4, 3, 0, 2, 4, 3, 4, 4, 4, 1, 2, 3, 1, 1, 0, 1, 0, 1,
       0, 4, 0, 2, 3, 4, 3, 3, 3, 3, 3, 3, 2, 3, 0, 1, 0, 3, 2, 3, 2, 1,
       3, 0, 1, 3, 2, 3, 2, 3, 2, 0, 1, 0, 1, 1, 4, 3, 0, 1, 0, 1, 0, 2,
       2, 4, 0, 0, 2, 4, 1, 3, 0, 4, 3, 1, 2, 3, 3, 1, 1, 4, 0, 4, 2, 1,
       3, 3, 3, 3, 1, 4, 0, 4, 0, 4, 0, 4, 1, 4, 4, 2, 0, 2, 0, 0, 1, 2,
       4, 0, 0, 4, 3, 2, 3, 2, 4, 4, 2, 0, 2, 1, 2, 0, 1, 3, 4, 4, 0, 4,
       2, 0, 3, 2, 0, 1, 0, 0, 3, 4, 0, 3, 1, 1, 3, 1, 3, 4, 2, 1, 3, 1,
       3, 1, 2, 0, 4, 1, 0, 2, 0, 4, 0, 3, 2, 2, 0, 2, 3, 0, 1, 2, 3, 2,
       0, 3, 4, 0, 2, 0, 2, 0, 0, 3, 4, 2, 1, 4, 1, 4, 3, 3, 4, 3, 3, 3,
       1, 3, 2, 4, 0, 3, 0, 3, 0, 3, 1, 0, 0, 1, 2, 0, 3, 3, 4, 3, 3, 1,
       0, 3, 4, 2, 1, 0, 0, 2, 3, 1, 2, 0, 3, 3, 4,

In [26]:
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline

# NOTE(review): `y_test` comes from 'BBC News Sample Solution'. Its labels
# cycle through the five classes in a fixed order with exactly 147 rows each,
# so it appears to be a placeholder file, not ground truth. The report below
# is kept for continuity but its chance-level scores say nothing about the model.
print(classification_report(y_test, y_pred))

# Meaningful estimate: 5-fold cross-validated predictions on the labelled
# training data. The vectorizer sits inside the pipeline so TF-IDF statistics
# are learned from each training fold only (no leakage into the held-out fold).
cv_pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
y_train_cv_pred = cross_val_predict(cv_pipeline, train_df['Text'], y_train, cv=5)
print(classification_report(y_train, y_train_cv_pred, target_names=le.classes_))

              precision    recall  f1-score   support

           0       0.21      0.24      0.23       147
           1       0.20      0.16      0.18       147
           2       0.16      0.16      0.16       147
           3       0.20      0.23      0.22       147
           4       0.18      0.16      0.17       147

    accuracy                           0.19       735
   macro avg       0.19      0.19      0.19       735
weighted avg       0.19      0.19      0.19       735



In [33]:
from sklearn.linear_model import PassiveAggressiveClassifier

# Train a passive-aggressive classifier on the same TF-IDF features.
# The estimator shuffles the training data between epochs, so a fixed
# random_state is needed for the fitted model (and the submission built
# from it) to be reproducible across runs.
PAC = PassiveAggressiveClassifier(max_iter=100, random_state=42)
PAC.fit(X_train, y_train)

PassiveAggressiveClassifier(max_iter=100)

In [34]:
# Predict encoded category labels for the test articles with the
# passive-aggressive model.
y_pred_pac = PAC.predict(X_test)

# NOTE(review): `y_test` is derived from the sample-solution file, whose labels
# look like a repeating placeholder pattern rather than ground truth; treat
# this report as uninformative until that is confirmed.
print(classification_report(y_test, y_pred_pac))

              precision    recall  f1-score   support

           0       0.21      0.24      0.22       147
           1       0.21      0.16      0.18       147
           2       0.17      0.16      0.17       147
           3       0.20      0.23      0.22       147
           4       0.17      0.16      0.16       147

    accuracy                           0.19       735
   macro avg       0.19      0.19      0.19       735
weighted avg       0.19      0.19      0.19       735



In [42]:
# Build the submission: one row per test article, indexed by ArticleId, with
# the passive-aggressive prediction decoded back to its category name.
get_id = test_df['ArticleId']
predicted_category = pd.Series(
    le.inverse_transform(y_pred_pac),
    name='Category',
)
final_df = pd.concat(
    [get_id, predicted_category],
    axis='columns',
).set_index('ArticleId')

# The index (ArticleId) is written as the first CSV column.
final_df.to_csv('./dataset/my_submission.csv')