In [26]:
import pandas as pd
import nltk
import re
# Fetch every NLTK resource the notebook relies on: 'punkt' and 'wordnet' as
# before, plus 'stopwords', which text_preprocess reads through
# nltk.corpus.stopwords and which raises LookupError on a machine where it was
# never downloaded.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Abhinav
[nltk_data]     Gunti\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Abhinav
[nltk_data]     Gunti\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Loading the dataset

In [2]:
# Tab-separated file without a header row: column 0 is the label, column 1 the
# article text. Rows with a missing value in either column are discarded.
df_train = pd.read_csv(
    './datasets/r8-train-all-terms.txt',
    sep='\t',
    header=None,
    names=['category', 'text'],
).dropna()
df_train.head()

Unnamed: 0,category,text
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [3]:
# Features stay a one-column DataFrame ('text'); labels are the 'category' Series.
y_train = df_train['category']
text_train = df_train.drop(columns=['category'])

(text_train, y_train)

(                                                   text
 0     champion products ch approves stock split cham...
 1     computer terminal systems cpml completes sale ...
 2     cobanco inc cbco year net shr cts vs dlrs net ...
 3     am international inc am nd qtr jan oper shr lo...
 4     brown forman inc bfd th qtr net shr one dlr vs...
 ...                                                 ...
 5480  kelly oil and gas partners kly year dec shr ct...
 5481  japan seeks to strengthen paris currency accor...
 5482  tcw convertible securities cvt sets dividend t...
 5483  south korean won fixed at month high the bank ...
 5484  australian unions launch new south wales strik...
 
 [5485 rows x 1 columns],
 0           earn
 1            acq
 2           earn
 3           earn
 4           earn
           ...   
 5480        earn
 5481    money-fx
 5482        earn
 5483    money-fx
 5484        ship
 Name: category, Length: 5485, dtype: object)

In [4]:
# Number of training documents per category (shows how imbalanced the classes are).
y_train.value_counts()

earn        2840
acq         1596
crude        253
trade        251
money-fx     206
interest     190
ship         108
grain         41
Name: category, dtype: int64

In [5]:
# Test split: same headerless, tab-separated two-column layout (label, text).
df_test = pd.read_csv(
    'datasets/r8-test-all-terms.txt',
    sep='\t',
    header=None,
    names=['category', 'text'],
)
y_test = df_test['category']
text_test = df_test.drop(columns=['category'])

y_test.value_counts()

earn        1083
acq          696
crude        121
money-fx      87
interest      81
trade         75
ship          36
grain         10
Name: category, dtype: int64

# Pre-processing the data

In [6]:
# Tokenizing, stop-words removal and lemmatization
from nltk.stem import WordNetLemmatizer

def text_preprocess(articles: pd.DataFrame) -> list:
    """Clean, tokenize and lemmatize the ``text`` column of ``articles``.

    For every row, each character that is not an ASCII letter is replaced by a
    space, the text is lower-cased and tokenized with ``nltk.word_tokenize``,
    English stop-words are removed, and the remaining tokens are lemmatized
    with ``WordNetLemmatizer`` and re-joined with single spaces.

    Parameters
    ----------
    articles : pd.DataFrame
        Frame containing a ``text`` column of strings. Rows are read in
        positional order, so the frame's index labels do not matter.

    Returns
    -------
    list
        One cleaned string per row of ``articles``, in row order.
    """
    lemmatizer = WordNetLemmatizer()
    # Load the stop-word list once per call and keep it as a set: fetching it
    # inside the token loop would re-read the corpus for every single word.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    corpus = []
    # Iterate over the values directly rather than articles['text'][i], which is
    # a label lookup and fails when the index is not exactly 0..n-1.
    for text in articles['text']:
        # Replace all characters apart from A-Z / a-z with a space.
        letters_only = re.sub('[^a-zA-Z]', ' ', text)
        tokens = nltk.word_tokenize(letters_only.lower())
        corpus.append(' '.join(
            lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
        ))
    return corpus

In [7]:
# reset_index() returns a fresh frame with a 0..n-1 index and keeps the old
# labels in an 'index' column; text_train itself is left untouched.
articles_train = text_train.reset_index()
articles_train

Unnamed: 0,index,text
0,0,champion products ch approves stock split cham...
1,1,computer terminal systems cpml completes sale ...
2,2,cobanco inc cbco year net shr cts vs dlrs net ...
3,3,am international inc am nd qtr jan oper shr lo...
4,4,brown forman inc bfd th qtr net shr one dlr vs...
...,...,...
5480,5480,kelly oil and gas partners kly year dec shr ct...
5481,5481,japan seeks to strengthen paris currency accor...
5482,5482,tcw convertible securities cvt sets dividend t...
5483,5483,south korean won fixed at month high the bank ...


In [8]:
# reset_index() returns a fresh frame with a 0..n-1 index and keeps the old
# labels in an 'index' column; text_test itself is left untouched.
articles_test = text_test.reset_index()
articles_test

Unnamed: 0,index,text
0,0,asian exporters fear damage from u s japan rif...
1,1,china daily says vermin eat pct grain stocks a...
2,2,australian foreign ship ban ends but nsw ports...
3,3,sumitomo bank aims at quick recovery from merg...
4,4,amatil proposes two for five bonus share issue...
...,...,...
2184,2184,balladur insists on maintenance of louvre acco...
2185,2185,philippine trade gap widens in january august ...
2186,2186,iran soviet union to swap crude refined produc...
2187,2187,n z s chase corp makes offer for entregrowth c...


In [14]:
# Cleaned training corpus: one whitespace-joined string of lemmas per article.
X_train = text_preprocess(articles_train)
# Show only the first few documents; echoing the whole list floods the notebook.
X_train[:3]

['champion product ch approves stock split champion product inc said board director approved two one stock split common share shareholder record april company also said board voted recommend shareholder annual meeting april increase authorized capital stock five mln mln share reuter',
 'computer terminal system cpml completes sale computer terminal system inc said completed sale share common stock warrant acquire additional one mln share sedio n v lugano switzerland dlrs company said warrant exercisable five year purchase price dlrs per share computer terminal said sedio also right buy additional share increase total holding pct computer terminal outstanding common stock certain circumstance involving change control company company said condition occur warrant would exercisable price equal pct common stock market price time exceed dlrs per share computer terminal also said sold technolgy right dot matrix impact technology including future improvement woodco inc houston tex dlrs said wo

In [15]:
# Cleaned test corpus, produced by the same pre-processing as the training data.
X_test = text_preprocess(articles_test)
# Show only the first few documents; echoing the whole list floods the notebook.
X_test[:3]

 'china daily say vermin eat pct grain stock survey province seven city showed vermin consume seven pct china grain stock china daily said also said year mln tonne pct china fruit output left rot mln tonne pct vegetable paper blamed waste inadequate storage bad preservation method said government launched national programme reduce waste calling improved technology storage preservation greater production additive paper gave detail reuter',
 'australian foreign ship ban end nsw port hit tug crew new south wale nsw victoria western australia yesterday lifted ban foreign flag ship carrying container nsw port still disrupted separate dispute shipping source said ban imposed week ago pay claim prevented movement port nearly vessel said pay dispute went hearing arbitration commission today meanwhile disruption began today cargo handling port sydney newcastle port kembla said industrial action nsw port part week action called nsw trade labour council protest change state worker compensation la

# TW-IDF Model

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.model_selection import GridSearchCV
from pprint import pprint

from gowpy.feature_extraction.gow import TwidfVectorizer
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, accuracy_score, f1_score

In [30]:
from sklearn.metrics import matthews_corrcoef, make_scorer
# Wrap the Matthews correlation coefficient as a scorer object so it can be
# passed as the `scoring` argument of GridSearchCV further down.
scorer_mcc = make_scorer(matthews_corrcoef)

### Hyperparameter tuning and cross-validation score

In [31]:
# pipeline = Pipeline([
#     ('gow', TwidfVectorizer()),
#     ('nb', MultinomialNB()),
# ])

# parameters = {
#     'gow__directed' : [True, False],
#     'gow__window_size' : [2,3,4,6,8,16],
#     'gow__b' : [0.0, 0.003],
#     'gow__term_weighting' : ['degree', 'pagerank'],
#     'gow__min_df' : [0, 5, 10],
#     'gow__max_df' : [0.8, 0.9, 1.0],
# #
#     'nb__fit_prior' : [True, False],
# }

# Hyperparameter tuning and cross-validation score using SVM as classifier

In [39]:
from sklearn.svm import SVC
from sklearn import model_selection

# Two-step pipeline: graph-of-words TW-IDF features ('gow') fed into an SVC with
# default settings ('svc'). The step names are the prefixes used in the grid keys.
pipeline = Pipeline([
    ('gow', TwidfVectorizer()),
    ('svc', SVC()),
])

# Search space for the grid search. Only vectorizer options are active
# (2 * 4 * 2 * 3 * 3 = 144 combinations); the SVC options are commented out, so
# the classifier keeps its defaults.
# NOTE(review): the recorded output of the grid-search cell lists
# 'gow__term_weighting': ['degree', 'pagerank'] and 288 candidates, which does
# not match this grid as written — the output looks stale; re-run to confirm.
parameters = {
    'gow__directed' : [True, False],
    'gow__window_size' : [4, 6, 8, 16],
    'gow__b' : [0.0, 0.003],
#     'gow__term_weighting' : ['degree'],
    'gow__min_df' : [0, 5, 10],
    'gow__max_df' : [0.8, 0.9, 1.0],
#
#     'svc__C':[0.1, 1, 10],
#     'svc__kernel':['linear', 'rbf', 'poly'],
#     'svc__degree':[0, 1, 2, 3],
#     'svc__gamma':['scale', 'auto'],
    
}

In [40]:
# 3-fold cross-validated search over every combination in `parameters`,
# ranked by Matthews correlation coefficient.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring=scorer_mcc,
    cv=3,
    n_jobs=-1,
    verbose=10,
)

step_names = [step_name for step_name, _ in pipeline.steps]
print("Performing grid search...")
print("pipeline:", step_names)
pprint(parameters)

grid_search.fit(X_train, y_train)

# Report the winning score and, for each tuned key, the value the best estimator uses.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['gow', 'svc']
{'gow__b': [0.0, 0.003],
 'gow__directed': [True, False],
 'gow__max_df': [0.8, 0.9, 1.0],
 'gow__min_df': [0, 5, 10],
 'gow__term_weighting': ['degree', 'pagerank'],
 'gow__window_size': [4, 6, 8, 16]}
Fitting 3 folds for each of 288 candidates, totalling 864 fits
Best score: 0.927
Best parameters set:
	gow__b: 0.0
	gow__directed: False
	gow__max_df: 0.8
	gow__min_df: 10
	gow__term_weighting: 'degree'
	gow__window_size: 4


### Fitting the TW-IDF model and evaluating on test data

In [28]:
# Final model: TW-IDF features + Multinomial Naive Bayes, with hyperparameters
# set by hand rather than read from grid_search.best_params_.
# NOTE(review): these values differ from the SVC grid-search result recorded in
# this notebook (max_df=0.8, min_df=10, window_size=4); presumably they come
# from an earlier MultinomialNB search whose grid is now commented out — confirm.
pipeline_gow = Pipeline([
    ('gow', TwidfVectorizer(b=0.0, directed=False, max_df=1.0, min_df=5, term_weighting='degree', window_size=2)),
    ('nb', MultinomialNB(fit_prior=False)),
])

# Fit vectorizer and classifier on the full pre-processed training corpus.
pipeline_gow.fit(X_train, y_train)

Pipeline(steps=[('gow',
                 TwidfVectorizer(directed=False, min_df=5,
                                 tokenizer=<function default_tokenizer at 0x0000022ABB7421F0>,
                                 window_size=2)),
                ('nb', MultinomialNB(fit_prior=False))])

In [29]:
# Score the fitted pipeline on the held-out test split.
y_pred = pipeline_gow.predict(X_test)

# Per-class precision / recall / F1 first, then the headline summary metrics.
print(classification_report(y_test, y_pred))
for label, value in (
    ('\nAccuracy: ', accuracy_score(y_test, y_pred)),
    ('MCC: ', matthews_corrcoef(y_test, y_pred)),
    ('f1-micro: ', f1_score(y_test, y_pred, average='micro')),
):
    print(label, value)

              precision    recall  f1-score   support

         acq       0.95      0.99      0.97       696
       crude       0.93      0.93      0.93       121
        earn       0.99      0.97      0.98      1083
       grain       1.00      0.50      0.67        10
    interest       0.91      0.78      0.84        81
    money-fx       0.85      0.89      0.87        87
        ship       1.00      0.61      0.76        36
       trade       0.74      0.97      0.84        75

    accuracy                           0.96      2189
   macro avg       0.92      0.83      0.86      2189
weighted avg       0.96      0.96      0.96      2189


Accuracy:  0.9566011877569667
MCC:  0.9333531447969023
f1-micro:  0.9566011877569667
