# Importing the libraries

In [1]:
import pandas as pd
import nltk
import re

# Loading the training dataset

In [2]:
# The file is tab-separated with no header row: <category>\t<text>.
# Incomplete rows are discarded as part of the same expression.
df_train = pd.read_csv(
    './datasets/r8-train-all-terms.txt',
    sep='\t',
    header=None,
    names=['category', 'text'],
).dropna()
df_train.head()

Unnamed: 0,category,text
0,earn,champion products ch approves stock split cham...
1,acq,computer terminal systems cpml completes sale ...
2,earn,cobanco inc cbco year net shr cts vs dlrs net ...
3,earn,am international inc am nd qtr jan oper shr lo...
4,earn,brown forman inc bfd th qtr net shr one dlr vs...


In [3]:
# Features: everything except the label column (stays a DataFrame).
text_train = df_train.drop(columns=['category'])
# Labels: the category column as a Series.
y_train = df_train.loc[:, 'category']

text_train, y_train

(                                                   text
 0     champion products ch approves stock split cham...
 1     computer terminal systems cpml completes sale ...
 2     cobanco inc cbco year net shr cts vs dlrs net ...
 3     am international inc am nd qtr jan oper shr lo...
 4     brown forman inc bfd th qtr net shr one dlr vs...
 ...                                                 ...
 5480  kelly oil and gas partners kly year dec shr ct...
 5481  japan seeks to strengthen paris currency accor...
 5482  tcw convertible securities cvt sets dividend t...
 5483  south korean won fixed at month high the bank ...
 5484  australian unions launch new south wales strik...
 
 [5485 rows x 1 columns],
 0           earn
 1            acq
 2           earn
 3           earn
 4           earn
           ...   
 5480        earn
 5481    money-fx
 5482        earn
 5483    money-fx
 5484        ship
 Name: category, Length: 5485, dtype: object)

In [4]:
# Number of training documents per category (shows the class imbalance).
y_train.value_counts()

earn        2840
acq         1596
crude        253
trade        251
money-fx     206
interest     190
ship         108
grain         41
Name: category, dtype: int64

# Loading the testing dataset

In [6]:
# Load the test split (tab-separated, no header row: <category>\t<text>).
df_test = pd.read_csv('datasets/r8-test-all-terms.txt', header=None, sep='\t', names=['category', 'text'])
# Drop incomplete rows BEFORE deriving features and labels, so that
# text_test and y_test always describe exactly the same rows and no
# missing text value reaches the preprocessing step.
df_test = df_test.dropna()
text_test = df_test.drop(['category'], axis=1)
y_test = df_test['category']

y_test.value_counts()

earn        1083
acq          696
crude        121
money-fx      87
interest      81
trade         75
ship          36
grain         10
Name: category, dtype: int64

# Pre-processing

In [8]:
# Tokenizing, stop-words removal and lemmatization
from nltk.stem import WordNetLemmatizer

def text_preprocess(articles: pd.DataFrame) -> list:
    """Clean every document in ``articles['text']``.

    Each document is processed as follows:
      1. every character outside a-z / A-Z is replaced by a space,
      2. the text is lower-cased and tokenized with ``nltk.word_tokenize``,
      3. English stop words are removed,
      4. the remaining tokens are lemmatized and re-joined with single spaces.

    Parameters
    ----------
    articles : pd.DataFrame
        Frame with a ``'text'`` column of strings. Rows are read in their
        stored order, so the frame's index does not need to be 0..n-1.

    Returns
    -------
    list
        One cleaned string per row of ``articles``, in the same order.

    Raises
    ------
    TypeError
        If a value in the ``'text'`` column is not a string (e.g. NaN).
    """
    lemmatizer = WordNetLemmatizer()
    # Load the stop-word list once and keep it as a set: the original reloaded
    # the list and scanned it linearly for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    corpus = []
    for raw_text in articles['text']:
        # Replace all characters apart from A-Z, a-z with ' '.
        letters_only = re.sub('[^a-zA-Z]', ' ', raw_text).lower()
        tokens = nltk.word_tokenize(letters_only)
        kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        corpus.append(' '.join(kept))
    return corpus

In [9]:
# reset_index() returns a new frame: rows are renumbered 0..n-1 and the
# previous index is kept as an 'index' column; text_train is left untouched.
articles_train = text_train.reset_index()
articles_train

Unnamed: 0,index,text
0,0,champion products ch approves stock split cham...
1,1,computer terminal systems cpml completes sale ...
2,2,cobanco inc cbco year net shr cts vs dlrs net ...
3,3,am international inc am nd qtr jan oper shr lo...
4,4,brown forman inc bfd th qtr net shr one dlr vs...
...,...,...
5480,5480,kelly oil and gas partners kly year dec shr ct...
5481,5481,japan seeks to strengthen paris currency accor...
5482,5482,tcw convertible securities cvt sets dividend t...
5483,5483,south korean won fixed at month high the bank ...


In [10]:
# reset_index() returns a new frame: rows are renumbered 0..n-1 and the
# previous index is kept as an 'index' column; text_test is left untouched.
articles_test = text_test.reset_index()
articles_test

Unnamed: 0,index,text
0,0,asian exporters fear damage from u s japan rif...
1,1,china daily says vermin eat pct grain stocks a...
2,2,australian foreign ship ban ends but nsw ports...
3,3,sumitomo bank aims at quick recovery from merg...
4,4,amatil proposes two for five bonus share issue...
...,...,...
2184,2184,balladur insists on maintenance of louvre acco...
2185,2185,philippine trade gap widens in january august ...
2186,2186,iran soviet union to swap crude refined produc...
2187,2187,n z s chase corp makes offer for entregrowth c...


In [11]:
# Cleaned training documents: a list with one space-joined string per article.
X_train = text_preprocess(articles_train)
# Show the first cleaned document as a sanity check.
X_train[0]

'champion product ch approves stock split champion product inc said board director approved two one stock split common share shareholder record april company also said board voted recommend shareholder annual meeting april increase authorized capital stock five mln mln share reuter'

In [12]:
# Cleaned test documents, produced with the same function as the training text.
X_test = text_preprocess(articles_test)
# Show the first cleaned document as a sanity check.
X_test[0]



# TW-IDF Model

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from pprint import pprint

from gowpy.feature_extraction.gow import TwidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import classification_report, accuracy_score, f1_score

In [16]:
from sklearn.metrics import matthews_corrcoef, make_scorer
# Wrap the Matthews correlation coefficient as a scorer object so it can be
# passed as `scoring=` to the hyper-parameter searches.
scorer_mcc = make_scorer(matthews_corrcoef)

# Hyperparameter tuning and cross-validation score

In [21]:
# Base pipeline tuned by the searches: TW-IDF graph-of-words features -> SVM.
# It has to be defined here because both search cells read `pipeline`;
# leaving it commented out raises NameError on a fresh kernel.
pipeline = Pipeline([
    ('gow', TwidfVectorizer()),
    ('svc', SVC()),
])

# Search space. Keys follow scikit-learn's `<step name>__<parameter>` convention,
# so 'gow__*' is routed to the vectorizer and 'svc__*' to the classifier.
parameters = {
    'gow__directed' : [True, False],
    'gow__window_size' : [2, 4, 8, 16],
    'gow__b' : [0.0, 0.003],
    'gow__term_weighting' : ['degree', 'pagerank'],
    'gow__min_df' : [0, 5, 10],
    'gow__max_df' : [0.8, 0.9, 1.0],
    'svc__C':[1, 10, 100],
    'svc__kernel':['linear', 'rbf'],
    'svc__shrinking' : [True, False],
    'svc__probability' : [True, False],
    'svc__class_weight' : ['balanced'],
}

# HalvingRandomSearchCV

In [61]:
# Successive-halving random search over `parameters`, scored with MCC.
# random_state fixes both the sampling of candidates and the per-iteration
# subsampling of the training data, so a re-run evaluates the same candidates.
random_search = HalvingRandomSearchCV(pipeline,
                           parameters,
                           cv=10,
                           scoring=scorer_mcc,
                           n_jobs=-1,
                           verbose=10,
                           min_resources='smallest',
                           factor=3,
                           random_state=0)

print("Performing random search...")
print("pipeline:", [name for name, _ in pipeline.steps])
pprint(parameters)

random_search.fit(X_train, y_train)

# Report the winning configuration, restricted to the parameters searched over.
print("Best score: %0.3f" % random_search.best_score_)
print("Best parameters set:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing random search...
pipeline: ['gow', 'svc']
{'gow__b': [0.0, 0.003],
 'gow__directed': [True, False],
 'gow__max_df': [0.8, 0.9, 1.0],
 'gow__min_df': [0, 5, 10],
 'gow__term_weighting': ['degree', 'pagerank'],
 'gow__window_size': [2, 3, 4, 6, 8],
 'svc__C': [0.1, 1, 10, 100, 1000],
 'svc__class_weight': [None, 'balanced'],
 'svc__degree': [0, 1, 2, 3, 4, 5],
 'svc__gamma': ['scale', 'auto'],
 'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
 'svc__probability': [True, False],
 'svc__shrinking': [True, False]}
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 160
max_resources_: 5485
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 34
n_resources: 160
Fitting 10 folds for each of 34 candidates, totalling 340 fits
----------
iter: 1
n_candidates: 12
n_resources: 480
Fitting 10 folds for each of 12 candidates, totalling 120 fits
----------
iter: 2
n_candidates: 4
n_resources: 1440
Fitting 10 folds for each of 4 candida

# HalvingGridSearchCV

In [71]:
# Successive-halving exhaustive search over `parameters`, scored with MCC.
# Early iterations train on random subsamples of the data; random_state pins
# that subsampling so the elimination of candidates is repeatable.
grid_search = HalvingGridSearchCV(pipeline,
                           parameters,
                           cv=3,
                           scoring=scorer_mcc,
                           n_jobs=-1,
                           verbose=10,
                           random_state=0)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
pprint(parameters)

grid_search.fit(X_train, y_train)

# Report the winning configuration, restricted to the parameters searched over.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['gow', 'svc']
{'gow__b': [0.0, 0.003],
 'gow__directed': [True, False],
 'gow__max_df': [0.8, 0.9, 1.0],
 'gow__min_df': [0, 5, 10],
 'gow__term_weighting': ['degree', 'pagerank'],
 'gow__window_size': [2, 3, 4, 6],
 'svc__C': [1, 10, 100],
 'svc__class_weight': ['balanced'],
 'svc__kernel': ['linear', 'rbf'],
 'svc__probability': [True, False],
 'svc__shrinking': [True, False]}
n_iterations: 5
n_required_iterations: 9
n_possible_iterations: 5
min_resources_: 48
max_resources_: 5485
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 6912
n_resources: 48
Fitting 3 folds for each of 6912 candidates, totalling 20736 fits
----------
iter: 1
n_candidates: 2304
n_resources: 144
Fitting 3 folds for each of 2304 candidates, totalling 6912 fits
----------
iter: 2
n_candidates: 768
n_resources: 432
Fitting 3 folds for each of 768 candidates, totalling 2304 fits
----------
iter: 3
n_candidates: 256
n_resources: 1296
Fitting 3 folds for ea

# Fitting the TW-IDF model and evaluating on test data

In [14]:
# Stand-alone vectorizer with the selected settings, fitted on the training
# text to inspect the size of the resulting feature matrix.
model = TwidfVectorizer(
    b=0.003,
    directed=False,
    max_df=1.0,
    min_df=5,
    term_weighting='degree',
    window_size=4,
)
X = model.fit_transform(X_train)
X.shape

(5485, 4699)

# Pipeline with best parameters

In [22]:
# Manually setting best parameters
# Final model: TW-IDF features -> truncated SVD -> SVM, with hyper-parameters
# typed in by hand rather than read from a fitted search object.
pipeline_gow = Pipeline([
    # Graph-of-words TW-IDF vectorizer.
    ('gow', TwidfVectorizer(b=0.003, directed=False, max_df=1.0, min_df=5, term_weighting='degree', window_size=4)),
    # Project the TW-IDF features onto 1000 components; seed fixed for repeatability.
    ('svd', TruncatedSVD(n_components=1000, random_state=0)),
    # NOTE(review): probability=True is set, but only .predict() is called in the
    # visible cells — confirm probability estimates are actually needed.
    ('svc', SVC(C=10, gamma='scale', kernel='linear', class_weight='balanced', probability=True, shrinking=False)),
])

# Train the whole pipeline on the cleaned training documents.
pipeline_gow.fit(X_train, y_train)

Pipeline(steps=[('gow',
                 TwidfVectorizer(b=0.003, directed=False, min_df=5,
                                 tokenizer=<function default_tokenizer at 0x0000020DF2F844C0>)),
                ('svd', TruncatedSVD(n_components=1000, random_state=0)),
                ('svc',
                 SVC(C=10, class_weight='balanced', kernel='linear',
                     probability=True, shrinking=False))])

# Results and Accuracy

In [19]:
# Evaluation on test data
# Predict a category for every cleaned test document.
y_pred = pipeline_gow.predict(X_test)

# Per-class precision / recall / F1 followed by overall summary metrics.
print(classification_report(y_test, y_pred))
print('\nAccuracy: ', accuracy_score(y_test, y_pred))
print('MCC: ', matthews_corrcoef(y_test, y_pred))
# For single-label multi-class predictions micro-averaged F1 equals accuracy,
# so this line is expected to print the same number as 'Accuracy' above.
print('f1-micro: ', f1_score(y_test, y_pred, average='micro'))

              precision    recall  f1-score   support

         acq       0.97      0.98      0.98       696
       crude       0.95      0.98      0.96       121
        earn       0.99      0.99      0.99      1083
       grain       1.00      0.90      0.95        10
    interest       0.92      0.86      0.89        81
    money-fx       0.88      0.85      0.87        87
        ship       0.91      0.83      0.87        36
       trade       0.95      0.99      0.97        75

    accuracy                           0.98      2189
   macro avg       0.95      0.92      0.93      2189
weighted avg       0.98      0.98      0.98      2189


Accuracy:  0.9757880310644129
MCC:  0.9625331611743982
f1-micro:  0.9757880310644129


# Testing model with actual articles

In [25]:
from nltk.stem import WordNetLemmatizer

def textpreprocess(articles):
    """Clean a sequence of raw article strings.

    Each article is processed as follows:
      1. every character outside a-z / A-Z is replaced by a space,
      2. the text is lower-cased and tokenized with ``nltk.word_tokenize``,
      3. English stop words are removed,
      4. the remaining tokens are lemmatized and re-joined with single spaces.

    Parameters
    ----------
    articles : iterable of str
        Raw article texts, e.g. a list of strings.

    Returns
    -------
    list
        One cleaned string per input article, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    # Load the stop-word list once and keep it as a set: the original reloaded
    # the list and scanned it linearly for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    corpus = []
    for article in articles:
        # Replace all characters apart from A-Z, a-z with ' '.
        letters_only = re.sub('[^a-zA-Z]', ' ', article).lower()
        tokens = nltk.word_tokenize(letters_only)
        kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
        corpus.append(' '.join(kept))
    return corpus

In [29]:
text=["Virat Kohli has not been among big runs for a while now. After a struggling Indian Premier League, where he played only a couple of good innings, Kohli was rested from the India’s immediate series against South Africa and Ireland, with the management hoping that the player would return fresh from his break. With the expectation that a break would do the former Indian captain some good, the team management opted for young players of IPL fame to play for India. While the youngsters did great and made their mark in the Indian team, Kohli’s return on the other hand did not turn out to be a fruitful one.","After many delays, SpaceX's Dragon spacecraft finally lifted off to begin its journey to the International Space Station. The uncrewed flight, known as CRS-25 will carry with it a slew of scientific experiments which include studies into the immune system aging and recovery, mapping the composition of Earths dust and its effect on the climate, how communities of microorganisms in soil are affected by microgravity, and several others. Taking to Twitter, the National Aeronautics and Space Administration, NASA, shared a video of the CR-25 launch. Like dust in the wind, liftoff is confirmed for the SpaceX CRS-25 Dragon spacecraft resupply mission to the International space station."]
text1=["asian exporter fear damage u japan rift mounting trade friction u japan raised fear among many asia exporting nation row could inflict far reaching economic damage businessmen official said told reuter correspondent asian capital u move japan might boost protectionist sentiment u lead curb american import product exporter said conflict would hurt long run short term tokyo loss might gain u said impose mln dlrs tariff import japanese electronics good april retaliation japan alleged failure stick pact sell semiconductor world market cost unofficial japanese estimate put impact tariff billion dlrs spokesman major electronics firm said would virtually halt export product hit new tax able business said spokesman leading japanese electronics firm matsushita electric industrial co ltd mc tariff remain place length time beyond month mean complete erosion export good subject tariff u said tom murtha stock analyst tokyo office broker james capel co taiwan businessmen official also worried aware seriousness u threat japan serf warning u said senior taiwanese trade official asked named taiwan trade trade surplus billion dlrs last year pct u surplus helped swell taiwan foreign exchange reserve billion dlrs among world largest must quickly open market remove trade barrier cut import tariff allow import u product want defuse problem possible u retaliation said paul sheen chairman textile exporter taiwan safe group senior official south korea trade promotion association said trade dispute u japan might also lead pressure south korea whose chief export similar japan last year south korea trade surplus billion dlrs u billion dlrs malaysia trade officer businessmen said tough curb japan might allow hard hit producer semiconductor third country expand sale u hong kong newspaper alleged japan selling cost semiconductor electronics manufacturer share view businessmen said short term commercial advantage would outweighed u pressure block import short term view said lawrence mill 
director general federation hong kong industry whole purpose prevent import one day extended source much serious hong kong disadvantage"]
# Clean the two raw articles in `text` and display the result.
pre_processed_text=textpreprocess(text)
pre_processed_text
# NOTE(review): pre_processed_text is never passed to pipeline_gow.predict in the visible cells.

['virat kohli among big run struggling indian premier league played couple good inning kohli rested india immediate series south africa ireland management hoping player would return fresh break expectation break would former indian captain good team management opted young player ipl fame play india youngster great made mark indian team kohli return hand turn fruitful one',
 'many delay spacex dragon spacecraft finally lifted begin journey international space station uncrewed flight known cr carry slew scientific experiment include study immune system aging recovery mapping composition earth dust effect climate community microorganism soil affected microgravity several others taking twitter national aeronautics space administration nasa shared video cr launch like dust wind liftoff confirmed spacex cr dragon spacecraft resupply mission international space station']

In [30]:
# text1 is passed to the pipeline as-is, without calling textpreprocess;
# it appears to be already cleaned (lower-case, no stop words) — TODO confirm its origin.
label=pipeline_gow.predict(text1)
label

array(['trade'], dtype=object)