In [1]:
import pandas as pd
import nltk
import re

# Loading the dataset

In [2]:
# Read the tab-separated training split (no header row) and drop rows with missing values
df_train = pd.read_csv(
    './datasets/webkb-train-stemmed.txt',
    sep='\t',
    header=None,
    names=['category', 'text'],
).dropna()
df_train.head()

Unnamed: 0,category,text
0,student,brian comput scienc depart univers wisconsin d...
1,student,denni swanson web page mail pop uki offic hour...
2,faculty,russel impagliazzo depart comput scienc engin ...
3,student,dave phd student depart comput scienc univers ...
4,project,center lifelong learn design univers colorado ...


In [3]:
# Separate the labels from the text; text_train stays a one-column DataFrame
y_train = df_train['category']
text_train = df_train.drop(columns=['category'])

text_train, y_train

(                                                   text
 0     brian comput scienc depart univers wisconsin d...
 1     denni swanson web page mail pop uki offic hour...
 2     russel impagliazzo depart comput scienc engin ...
 3     dave phd student depart comput scienc univers ...
 4     center lifelong learn design univers colorado ...
 ...                                                 ...
 2798  faster harder kill laboratori experiment softw...
 2799  previou content steven faculti research guid p...
 2800  sandeep graduat student studi comput scienc cl...
 2801  web oper system uniqu mwf tai recent explos in...
 2802  rami melhem professor dept comput scienc phone...
 
 [2785 rows x 1 columns],
 0       student
 1       student
 2       faculty
 3       student
 4       project
          ...   
 2798    project
 2799    faculty
 2800    student
 2801     course
 2802    faculty
 Name: category, Length: 2785, dtype: object)

In [4]:
# Number of training documents per category
y_train.value_counts()

student    1085
faculty     745
course      620
project     335
Name: category, dtype: int64

In [5]:
# Load the test split: tab-separated, no header row, rows with missing values dropped
df_test = pd.read_csv(
    'datasets/webkb-test-stemmed.txt',
    sep='\t',
    header=None,
    names=['category', 'text'],
).dropna()
y_test = df_test['category']
text_test = df_test.drop(columns=['category'])

y_test.value_counts()

student    540
faculty    371
course     306
project    166
Name: category, dtype: int64

# Pre-processing the data

In [6]:
# Since dataset is already stemmed, only stop-word removal will be applied
def text_preprocess(articles: pd.DataFrame) -> list:
    """Clean every document in ``articles['text']`` and return the cleaned strings.

    Each document is reduced to ASCII letters (every other character becomes a
    space), lower-cased, tokenized with ``nltk.word_tokenize`` and stripped of
    English stop words; the surviving tokens are re-joined with single spaces.

    Parameters
    ----------
    articles : pd.DataFrame
        Frame with a ``'text'`` column of strings. Rows are processed in
        positional order, so the frame's index labels do not matter.

    Returns
    -------
    list
        One cleaned string per row of ``articles``, in row order.
    """
    # Build the stop-word collection once: the original re-read the NLTK list
    # for every single token and searched it linearly.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    corpus = []
    # Iterate over the column values rather than `articles['text'][i]`: the
    # latter is a label lookup and fails (or misaligns) unless the index was
    # reset to 0..n-1 beforehand, e.g. after dropna().
    for text in articles['text']:
        # Replace all characters apart from A-Z, a-z with ' '
        letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
        tokens = nltk.word_tokenize(letters_only)
        corpus.append(' '.join(tok for tok in tokens if tok not in stop_words))
    return corpus

In [7]:
# reset_index() returns a new frame with a fresh 0..n-1 index; the old labels are kept in an 'index' column
articles_train = text_train.reset_index()
articles_train

Unnamed: 0,index,text
0,0,brian comput scienc depart univers wisconsin d...
1,1,denni swanson web page mail pop uki offic hour...
2,2,russel impagliazzo depart comput scienc engin ...
3,3,dave phd student depart comput scienc univers ...
4,4,center lifelong learn design univers colorado ...
...,...,...
2780,2798,faster harder kill laboratori experiment softw...
2781,2799,previou content steven faculti research guid p...
2782,2800,sandeep graduat student studi comput scienc cl...
2783,2801,web oper system uniqu mwf tai recent explos in...


In [8]:
# reset_index() returns a new frame with a fresh 0..n-1 index; the old labels are kept in an 'index' column
articles_test = text_test.reset_index()
articles_test

Unnamed: 0,index,text
0,0,eric homepag eric wei tsinghua physic fudan genet
1,1,comput system perform evalu model new sept ass...
2,2,home page comput scienc grad student ucsd work...
3,3,toni web page toni face thing call toni studen...
4,4,ec advanc comput architectur credit parallel a...
...,...,...
1378,1391,scott pictur background scott phd student coll...
1379,1392,advanc oper system fall marvin offic comput sc...
1380,1393,human robot hand group head kenneth salisburi ...
1381,1394,databas manag system design implement inform p...


In [9]:
# Clean the training documents; X_train is a list of strings, one per row of articles_train
X_train = text_preprocess(articles_train)
# Show the first cleaned document as a sanity check
X_train[0]

'brian comput scienc depart univers wisconsin dayton street madison offic email wisc offic phone home phone advisor david wood tabl content interest schedul summer public interest profession comput architectur oper system compil high speed network distribut parallel system secur account high perform person bicycl walk hike camp travel home brew cook comput electron read schedul mondai wwt meet wednesdai meet david cow meet milwauke brian heidi wed madison comput architectur affili meet chicago base public journal articl foster perform massiv parallel comput spectral atmospher model atmospher ocean technolog byte drake foster design perform scalabl parallel commun climat model parallel comput decemb byte proceed paper foster algorithm comparison benchmark parallel spectral transform water model sixth workshop parallel process meteorolog ed world scientif singapor byte drake foster hack williamson adapt scalabl parallel comput proceed global chang symposium american meteorolog societi by

In [10]:
# Clean the test documents with the same preprocessing as the training set
X_test = text_preprocess(articles_test)
# Show the first cleaned document as a sanity check
X_test[0]

'eric homepag eric wei tsinghua physic fudan genet'

# TW-IDF Model

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV
from pprint import pprint

from gowpy.feature_extraction.gow import TwidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD

from sklearn.metrics import classification_report, accuracy_score, f1_score

In [12]:
from sklearn.metrics import matthews_corrcoef, make_scorer
# Wrap the Matthews correlation coefficient as a scorer object so it can be passed as `scoring=` to the searches
scorer_mcc = make_scorer(matthews_corrcoef)

### Hyperparameter tuning and cross-validation

In [13]:
# Two-step pipeline: TW-IDF vectorizer ('gow') feeding a support vector classifier ('svc')
pipeline = Pipeline([
    ('gow', TwidfVectorizer()),
    ('svc', SVC()),
])

# Hyperparameter search space; each key is '<pipeline step name>__<parameter name>'
parameters = {
    # TwidfVectorizer hyperparameters
    'gow__b': [0.0, 0.003],
    'gow__directed': [True, False],
    'gow__max_df': [0.8, 0.9, 1.0],
    'gow__min_df': [0, 5, 10],
    'gow__term_weighting': ['degree', 'pagerank'],
    'gow__window_size': [2, 3, 4, 6],
    # SVC hyperparameters
    'svc__C': [1, 10, 100],
    'svc__class_weight': [None, 'balanced'],
    'svc__kernel': ['linear'],
    # NOTE(review): `probability` presumably only enables predict_proba and should not change
    # predict()-based scores - confirm; if so, searching over it doubles the grid for no benefit
    'svc__probability': [True, False],
    'svc__shrinking': [True, False],
}

In [14]:
# Randomized successive-halving search over `parameters`, scored by MCC with 10-fold CV.
# random_state is fixed so the sampled candidates are the same on every
# Restart & Run All; 0 matches the seed used for TruncatedSVD later in this notebook.
random_search = HalvingRandomSearchCV(pipeline,
                           parameters,
                           cv=10,
                           scoring=scorer_mcc,
                           n_jobs=-1,
                           verbose=10,
                           min_resources='smallest',
                           factor=3,
                           random_state=0)

# Echo the configuration so the log below is self-describing
print("Performing random search...")
print("pipeline:", [name for name, _ in pipeline.steps])
pprint(parameters)

random_search.fit(X_train, y_train)

# Report the best cross-validated score and the winning value of every searched parameter
print("Best score: %0.3f" % random_search.best_score_)
print("Best parameters set:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing random search...
pipeline: ['gow', 'svc']
{'gow__b': [0.0, 0.003],
 'gow__directed': [True, False],
 'gow__max_df': [0.8, 0.9, 1.0],
 'gow__min_df': [0, 5, 10],
 'gow__term_weighting': ['degree', 'pagerank'],
 'gow__window_size': [2, 3, 4, 6, 8],
 'svc__C': [0.1, 1, 10, 100, 1000],
 'svc__class_weight': [None, 'balanced'],
 'svc__degree': [0, 1, 2, 3, 4, 5],
 'svc__gamma': ['scale', 'auto'],
 'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
 'svc__probability': [True, False],
 'svc__shrinking': [True, False]}
n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 80
max_resources_: 2785
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 34
n_resources: 80
Fitting 10 folds for each of 34 candidates, totalling 340 fits
----------
iter: 1
n_candidates: 12
n_resources: 240
Fitting 10 folds for each of 12 candidates, totalling 120 fits
----------
iter: 2
n_candidates: 4
n_resources: 720
Fitting 10 folds for each of 4 candidates

In [22]:
# Exhaustive successive-halving search over the same grid, scored by MCC with 3-fold CV
grid_search = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    scoring=scorer_mcc,
    cv=3,
    n_jobs=-1,
    verbose=10,
)

# Echo the configuration so the log below is self-describing
step_names = [step[0] for step in pipeline.steps]
print("Performing grid search...")
print("pipeline:", step_names)
pprint(parameters)

grid_search.fit(X_train, y_train)

# Report the best cross-validated score and the winning value of every searched parameter
best_parameters = grid_search.best_estimator_.get_params()
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
for key in sorted(parameters):
    print("\t%s: %r" % (key, best_parameters[key]))

Performing grid search...
pipeline: ['gow', 'svc']
{'gow__b': [0.0, 0.003],
 'gow__directed': [True, False],
 'gow__max_df': [0.8, 0.9, 1.0],
 'gow__min_df': [0, 5, 10],
 'gow__term_weighting': ['degree', 'pagerank'],
 'gow__window_size': [2, 3, 4, 6],
 'svc__C': [1, 10, 100],
 'svc__class_weight': [None, 'balanced'],
 'svc__kernel': ['linear'],
 'svc__probability': [True, False],
 'svc__shrinking': [True, False]}
n_iterations: 5
n_required_iterations: 9
n_possible_iterations: 5
min_resources_: 24
max_resources_: 2785
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 6912
n_resources: 24
Fitting 3 folds for each of 6912 candidates, totalling 20736 fits
----------
iter: 1
n_candidates: 2304
n_resources: 72
Fitting 3 folds for each of 2304 candidates, totalling 6912 fits
----------
iter: 2
n_candidates: 768
n_resources: 216
Fitting 3 folds for each of 768 candidates, totalling 2304 fits
----------
iter: 3
n_candidates: 256
n_resources: 648
Fitting 3 folds for each 

### Fitting the TW-IDF model and evaluating on test data

In [23]:
# Fit the vectorizer on its own to inspect the size of the resulting feature matrix
model = TwidfVectorizer(
    b=0.0,
    directed=False,
    max_df=1.0,
    min_df=0,
    term_weighting='pagerank',
    window_size=2,
)
X = model.fit_transform(X_train)
X.shape

(2785, 7255)

In [14]:
# Manually setting best parameters
# Final model: TW-IDF features -> 500-component truncated SVD -> linear SVC
pipeline_gow = Pipeline(steps=[
    (
        'gow',
        TwidfVectorizer(
            b=0.0,
            directed=False,
            max_df=1.0,
            min_df=0,
            term_weighting='pagerank',
            window_size=2,
        ),
    ),
    ('svd', TruncatedSVD(n_components=500, random_state=0)),
    (
        'svc',
        SVC(
            C=1,
            kernel='linear',
            class_weight='balanced',
            probability=False,
            shrinking=True,
        ),
    ),
])

pipeline_gow.fit(X_train, y_train)

Pipeline(steps=[('gow',
                 TwidfVectorizer(directed=False, min_df=0,
                                 term_weighting='pagerank',
                                 tokenizer=<function default_tokenizer at 0x0000023FA05358B0>,
                                 window_size=2)),
                ('svd', TruncatedSVD(n_components=500, random_state=0)),
                ('svc', SVC(C=1, class_weight='balanced', kernel='linear'))])

In [15]:
# Evaluation on test data
y_pred = pipeline_gow.predict(X_test)

# Compute the summary metrics up front, then print them after the per-class report
acc = accuracy_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')

print(classification_report(y_test, y_pred))
print('\nAccuracy: ', acc)
print('MCC: ', mcc)
print('f1-micro: ', f1_micro)

              precision    recall  f1-score   support

      course       0.96      0.98      0.97       306
     faculty       0.90      0.90      0.90       371
     project       0.85      0.93      0.89       166
     student       0.94      0.91      0.93       540

    accuracy                           0.92      1383
   macro avg       0.91      0.93      0.92      1383
weighted avg       0.93      0.92      0.92      1383


Accuracy:  0.9248011569052784
MCC:  0.8951736248951271
f1-micro:  0.9248011569052784
