Connect to drive, set directory

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files stored there can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd '/content/drive/MyDrive/Thesis_UU/push/3group'

Import libraries

In [None]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,f1_score
from sklearn import model_selection, svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

#### Sampling - so that papers which are in the train set are not in the test set

Splitting data into train and test set

In [None]:
# Load the full paper dataset; the "utf-8-sig" codec strips a leading
# byte-order mark if the CSV has one.
sample_papers = pd.read_csv(
    'Data (CSV)/papers_journal_balanced.csv',
    encoding="utf-8-sig",
)

In [None]:
# Draw a reproducible (fixed seed) random 75% of the rows as the training set.
sample_train = sample_papers.sample(random_state=1, frac=0.75)

In [None]:
# Number of rows in the training split.
sample_train.shape[0]

In [None]:
# Training inputs (the 'preprocessed' column, vectorized later) and the
# matching targets (the 'class' column), both as plain Python lists.
X_train = list(sample_train['preprocessed'].values)
y_train = list(sample_train['class'].values)

In [None]:
# The test set is every row whose ID does not occur in the training sample,
# so that no paper ID is shared between the two splits.
in_train = sample_papers['ID'].isin(sample_train['ID'])
sample_test = sample_papers[~in_train]

In [None]:
# Number of rows in the test split.
sample_test.shape[0]

In [None]:
# Test inputs (the 'preprocessed' column) and the matching targets
# (the 'class' column), both as plain Python lists.
X_test = list(sample_test['preprocessed'].values)
y_test = list(sample_test['class'].values)

In [None]:
# Display names for the three classes; passed as `target_names` to
# classification_report further down.
# NOTE(review): target_names are paired with the sorted label values found in
# the data - confirm this order matches how the 'class' column is encoded.
labels = ['error', 'misconduct', 'non_retracted']

TF-IDF - vectorizing the text

In [None]:
# Learn the vocabulary and IDF weights from the training texts only and encode
# them in the same step; the test texts are then encoded with that fitted
# vectorizer.
Tfidf_vect = TfidfVectorizer()
Train_X_Tfidf = Tfidf_vect.fit_transform(X_train)
Test_X_Tfidf = Tfidf_vect.transform(X_test)

Hyperparameter tuning

Code for hyperparameter tuning based on https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [None]:
# Estimator to tune and the candidate values for each hyperparameter.
# 'precomputed' is also a valid SVC kernel but is left out of the search.
model = svm.SVC()
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
c_values = [100, 10, 1.0, 0.1, 0.01]
gamma = ['scale', 'auto']
grid = {'kernel': kernel, 'C': c_values, 'gamma': gamma}

# Exhaustive search scored by weighted F1 under 10-fold stratified CV repeated
# 3 times. error_score=0 records a score of 0 for a combination whose fit
# raises, instead of aborting the whole search.
# NOTE(review): Train_X_Tfidf was vectorized on the complete training set
# before these folds are drawn, so every validation fold has already
# contributed to the vocabulary / IDF weights.
kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1_weighted',
    cv=kf,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(Train_X_Tfidf, y_train)

# Report the best combination, then the mean (std) score of every combination.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Fit SVM classifier

In [None]:
# Final classifier with fixed hyperparameters, trained on the TF-IDF encoded
# training set.
# NOTE(review): presumably these are the best values reported by the grid
# search - confirm against its printed output.
SVM = svm.SVC(kernel='rbf', C=100, gamma='scale')
SVM.fit(X=Train_X_Tfidf, y=y_train)

Predict classes for test set

In [None]:
# Classify the held-out test set and print the per-class metrics report.
y_pred = SVM.predict(Test_X_Tfidf)
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

Confusion matrix

In [None]:
# Confusion matrix for the test-set predictions: rows are the true classes,
# columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Top 30 indicative words

In [None]:
# Vocabulary terms, indexed by TF-IDF column. `get_feature_names` was removed
# in scikit-learn 1.2, so prefer `get_feature_names_out` and fall back to the
# old accessor only on versions that lack it.
if hasattr(Tfidf_vect, 'get_feature_names_out'):
    feature_array = np.asarray(Tfidf_vect.get_feature_names_out())
else:
    feature_array = np.asarray(Tfidf_vect.get_feature_names())

# Rank terms by their mean TF-IDF weight over all documents in Test_X_Tfidf.
# (Arg-sorting the flattened dense matrix sorts each row separately, so its
# first entries only reflect the last document rather than the whole set.)
# The mean is taken on the sparse matrix directly, avoiding a dense copy.
mean_tfidf = np.asarray(Test_X_Tfidf.mean(axis=0)).ravel()
Tfidf_vect_sorting = np.argsort(mean_tfidf)[::-1]

# Keep the n highest-ranked terms.
n = 30
top_n = feature_array[Tfidf_vect_sorting][:n]

In [None]:
# Display the selected terms as the cell output.
top_n

## External validation

In [None]:
# Load the separate dataset used for external validation; the "utf-8-sig"
# codec strips a leading byte-order mark if the CSV has one.
test_dataset = pd.read_csv(
    'Data (CSV)/journal_test_data_set.csv',
    encoding="utf-8-sig",
)

Redefining test data

In [None]:
# Rebind X_test / y_test to the external validation data (this replaces the
# values taken from the internal test split earlier in the notebook).
X_test = list(test_dataset['preprocessed'].values)
y_test = list(test_dataset['class'].values)

TF-IDF - vectorizing the text

In [None]:
# Encode the external texts with the vectorizer already fitted on the training
# data (no refit), replacing the earlier Test_X_Tfidf.
Test_X_Tfidf = Tfidf_vect.transform(raw_documents=X_test)

Predict classes for external validation dataset

In [None]:
# Classify the external validation set with the already-trained SVM and print
# the per-class metrics report.
y_pred = SVM.predict(Test_X_Tfidf)
report = classification_report(y_test, y_pred, target_names=labels)
print(report)

Confusion matrix

In [None]:
# Confusion matrix for the external-validation predictions: rows are the true
# classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)