Connect to drive, set directory

In [None]:
# Mount Google Drive at /content/drive (Colab-only: relies on google.colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change into the project folder so that the relative 'Data (CSV)/...' paths
# passed to pd.read_csv in this notebook resolve.
cd '/content/drive/MyDrive/Thesis_UU/push/3group'

/content/drive/MyDrive/Thesis_UU


Import libraries

In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

#### Sampling — papers that are in the training set must not appear in the test set

Splitting data into train and test set

In [None]:
# Read the balanced journal-papers CSV into a DataFrame.
sample_papers = pd.read_csv(
    'Data (CSV)/papers_journal_balanced.csv',
)

In [None]:
# Draw 75% of the rows as the training sample; the fixed seed makes the draw
# repeatable.
sample_train = sample_papers.sample(random_state=1, frac=0.75)

In [None]:
# Number of rows in the training sample.
sample_train.shape[0]

1557

In [None]:
# Training inputs come from the 'preprocessed' column, targets from 'class'.
X_train = list(sample_train['preprocessed'].values)
y_train = list(sample_train['class'].values)

In [None]:
# Keep only rows whose ID does not occur anywhere in the training sample, so
# that no ID is shared between the two sets.
sample_test = sample_papers.loc[~sample_papers['ID'].isin(sample_train['ID'])]

In [None]:
# Number of rows in the held-out test set.
sample_test.shape[0]

196

In [None]:
# Test inputs come from the 'preprocessed' column, targets from 'class'.
X_test = list(sample_test['preprocessed'].values)
y_test = list(sample_test['class'].values)

tf-idf - vectorizing the text

In [None]:
# Fit the vectorizer on the training texts only, then apply the same fitted
# vectorizer to the test texts so both matrices share one feature space.
tfidf = TfidfVectorizer() 

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

Hyperparameter tuning

Code for hyperparameter tuning based on https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [None]:
# Candidate estimator and the hyperparameter values to try.
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}

# Exhaustive search scored by weighted F1 over 10-fold stratified CV repeated
# 3 times; a combination whose fit fails is scored 0 instead of aborting.
# NOTE(review): the tf-idf matrix is fitted before the CV split and folds are
# not grouped by paper ID, so CV scores may be optimistic — confirm.
kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1_weighted',
    cv=kf,
    n_jobs=-1,
    error_score=0,
)
grid_result = grid_search.fit(X_train_tfidf, y_train)

# Report the winner first, then mean (std) score for every combination.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.948641 using {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.948641 (0.018409) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.948427 (0.018665) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.948200 (0.018412) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.945207 (0.020040) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.945207 (0.020040) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.943905 (0.019605) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.918701 (0.021828) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.918701 (0.021828) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.909394 (0.022470) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.795946 (0.038145) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.795946 (0.038145) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.754322 (0.039643) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.507189 (0.039715) wit

Fit LR classifier

In [None]:
# Final classifier with hard-coded hyperparameters (these match the best
# combination printed by the grid search output in this notebook).
lr = LogisticRegression(C=100, penalty="l2", solver='newton-cg')

# Train on the tf-idf features of the training sample.
lr.fit(X=X_train_tfidf, y=y_train)

LogisticRegression(C=100, solver='newton-cg')

Predict classes for test set

In [None]:
# Display names passed as `target_names` to classification_report.
labels = [
    'error',
    'misconduct',
    'non_retracted',
]

In [None]:
# Predict a class for every document of the held-out test split.
y_pred = lr.predict(X_test_tfidf)

# Fail loudly on labels the model never saw in training: pinning `labels`
# below would otherwise leave them out of the report without any warning.
assert set(y_test) <= set(lr.classes_), "y_test contains labels unseen in training"

# Pin the report rows to the classifier's classes. Without `labels`,
# `target_names` is matched positionally against whichever labels happen to
# occur in y_test/y_pred, so a missing class raises or shifts the names.
# Assumes the display names in `labels` are listed in lr.classes_ order.
print(classification_report(y_test, y_pred,
                            labels=lr.classes_,
                            target_names=labels))

               precision    recall  f1-score   support

        error       0.40      0.20      0.27        10
   misconduct       0.60      0.62      0.61        29
non_retracted       0.89      0.91      0.90       157

     accuracy                           0.83       196
    macro avg       0.63      0.58      0.59       196
 weighted avg       0.82      0.83      0.82       196



Confusion matrix

In [None]:
# Rows are true classes, columns are predicted classes. Pinning `labels` to
# lr.classes_ keeps the matrix shape and order fixed even if a class is
# absent from y_test/y_pred.
print(confusion_matrix(y_test, y_pred, labels=lr.classes_))

[[  2   0   8]
 [  1  18  10]
 [  2  12 143]]


Top 30 indicative words

In [None]:
# Feature names in the column order of the tf-idf matrix. get_feature_names()
# was removed in scikit-learn 1.2, so prefer get_feature_names_out() and fall
# back to the old method only on releases that lack it.
get_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
feature_array = np.asarray(get_names())

# Average each term's tf-idf weight over ALL test documents and rank terms by
# that average. Arg-sorting the document-by-term matrix row-wise and then
# flattening it would rank only the last document's terms. Taking the mean on
# the sparse matrix also avoids densifying it.
mean_tfidf = np.asarray(X_test_tfidf.mean(axis=0)).ravel()
tfidf_sorting = np.argsort(mean_tfidf)[::-1]

# NOTE(review): this ranks terms by tf-idf weight, not by how strongly they
# indicate a class; for class-indicative terms inspect lr.coef_ — confirm
# which was intended.
n = 30
top_n = feature_array[tfidf_sorting][:n]



In [None]:
# Display the selected terms.
top_n

array(['actuator', 'forest', 'wax', 'regime', 'stiﬀness', 'strain', 'ﬁlm',
       'corrugate', 'thermal', 'melting', 'composite', 'cnt', 'paraﬃn',
       'melt', 'yarn', 'load', 'heat', 'stress', 'conductivity',
       'capillary', 'expansion', 'conﬁne', 'inﬁltration', 'vertically',
       'vertical', 'pressure', 'expand', 'nanocomposite', 'shape',
       'compress'], dtype='<U105')

## External validation

In [None]:
# Read the external validation CSV; the 'utf-8-sig' codec skips a leading
# byte-order mark if the file has one.
test_dataset = pd.read_csv(
    'Data (CSV)/journal_test_data_set.csv',
    encoding="utf-8-sig",
)

In [None]:
# Preview the first two rows of the external validation data.
test_dataset.iloc[:2]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,ID,All content,Text,Journal_Name,class,Retracted,preprocessed
0,0,10,nr4472,RESEARCH ARTICLE Twins! Microsatellite analysi...,RESEARCH ARTICLE Twins! Microsatellite analysi...,plos one,nr,0,microsatellite analysis embryo egg case ovipar...
1,1,11,nr4473,Is There ‘Anther-Anther Interference’ within a...,Is There ‘Anther-Anther Interference’ within a...,plos one,nr,0,evidence pollinate selective pressure impose m...


In [None]:
# Number of rows in the external validation data.
test_dataset.shape[0]

367

Redefine the test data: from here on, `X_test` and `y_test` refer to the external validation set

In [None]:
# Rebind X_test / y_test to the external validation data; the values from the
# internal train/test split are overwritten here.
X_test = list(test_dataset['preprocessed'].values)
y_test = list(test_dataset['class'].values)

tf-idf - vectorizing the text

In [None]:
# Vectorize the external texts with the already-fitted vectorizer (transform
# only, no refit), so the columns match the features the model was trained on.
X_test_tfidf = tfidf.transform(raw_documents=X_test)

Predict classes for external validation dataset

In [None]:
# Predict a class for every document of the external validation set.
y_pred = lr.predict(X_test_tfidf)

# Fail loudly on labels the model never saw in training: pinning `labels`
# below would otherwise leave them out of the report without any warning.
assert set(y_test) <= set(lr.classes_), "y_test contains labels unseen in training"

# Pin the report rows to the classifier's classes. Without `labels`,
# `target_names` is matched positionally against whichever labels happen to
# occur in y_test/y_pred, so a missing class raises or shifts the names.
# Assumes the display names in `labels` are listed in lr.classes_ order.
print(classification_report(y_test, y_pred,
                            labels=lr.classes_,
                            target_names=labels))

               precision    recall  f1-score   support

        error       0.00      0.00      0.00        39
   misconduct       0.43      0.27      0.33        79
non_retracted       0.74      0.91      0.81       249

     accuracy                           0.67       367
    macro avg       0.39      0.39      0.38       367
 weighted avg       0.59      0.67      0.62       367



Confusion matrix

In [None]:
# Rows are true classes, columns are predicted classes. Pinning `labels` to
# lr.classes_ keeps the matrix shape and order fixed even if a class is
# absent from y_test/y_pred.
print(confusion_matrix(y_test, y_pred, labels=lr.classes_))

[[  0   9  30]
 [  7  21  51]
 [  4  19 226]]
