Connect to Google Drive and set the working directory

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
cd '/content/drive/MyDrive/Thesis_UU/push/2group'

Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

Load the dataset that will be split into train and test sets

In [None]:
# Read the journal training CSV (path is relative to the current working directory).
# The "utf-8-sig" codec strips a leading byte-order mark if the file has one.
train_dataset = pd.read_csv(
    'Data (CSV)/journal_train_data_set_common.csv',
    encoding="utf-8-sig",
)

In [None]:
# Preview the first two rows to check that the file was parsed as expected.
train_dataset.head(n=2)

In [None]:
# Number of rows in the loaded dataset.
train_dataset.shape[0]

In [None]:
# Remove the two unnamed columns (presumably leftover indexes from earlier CSV exports — confirm).
# Reassigning instead of using inplace=True, together with errors='ignore', keeps this cell
# safe to re-run: a second execution no longer raises KeyError for the already-removed columns.
train_dataset = train_dataset.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [None]:
# Model input: the entries of the 'preprocessed' column, one per row.
X = list(train_dataset['preprocessed'].values)
# Prediction target: the 'Retracted' column.
y = list(train_dataset['Retracted'].values)

Splitting data into train and test set

In [None]:
# Hold out 25% of the rows for evaluation. stratify=y preserves the class proportions
# in both partitions; random_state=1 makes the split repeatable.
X_train_str, X_test_str, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=1,
)

In [None]:
# Display names passed to the classification reports as target_names.
# NOTE(review): assumes 'Retracted' is encoded as 0 = non-retracted, 1 = retracted — confirm.
labels = [
    'non_retracted',
    'retracted',
]

TF-IDF: vectorizing the text

In [None]:
# Fit the vectorizer on the training texts only, then apply the fitted vectorizer
# unchanged to the held-out texts (transform, not fit_transform).
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train_str)
X_test_tfidf = tfidf.transform(X_test_str)

Hyperparameter tuning

The hyperparameter-tuning code below is based on https://machinelearningmastery.com/hyperparameters-for-classification-machine-learning-algorithms/

In [None]:
# Candidate estimator and the hyperparameter values to try.
model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
grid = {'solver': solvers, 'penalty': penalty, 'C': c_values}

# 10 stratified folds repeated 3 times -> 30 train/validation splits per parameter combination.
kf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='f1_weighted',
    cv=kf,
    n_jobs=-1,      # run on all available cores
    error_score=0,  # a fit that raises is scored 0 instead of aborting the search
)
grid_result = grid_search.fit(X_train_tfidf, y_train)

# Report the best configuration, then the mean (std) CV score of every candidate.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

Fit LR classifier

In [None]:
# Final classifier with fixed hyperparameters, trained on the TF-IDF training matrix.
# NOTE(review): presumably the best combination reported by the grid search — confirm.
lr = LogisticRegression(C=100, penalty="l2", solver='newton-cg')
lr.fit(X_train_tfidf, y_train)

Predict classes for test set

In [None]:
# Predict the held-out split and print the per-class classification report.
y_pred = lr.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels))

Confusion matrix

In [None]:
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

Top 30 indicative words

In [None]:
# Rank vocabulary terms by their mean TF-IDF weight over the documents in X_test_tfidf.
# NOTE(review): this ranks by term weight only; if "indicative" is meant per class,
# the classifier coefficients (lr.coef_) are the more direct source — confirm intent.

# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old accessor on versions that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    feature_array = np.asarray(tfidf.get_feature_names_out())
else:
    feature_array = np.asarray(tfidf.get_feature_names())

# The matrix has one row per document. Argsorting it row-wise and flattening made the
# leading entries come from the last row only; averaging each column first yields one
# score per term and avoids converting the whole sparse matrix to a dense array.
mean_tfidf = np.asarray(X_test_tfidf.mean(axis=0)).ravel()
tfidf_sorting = np.argsort(mean_tfidf)[::-1]  # term indices, highest mean weight first

n = 30
top_n = feature_array[tfidf_sorting][:n]

In [None]:
# Display the selected feature names.
top_n

## External validation

In [None]:
# Read the external validation CSV (path is relative to the current working directory).
# The "utf-8-sig" codec strips a leading byte-order mark if the file has one.
test_dataset = pd.read_csv(
    'Data (CSV)/journal_test_data_set_common.csv',
    encoding="utf-8-sig",
)

In [None]:
# Preview the first two rows of the external validation data.
test_dataset.head(n=2)

In [None]:
# Number of rows in the external validation data.
test_dataset.shape[0]

Redefining test data

In [None]:
# Inputs and targets of the external validation set.
# Note: y_test is rebound here, replacing the labels of the earlier internal hold-out split.
X_test = list(test_dataset['preprocessed'].values)
y_test = list(test_dataset['Retracted'].values)

TF-IDF: vectorizing the external validation text

In [None]:
# Apply the already-fitted vectorizer (transform only, no refit) to the external texts.
# Note: this rebinds X_test_tfidf, replacing the matrix of the internal hold-out split.
X_test_tfidf = tfidf.transform(X_test)

Predict classes for external validation dataset

In [None]:
# Predict the external validation set with the already-trained classifier and
# print the per-class classification report. y_pred is rebound here.
y_pred = lr.predict(X_test_tfidf)
print(classification_report(y_test, y_pred, target_names=labels))

Confusion matrix

In [None]:
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))