Mount Google Drive and set the working directory

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the project folder used by
# the rest of the notebook lives underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
cd '/content/drive/MyDrive/Thesis_UU/push/2group'

Import libraries

In [None]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes

Load the labelled dataset (it is split into train and test sets below)

In [None]:
# Read the labelled dataset; the path is relative to the working directory
# selected earlier. "utf-8-sig" strips a leading byte-order mark if present.
TRAIN_CSV_PATH = 'Data (CSV)/journal_train_data_set_common.csv'
train_dataset = pd.read_csv(TRAIN_CSV_PATH, encoding="utf-8-sig")

In [None]:
# Remove the two 'Unnamed' columns (presumably index columns written out by
# earlier CSV exports — confirm against the data pipeline).
# The result is rebound instead of using inplace=True, and errors='ignore'
# is passed, so the cell is idempotent: running it a second time no longer
# raises KeyError once the columns are already gone.
train_dataset = train_dataset.drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1'],
    errors='ignore',
)

In [None]:
# Preview the first two rows as a quick sanity check of the loaded columns.
train_dataset.head(2)

In [None]:
# Model inputs: the 'preprocessed' column, one document per row.
X = list(train_dataset['preprocessed'].values)

# Targets: the 'Retracted' column holds the labels we want to predict --> Y
y = list(train_dataset['Retracted'].values)

Splitting data into train and test set

In [None]:
# Hold out 25% of the rows for testing. The split is stratified on the labels
# so both parts keep the same class proportions, and the fixed random_state
# makes it reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
    stratify=y,
)

TF-IDF: vectorizing the text

In [None]:
# Learn the vocabulary and IDF weights from the training documents only, so
# nothing from the hold-out split leaks into the features.
Tfidf_vect = TfidfVectorizer()

# fit_transform gives the same matrix as fit() followed by transform() on the
# same corpus, but processes the training text in a single pass.
X_train_tfidf = Tfidf_vect.fit_transform(X_train)

# The hold-out documents are only transformed with the already-fitted
# vectorizer; terms not seen during fitting are ignored.
x_test_tfidf = Tfidf_vect.transform(X_test)

Fit Naive Bayes classifier

In [None]:
# Multinomial Naive Bayes with default hyper-parameters, trained on the
# TF-IDF features of the training split. The fit call is left as the last
# expression so the notebook displays the fitted estimator.
Naive = naive_bayes.MultinomialNB()
Naive.fit(X=X_train_tfidf, y=y_train)

Predict classes for test set

In [None]:
# Display names for the rows of the classification reports below.
# NOTE(review): classification_report pairs target_names with the label
# values in sorted order, so this assumes the 'Retracted' encoding sorts the
# non-retracted class first (e.g. 0/1) — confirm against the data.
labels = ['non_retracted', 'retracted']

In [None]:
# Predict the class of every hold-out document.
y_pred = Naive.predict(x_test_tfidf)

# Per-class precision / recall / F1 on the hold-out split.
holdout_report = classification_report(y_test, y_pred, target_names=labels)
print(holdout_report)

Confusion matrix

In [None]:
# Rows are the true classes, columns are the predicted classes.
holdout_confusion = confusion_matrix(y_test, y_pred)
print(holdout_confusion)

Top 30 indicative words

In [None]:
# Rank the vocabulary by average TF-IDF weight over the hold-out documents
# and keep the n heaviest terms.

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name only on versions that predate it.
_get_feature_names = (
    getattr(Tfidf_vect, 'get_feature_names_out', None)
    or Tfidf_vect.get_feature_names
)
feature_array = np.asarray(_get_feature_names())

# Aggregate over documents before sorting. Calling argsort directly on the
# 2-D matrix sorts every row separately, so flattening and reversing it only
# surfaces the terms of the last document. Taking the column mean on the
# sparse matrix also avoids materialising a dense documents x vocabulary array.
mean_tfidf = np.asarray(x_test_tfidf.mean(axis=0)).ravel()
tfidf_sorting = np.argsort(mean_tfidf)[::-1]  # heaviest terms first

# NOTE(review): this is a corpus-level salience ranking, not a per-class one.
# If class-indicative words are wanted, comparing the classifier's per-class
# feature log-probabilities would be the place to look — confirm the intent.
n = 30
top_n = feature_array[tfidf_sorting][:n]

In [None]:
# Display the selected terms.
top_n

## External validation

In [None]:
# Read the separate dataset used for external validation; the path is
# relative to the working directory selected earlier.
EXTERNAL_CSV_PATH = 'Data (CSV)/journal_test_data_set_common.csv'
test_dataset = pd.read_csv(EXTERNAL_CSV_PATH, encoding="utf-8-sig")

In [None]:
# Preview the first two rows of the external validation data.
test_dataset.head(2)

In [None]:
# Number of rows in the external validation data.
len(test_dataset)

Redefine the test data as the external validation set

In [None]:
# Replace the hold-out split with the external validation data.
# Caution: X_test / y_test are rebound here, so re-running the earlier
# hold-out evaluation cells after this point would score the hold-out
# predictions against these external labels unless the split cell is
# re-run first.
X_test = list(test_dataset['preprocessed'].values)
y_test = list(test_dataset['Retracted'].values)

TF-IDF: vectorizing the external validation text

In [None]:
# Vectorise the external documents with the vectorizer fitted on the training
# split (transform only, no refitting).
# Note the capital X: this is a different variable from x_test_tfidf, which
# still holds the hold-out features computed earlier.
X_test_tfidf = Tfidf_vect.transform(X_test)

Predict classes for external validation dataset

In [None]:
# Predict the class of every external validation document.
y_pred = Naive.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the external validation data.
external_report = classification_report(y_test, y_pred, target_names=labels)
print(external_report)

Confusion matrix

In [None]:
# Rows are the true classes, columns are the predicted classes.
external_confusion = confusion_matrix(y_test, y_pred)
print(external_confusion)