## Step 1: Import libraries and load the relevant CSV files

In [None]:
!pip install -qqq language-tool-python xgboost

In [None]:
import numpy as np
import pandas as pd
from pandas.plotting import table
import os
import spacy

import matplotlib.pyplot as plt
from google.colab import drive
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, roc_auc_score, precision_score, ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

import re
import nltk
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [None]:
# Mount Google Drive unless the mount point already exists. The check uses
# the same absolute path that is mounted, so it no longer depends on the
# notebook's current working directory.
if not os.path.exists("/content/drive"):
    drive.mount("/content/drive")

## Step 2: Data preprocessing

In [None]:
# Default location of the GPT-3 generated articles on the mounted Drive.
# When that file is missing, an alternative path is read from standard input.
path = "/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/gpt3_produced_final.csv"
if not os.path.exists(path):
    path = input()

In [None]:
# Load the CSV selected by `path` and show its number of rows.
df_gpt3 = pd.read_csv(path)
len(df_gpt3)

In [None]:
# Display the loaded frame for a visual check.
df_gpt3

In [None]:
# Remove the existing "content" column; the following cell renames the
# "gpt3" column to "content", so the name has to be free.
df_gpt3 = df_gpt3.drop(columns=["content"])

In [None]:
# Rename the "gpt3" column to "content" in place, then display the frame.
df_gpt3.rename(columns={"gpt3": "content"}, inplace=True)
df_gpt3

In [None]:
# English stop words from NLTK; tokenize() and grammar_score() test tokens
# for membership in this collection.
stop_words = stopwords.words("english")
# WordNet lemmatizer shared by lemmatize().
lemmatizer = WordNetLemmatizer()

def regex_content(text):
    """Reduce raw text to lower-case ASCII letters and spaces.

    Args:
        text: The raw text of one article.

    Returns:
        The text with every ``http...`` token (up to the next whitespace) and
        every character outside ``a-z``/``A-Z`` replaced by a single space,
        converted to lower case. Runs of spaces are not collapsed.
    """
    # Raw string literals: "\S" inside a normal string is an invalid escape
    # sequence that newer Python versions warn about.
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    return text.lower()

def tokenize(text):
    """Split text on whitespace and drop stop words.

    Args:
        text: Text to tokenise; in this notebook it is the output of
            ``regex_content``.

    Returns:
        A list of the whitespace-separated tokens of ``text`` that are not in
        the module-level ``stop_words`` collection, in their original order.
        Empty tokens are never returned.
    """
    # str.split() without arguments never yields empty strings, whereas
    # re.split on whitespace returns '' for leading/trailing whitespace; those
    # empty tokens are not stop words and used to end up in the result.
    # The set makes each membership test O(1) instead of a list scan.
    excluded = set(stop_words)
    return [tok for tok in text.split() if tok not in excluded]

def lemmatize(tokens):
    """Lemmatise every token with the module-level ``lemmatizer``.

    Args:
        tokens: An iterable of token strings.

    Returns:
        A list with ``lemmatizer.lemmatize(token)`` for each token, in order.
    """
    return list(map(lemmatizer.lemmatize, tokens))

def join_lemmas(lemmas):
    """Remove every character that is not an ASCII letter from a string.

    Note: despite its name, this function joins nothing. It expects a string
    and deletes all non-letter characters, spaces included.

    Args:
        lemmas: The string to clean.

    Returns:
        ``lemmas`` with only the characters ``a-z`` and ``A-Z`` kept.
    """
    non_letters = re.compile("[^a-zA-Z]")
    return non_letters.sub("", lemmas)

In [None]:
# Preprocessing chain for the GPT-3 texts:
# clean -> tokenise -> lemmatise -> one space-separated string per article.
df_gpt3["regexed"] = df_gpt3["content"].apply(regex_content)
df_gpt3["tokens"] = df_gpt3["regexed"].apply(tokenize)
df_gpt3["lemmas"] = df_gpt3["tokens"].apply(lemmatize)
df_gpt3["joined"] = df_gpt3["lemmas"].apply(" ".join)
# Every row of this frame gets label 1 (shown as "AI" in the confusion matrices).
df_gpt3["target"] = 1

In [None]:
# Display the frame with the added preprocessing columns and label.
df_gpt3

In [None]:
# Load the second dataset and show its number of rows. Its rows are labelled
# 0 further down (presumably the human-written articles; confirm).
df_classifier = pd.read_csv("/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/gpt3_promts.csv")
len(df_classifier)

In [None]:
# Display the loaded frame for a visual check.
df_classifier

In [None]:
import ast

# The "lemmas" column is read back from CSV as text and is expected to hold
# the literal form of a Python list of strings. ast.literal_eval parses such
# a literal without executing code, whereas eval() would run whatever the
# file contains.
df_classifier["joined"] = df_classifier["lemmas"].apply(
    lambda raw: " ".join(ast.literal_eval(raw))
)

In [None]:
# Every row of this frame gets label 0 (shown as "Human" in the confusion matrices).
df_classifier["target"] = 0

In [None]:
# Stack both labelled frames. DataFrame.append was removed in pandas 2.0;
# pd.concat with ignore_index=True does the same and renumbers the rows
# 0..n-1, which is what append + reset_index(drop=True) produced.
df_combined = pd.concat([df_classifier, df_gpt3], ignore_index=True)
df_combined

In [None]:
# Persist the combined frame. The row index is written too (no index=False);
# the cells that reload this file drop it again as "Unnamed: 0".
df_combined.to_csv("/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/df_combined.csv")

In [None]:
# Export the test split (features and labels side by side) to a single CSV.
# NOTE(review): X_test and y_test are first assigned by the train_test_split
# cell further down, so that cell must have been run before this one.
to_bogusz = pd.concat([X_test, y_test], axis=1)
to_bogusz.to_csv("/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/to_bogusz.csv", index=False)

In [None]:
# Reload the combined frame; the saved row index comes back as the column
# "Unnamed: 0" and is dropped.
df_combined = pd.read_csv("/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/df_combined.csv")
df_combined = df_combined.drop(columns=["Unnamed: 0"])

# Features are all columns except the label.
X, y = df_combined.drop(columns=["target"]), df_combined["target"]

# A fixed seed makes the 80/20 split reproducible when the cell is re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Persist the train/test split so later sessions can reuse exactly these rows.
_split_files = {
    "/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/X_train.csv": X_train,
    "/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/X_test.csv": X_test,
    "/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/y_train.csv": y_train,
    "/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/y_test.csv": y_test,
}
for _target_path, _part in _split_files.items():
    _part.to_csv(_target_path, index=False)

In [None]:
# Reload the exported test split with a fresh 0..n-1 index and show the
# text of the row labelled 1.
to_bogusz = pd.read_csv(
    "/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/to_bogusz.csv"
).reset_index(drop=True)
to_bogusz.loc[1, "content"]

In [None]:
def text_to_sentences(text):
    """Split text into sentence-like fragments at '.', '?' and '!'.

    Args:
        text: A ``str``; ``numpy.str_`` is a ``str`` subclass and is accepted.

    Returns:
        The list of substrings between the terminators. The terminators are
        removed, surrounding whitespace is kept, and empty strings can occur
        (for example after a final full stop).

    Raises:
        TypeError: If ``text`` is not a string.
    """
    # An explicit check instead of assert: asserts are stripped under -O.
    if not isinstance(text, str):
        raise TypeError(f"text must be a string, got {type(text).__name__}")
    # Raw string; inside a character class the terminators need no escaping.
    return re.split(r"[.?!]", text)

def cut_final(text, max_words=250):
    """Truncate text to a word budget and end it at the last full stop.

    Args:
        text: The text to shorten.
        max_words: Maximum number of whitespace-separated words to keep
            before trimming back to a full stop. Defaults to 250.

    Returns:
        The first ``max_words`` words joined by single spaces, cut after the
        last ``"."`` they contain. If the truncated text contains no full
        stop it is returned as is, instead of collapsing to ``"."``.
    """
    truncated = " ".join(text.split()[:max_words])
    head, stop, _ = truncated.rpartition(".")
    if not stop:
        # No full stop at all: keep the words rather than losing the text.
        return truncated
    return head + "."

# Shorten every article with cut_final and store it back in "content".
to_bogusz["content"] = to_bogusz["content"].apply(cut_final)

In [None]:
# Number of whitespace-separated words per article.
to_bogusz['word_count'] = to_bogusz['content'].apply(lambda x: len(x.split()))

In [None]:
# Same statement as the preceding cell; re-running it recomputes the column
# from the current "content" values.
to_bogusz['word_count'] = to_bogusz['content'].apply(lambda x: len(x.split()))

In [None]:
# Remove the pandas display limit on rows, then print the text of row 22.
pd.set_option("display.max_rows", None)
print(to_bogusz["content"][22])

In [None]:
# Write the shortened articles to the current working directory, without the index.
to_bogusz.to_csv("survey_articles.csv", index=False)

In [None]:
def grammar_score(text, tool):
    """Score a text by grammar issues per word; 1.0 means no issues found.

    Every non-blank fragment from ``text_to_sentences`` is passed to
    ``tool.check`` and the numbers of reported issues are summed. The score
    is ``1 - issues / words``, where ``words`` counts the whitespace-separated
    tokens of ``text`` that are not in ``stop_words``.

    Args:
        text: The text to evaluate.
        tool: An object whose ``check(sentence)`` returns a sized collection
            of issues (presumably a ``language_tool_python.LanguageTool``,
            which this notebook installs; confirm with the caller).

    Returns:
        The score as a float. It is negative when more issues than words are
        found, and ``nan`` when the text has no countable words, because the
        ratio is undefined then (this used to raise ZeroDivisionError).
    """
    error_count = sum(
        len(tool.check(sentence))
        for sentence in text_to_sentences(text)
        if sentence.strip()  # blank fragments contain nothing to check
    )

    # split() ignores leading/trailing whitespace, so no empty-string tokens
    # are counted as words.
    word_count = sum(1 for tok in text.split() if tok not in stop_words)
    if word_count == 0:
        return float("nan")

    return 1 - error_count / word_count

In [None]:
# Display the articles ordered by label, then by word count (not stored).
to_bogusz.sort_values(["target", "word_count"])

## Step 3: Classification tasks (Dummy / MLP)

#### Initialise the TfidfVectorizer and transform the values
Using unigrams, bigrams, and trigrams for vectorization

#### Import data in dataframes

In [None]:
# Reload the persisted train/test split from Drive.
X_train = pd.read_csv("/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/X_train.csv")
X_test = pd.read_csv("/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/X_test.csv")
y_train = pd.read_csv("/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/y_train.csv")
y_test = pd.read_csv("/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/y_test.csv")

# Reload the full combined frame; the saved row index comes back as
# "Unnamed: 0" and is dropped.
df_combined = pd.read_csv("/content/drive/MyDrive/UNIVERSITET/08_semester/Artificial Intelligence & Machine Learning/Code/df_combined.csv", )
df_combined = df_combined.drop(columns=["Unnamed: 0"])

In [None]:
# Number of training rows.
len(X_train)

In [None]:
# Turn the text and label columns into plain Python lists for scikit-learn.
# Note: this rebinds X_train/X_test/y_train/y_test from DataFrames to lists,
# so the cell cannot be re-run without reloading the CSV files first.
X = df_combined["joined"].tolist()
y = df_combined["target"].tolist()

X_train, X_test = X_train["joined"].tolist(), X_test["joined"].tolist()
y_train, y_test = y_train["target"].tolist(), y_test["target"].tolist()

In [None]:
# TF-IDF over unigrams, bigrams and trigrams. The vocabulary and IDF weights
# are learned from the training texts only; fitting on the full corpus would
# let test documents influence the features (data leakage) and bias the
# reported scores.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))

X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

### Baseline classifier

#### Multinomial Naive Bayes baseline classifier

In [None]:
# Multinomial Naive Bayes baseline on the TF-IDF features.
nb = MultinomialNB()
nb.fit(X_train_transformed, y_train)
nb_train_pred = nb.predict(X_train_transformed)
nb_test_pred = nb.predict(X_test_transformed)

print('Accuracy score: ', round(accuracy_score(y_test, nb_test_pred), 4))
print('F1 Score: ', round(f1_score(y_test, nb_test_pred), 4))
print('Recall score: ', round(recall_score(y_test, nb_test_pred), 4))
# Precision and ROC AUC are different metrics; each is reported under its
# own label (ROC AUC used to be printed as "Precision Score").
print('Precision Score: ', round(precision_score(y_test, nb_test_pred), 4))
print('ROC AUC score: ', round(roc_auc_score(y_test, nb_test_pred), 4))

In [None]:
# Confusion matrix of the Naive Bayes test predictions (label 0 shown as "Human", 1 as "AI").
nb_conf = ConfusionMatrixDisplay.from_predictions(y_test, nb_test_pred, display_labels=["Human", "AI"], cmap=plt.cm.Blues)

### Support vector machine


#### Support vector machine optimised with GridSearchCV and displayed in a confusion matrix

In [None]:
# Grid search over SVC hyper-parameters with 10-fold cross-validation.
svc = SVC(random_state=42)


C = [0.01, 0.1, 1, 10, 100]
# The third entry was written `00.1`, which Python reads as 0.1 and which
# duplicated the next value. Presumably 0.01 was meant (digits transposed);
# confirm the intended grid.
gamma = [0.00001, 0.0001, 0.01, 0.1, 1]
kernel = ['linear', 'rbf', 'poly', 'sigmoid']

param_grid = {
    "C": C,
    "gamma": gamma,
    "kernel": kernel,
}

svc_grid = GridSearchCV(svc, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2)
svc_grid.fit(X_train_transformed, y_train)
svc_grid_pred = svc_grid.best_estimator_.predict(X_test_transformed)

print('Accuracy score: ', round(accuracy_score(y_test, svc_grid_pred), 4))
print('F1 Score: ', round(f1_score(y_test, svc_grid_pred), 4))
print('Recall score: ', round(recall_score(y_test, svc_grid_pred), 4))
# Precision and ROC AUC are different metrics; each is reported under its
# own label (ROC AUC used to be printed as "Precision Score").
print('Precision Score: ', round(precision_score(y_test, svc_grid_pred), 4))
print('ROC AUC score: ', round(roc_auc_score(y_test, svc_grid_pred), 4))

In [None]:
# Second SVC grid search, exploring larger values of C.
svc = SVC(random_state=42)


C = [1, 10, 50, 100, 200, 300, 1000]
# The third entry was written `00.1`, which Python reads as 0.1 and which
# duplicated the next value. Presumably 0.01 was meant (digits transposed);
# confirm the intended grid.
gamma = [0.00001, 0.0001, 0.01, 0.1, 1]
kernel = ['linear', 'rbf', 'poly', 'sigmoid']

param_grid = {
    "C": C,
    "gamma": gamma,
    "kernel": kernel,
}

svc_grid = GridSearchCV(svc, param_grid=param_grid, cv=10, verbose=3)
svc_grid.fit(X_train_transformed, y_train)
svc_grid_pred = svc_grid.best_estimator_.predict(X_test_transformed)

print('Accuracy score: ', round(accuracy_score(y_test, svc_grid_pred), 4))
print('F1 Score: ', round(f1_score(y_test, svc_grid_pred), 4))
print('Recall score: ', round(recall_score(y_test, svc_grid_pred), 4))
# Precision and ROC AUC are different metrics; each is reported under its
# own label (ROC AUC used to be printed as "Precision Score").
print('Precision Score: ', round(precision_score(y_test, svc_grid_pred), 4))
print('ROC AUC score: ', round(roc_auc_score(y_test, svc_grid_pred), 4))

In [None]:
# Confusion matrix of the most recent SVC grid-search test predictions.
conf = ConfusionMatrixDisplay.from_predictions(y_test, svc_grid_pred, display_labels=["Human", "AI"], cmap=plt.cm.Blues)

#### XGBoost classifier to compare with SVM

In [None]:
def algorithm_pipeline(X_train_data, X_test_data, y_train_data, y_test_data, 
                       model, param_grid, cv=10, scoring_fit='neg_mean_squared_error',
                       do_probabilities = False):
    """Grid-search a model on the training data and predict on the test data.

    Args:
        X_train_data: Training features passed to ``GridSearchCV.fit``.
        X_test_data: Features to predict on after fitting.
        y_train_data: Training labels passed to ``GridSearchCV.fit``.
        y_test_data: Accepted but not used by this function.
        model: The estimator to tune.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        cv: Cross-validation setting handed to ``GridSearchCV``.
        scoring_fit: Scoring name handed to ``GridSearchCV``.
        do_probabilities: When true, return ``predict_proba`` output instead
            of ``predict`` output.

    Returns:
        A tuple ``(fitted_search, predictions)``.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        scoring=scoring_fit,
        verbose=2,
    )
    fitted_model = search.fit(X_train_data, y_train_data)

    # Pick the prediction method once, then apply it to the test features.
    predictor = fitted_model.predict_proba if do_probabilities else fitted_model.predict
    return fitted_model, predictor(X_test_data)

In [None]:
# Grid search over XGBoost hyper-parameters with 10-fold cross-validation.
xgb_model = xgb.XGBClassifier(random_state=42, n_estimators=50, nthread=-1)
param_grid = {
    'colsample_bytree':[0.8],
    'max_depth': [3,4],
    'min_child_weight': [4,5],
    'subsample':[i/10.0 for i in range(6,11)],
    'gamma':[i/10.0 for i in range(3,6)],
}

xgb_grid = GridSearchCV(xgb_model, param_grid=param_grid, verbose=3, cv=10)
xgb_grid.fit(X_train_transformed, y_train)
xgb_grid_pred = xgb_grid.predict(X_test_transformed)

print('Accuracy score: ', round(accuracy_score(y_test, xgb_grid_pred), 4))
print('F1 Score: ', round(f1_score(y_test, xgb_grid_pred), 4))
print('Recall score: ', round(recall_score(y_test, xgb_grid_pred), 4))
# Precision and ROC AUC are different metrics; each is reported under its
# own label (ROC AUC used to be printed as "Precision Score").
print('Precision Score: ', round(precision_score(y_test, xgb_grid_pred), 4))
print('ROC AUC score: ', round(roc_auc_score(y_test, xgb_grid_pred), 4))

In [None]:
# Show the parameter combination selected by the XGBoost grid search.
print(xgb_grid.best_params_)

In [None]:
# Test-set metrics of the XGBoost grid-search predictions.
print('Accuracy score: ', round(accuracy_score(y_test, xgb_grid_pred), 4))
print('F1 Score: ', round(f1_score(y_test, xgb_grid_pred), 4))
print('Recall score: ', round(recall_score(y_test, xgb_grid_pred), 4))
# Precision and ROC AUC are different metrics; each is reported under its
# own label (ROC AUC used to be printed as "Precision Score").
print('Precision Score: ', round(precision_score(y_test, xgb_grid_pred), 4))
print('ROC AUC score: ', round(roc_auc_score(y_test, xgb_grid_pred), 4))

In [None]:
# Confusion matrix of the XGBoost grid-search test predictions.
xgb_conf = ConfusionMatrixDisplay.from_predictions(y_test, xgb_grid_pred, display_labels=["Human", "AI"], cmap=plt.cm.Blues)

In [None]:
# XGBoost with hand-picked parameters (no grid search).
xgb_model = xgb.XGBClassifier(random_state=42, n_estimators=10, max_depth=6, max_leaves=2, min_child_weight=8)

xgb_model.fit(X_train_transformed, y_train)
xgb_pred = xgb_model.predict(X_test_transformed)

print('Accuracy score: ', round(accuracy_score(y_test, xgb_pred), 4))
print('F1 Score: ', round(f1_score(y_test, xgb_pred), 4))
print('Recall score: ', round(recall_score(y_test, xgb_pred), 4))
# Precision and ROC AUC are different metrics; each is reported under its
# own label (ROC AUC used to be printed as "Precision Score").
print('Precision Score: ', round(precision_score(y_test, xgb_pred), 4))
print('ROC AUC score: ', round(roc_auc_score(y_test, xgb_pred), 4))

#### Extra: spaCy linguistic analysis

In [None]:
# Run the small English spaCy pipeline on the first article and tabulate
# per-token attributes.
nlp = spacy.load("en_core_web_sm")
text = df_combined["content"][0]
doc = nlp(text)

columns = ["TEXT", "LEMMA", "POS", "TAG", "DEP", "SHAPE", "IS_ALPHA", "IS_STOP"]
# One row per token, in the same order as the column names above.
token_table = [
    [tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
     tok.shape_, tok.is_alpha, tok.is_stop]
    for tok in doc
]
token_df = pd.DataFrame(token_table, columns=columns)
token_df

In [None]:
# Render the first 50 token rows as a matplotlib table on an axes with no
# frame and hidden x/y axes.
ax = plt.subplot(111, frame_on=False) # no visible frame
ax.xaxis.set_visible(False)  # hide the x axis
ax.yaxis.set_visible(False)  # hide the y axis

table(ax, token_df[0:50])

In [None]:
# Count the rows per value of the "category" column of the test split.
# NOTE(review): an earlier cell rebinds X_test to a plain list
# (X_test.joined.tolist()), so this only works while X_test is still the
# DataFrame read from X_test.csv; a "category" column is assumed to exist.
X_test["category"].value_counts()

In [None]:
# Display the "word_count" column of the combined frame.
# NOTE(review): no visible cell adds "word_count" to df_combined (only
# to_bogusz gets one); presumably it comes from one of the source CSVs. Confirm.
df_combined["word_count"]

#### Create classifier and display confusion matrix

In [None]:
# Majority-class baseline: always predicts the most frequent training label.
clf_dummy = DummyClassifier(strategy="most_frequent").fit(X_train_transformed, y_train)
pred_dummy = clf_dummy.predict(X_test_transformed)

print('Accuracy score: ', round(accuracy_score(y_test, pred_dummy), 4))
# A constant prediction may contain no positive predictions at all;
# zero_division=0 reports 0 in that case without emitting a warning.
print('F1 Score: ', round(f1_score(y_test, pred_dummy, zero_division=0), 4))
print('Recall score: ', round(recall_score(y_test, pred_dummy), 4))
# Precision and ROC AUC are different metrics; each is reported under its
# own label (ROC AUC used to be printed as "Precision Score").
print('Precision Score: ', round(precision_score(y_test, pred_dummy, zero_division=0), 4))
print('ROC AUC score: ', round(roc_auc_score(y_test, pred_dummy), 4))

In [None]:
# Confusion matrix of the dummy baseline, evaluated on the same TF-IDF
# features it was fitted and scored on rather than on the raw X_test object.
confusion_matrix = ConfusionMatrixDisplay.from_estimator(clf_dummy, X_test_transformed, y_test, display_labels=["Human", "AI"], cmap=plt.cm.Blues)

#### Confusion matrix for human results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Counts from the human evaluation, entered by hand.
tp, tn, fp, fn = 98, 117, 94, 120

# Rows are true labels, columns predicted labels, both ordered Human, AI.
cm = np.array([[tn, fp], [fn, tp]], dtype=float)

fig, ax = plt.subplots(figsize=(5,5))

heatmap_options = dict(
    annot=True, ax=ax, square=True,
    cmap=plt.cm.Blues, vmin=0, vmax=130, fmt=".0f",
    xticklabels=["Human", "AI"],
    yticklabels=["Human", "AI"],
    cbar_kws={"shrink": .82},
)
ax = sns.heatmap(cm, **heatmap_options)

# Draw a thin visible border around the heatmap.
for spine in ax.spines.values():
    spine.set_visible(True)
    spine.set(linewidth=1)

ax.set_xlabel('Predicted label', fontdict={'fontsize': 10})
ax.set_ylabel('True label', fontdict={'fontsize': 10})

plt.show()

In [None]:
def convert_to_float(tp, tn, fp, fn):
    """Convert the four confusion-matrix counts to floats.

    Args:
        tp: True positives.
        tn: True negatives.
        fp: False positives.
        fn: False negatives.

    Returns:
        The tuple ``(tp, tn, fp, fn)`` with every value converted by
        ``float``, in the same order as the parameters.
    """
    # The second element must be tn; returning fn there overwrote the
    # true-negative count for every metric computed afterwards.
    return float(tp), float(tn), float(fp), float(fn)

def acc_score():
    """Return accuracy, (tp + tn) / (tp + tn + fp + fn), from the module-level counts."""
    correct = tp + tn
    return correct / (correct + fp + fn)

def prec_score():
    """Return precision, tp / (tp + fp), from the module-level counts."""
    predicted_positive = tp + fp
    return tp / predicted_positive

def recall():
    """Return recall, tp / (tp + fn), from the module-level counts."""
    actual_positive = tp + fn
    return tp / actual_positive

def f1_score():
    """Return the F1 score computed from ``prec_score()`` and ``recall()``.

    Note: this definition rebinds the name ``f1_score`` that the notebook
    imports from ``sklearn.metrics``; cells that call
    ``f1_score(y_true, y_pred)`` fail if run after this one.
    """
    precision, sensitivity = prec_score(), recall()
    return 2 * (precision * sensitivity) / (precision + sensitivity)

In [None]:
# Rebind the module-level counts to the values returned by convert_to_float
# and show the resulting type of tp.
tp, tn, fp, fn = convert_to_float(tp, tn, fp, fn)
type(tp)

In [None]:
# Print each human-evaluation metric rounded to four decimals.
_metric_report = (
    ("Accuracy score: {}", acc_score),
    ("Precision score: {}", prec_score),
    ("Recall: {}", recall),
    ("F1 score: {}", f1_score),
)
for _template, _metric in _metric_report:
    print(_template.format(round(_metric(), 4)))