In [1]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import re
from sklearn.utils import shuffle
import nltk
from nltk.corpus import stopwords
import string
import csv
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

## Train Test Val Split - Files

In [None]:
# ------------------------------------ BBC ------------------------------------
bbc_news_files = os.listdir('BBC/human_BBC_news/')
bbc_labels = np.zeros(len(bbc_news_files))

X_bbc_train_files, X_bbc_test_files, Y_bbc_train, Y_bbc_test =  train_test_split(bbc_news_files, bbc_labels, test_size=0.3)


X_bbc_val_files = X_bbc_test_files[:len(X_bbc_test_files)//2]
X_bbc_test_files = X_bbc_test_files[len(X_bbc_test_files)//2:]
Y_bbc_val = Y_bbc_test[:len(Y_bbc_test)//2]
Y_bbc_test = Y_bbc_test[len(Y_bbc_test)//2:]

# ------------------------------------ TechCrunch ------------------------------------
techCrunch_news_files = os.listdir('TechCrunch/human_TechCrunch_news/')
techCrunch_labels = np.zeros(len(techCrunch_news_files))

X_techCrunch_train_files, X_techCrunch_test_files, Y_techCrunch_train, Y_techCrunch_test =  train_test_split(techCrunch_news_files, techCrunch_labels, test_size=0.3)


X_techCrunch_val_files = X_techCrunch_test_files[:len(X_techCrunch_test_files)//2]
X_techCrunch_test_files = X_techCrunch_test_files[len(X_techCrunch_test_files)//2:]
Y_techCrunch_val = Y_techCrunch_test[:len(Y_techCrunch_test)//2]
Y_techCrunch_test = Y_techCrunch_test[len(Y_techCrunch_test)//2:]

# ------------------------------------ TheVerge ------------------------------------
theVerge_news_files = os.listdir('TheVerge/human_TheVerge_news/')
theVerge_labels = np.zeros(len(theVerge_news_files))

X_theVerge_train_files, X_theVerge_test_files, Y_theVerge_train, Y_theVerge_test =  train_test_split(theVerge_news_files, theVerge_labels, test_size=0.3)


X_theVerge_val_files = X_theVerge_test_files[:len(X_theVerge_test_files)//2]
X_theVerge_test_files = X_theVerge_test_files[len(X_theVerge_test_files)//2:]
Y_theVerge_val = Y_theVerge_test[:len(Y_theVerge_test)//2]
Y_theVerge_test = Y_theVerge_test[len(Y_theVerge_test)//2:]


In [3]:
print(f'BBC --> train_files: {len(X_bbc_train_files)}, train_labels: {len(Y_bbc_train)}, test_files: {len(X_bbc_test_files)}, test_labels: {len(Y_bbc_test)}, val_files: {len(X_bbc_val_files)}, val_labels: {len(Y_bbc_val)}')
print(f'Tech Crunch --> train_files: {len(X_techCrunch_train_files)}, train_labels: {len(Y_techCrunch_train)}, test_files: {len(X_techCrunch_test_files)}, test_labels: {len(Y_techCrunch_test)}, val_files: {len(X_techCrunch_val_files)}, val_labels: {len(Y_techCrunch_val)}')
print(f'The Verge --> train_files: {len(X_theVerge_train_files)}, train_labels: {len(Y_theVerge_train)}, test_files: {len(X_theVerge_test_files)}, test_labels: {len(Y_theVerge_test)}, val_files: {len(X_theVerge_val_files)}, val_labels: {len(Y_theVerge_val)}')

BBC --> train_files: 142, train_labels: 142, test_files: 31, test_labels: 31, val_files: 30, val_labels: 30
Tech Crunch --> train_files: 176, train_labels: 176, test_files: 38, test_labels: 38, val_files: 38, val_labels: 38
The Verge --> train_files: 117, train_labels: 117, test_files: 26, test_labels: 26, val_files: 25, val_labels: 25


## Load Human Written News

In [4]:
def load_data(file):
    content = None
    folder = 'BBC/human_BBC_news/' if 'BBC' in file else 'TechCrunch/human_TechCrunch_news/' if 'TechCrunch' in file else 'TheVerge/human_TheVerge_news/'
    with open(f'{folder}{file}', 'r', encoding="utf-8") as news:
        content = news.read()
    return content

X_bbc_train = np.array(list(map(load_data, X_bbc_train_files)))
X_bbc_test = np.array(list(map(load_data, X_bbc_test_files)))
X_bbc_val = np.array(list(map(load_data, X_bbc_val_files)))

X_techCrunch_train = np.array(list(map(load_data, X_techCrunch_train_files)))
X_techCrunch_test = np.array(list(map(load_data, X_techCrunch_test_files)))
X_techCrunch_val = np.array(list(map(load_data, X_techCrunch_val_files)))

X_theVerge_train = np.array(list(map(load_data, X_theVerge_train_files)))
X_theVerge_test = np.array(list(map(load_data, X_theVerge_test_files)))
X_theVerge_val = np.array(list(map(load_data, X_theVerge_val_files)))


##  Human Datasets

In [5]:
X_human_train = np.hstack((X_bbc_train, X_techCrunch_train, X_theVerge_train))
Y_human_train = np.hstack((Y_bbc_train, Y_techCrunch_train, Y_theVerge_train))
print(f'Human train: {len(X_human_train)}, {len(Y_human_train)}')

X_human_test = np.hstack((X_bbc_test, X_techCrunch_test, X_theVerge_test))
Y_human_test = np.hstack((Y_bbc_test, Y_techCrunch_test, Y_theVerge_test))
print(f'Human test: {len(X_human_test)}, {len(Y_human_test)}')

X_human_val = np.hstack((X_bbc_val, X_techCrunch_val, X_theVerge_val))
Y_human_val = np.hstack((Y_bbc_val, Y_techCrunch_val, Y_theVerge_val))
print(f'Human val: {len(X_human_val)}, {len(Y_human_val)}')

Human train: 435, 435
Human test: 95, 95
Human val: 93, 93


In [7]:
## -------------------- Pickle --------------------

# human_train= [X_human_train, X_human_test, X_human_val]

# filename = '../dataset/news_pickles/train_test_val_Human'
# file = open(filename, 'wb')
# pickle.dump(human_train, file)
# file.close()


## Load Human PosTags Files

In [None]:
def load_data(file):
    content = None
    pos_tags = None
    folder = 'BBC/human_BBC_news/' if 'BBC' in file else 'TechCrunch/human_TechCrunch_news/' if 'TechCrunch' in file else 'TheVerge/human_TheVerge_news/'
    with open(f'{folder}{file}', 'r', encoding="utf-8") as news:
        content = news.read()

        tokens = nltk.word_tokenize(content)
        pos_tag_tuples = nltk.pos_tag(tokens, lang='eng')
        pos_tags = [tag[1] for tag in pos_tag_tuples]
        
    return pos_tags

X_bbc_human_posTags_train = np.array(list(map(load_data, X_bbc_train_files)))
X_bbc_human_posTags_test = np.array(list(map(load_data, X_bbc_test_files)))
X_bbc_human_posTags_val = np.array(list(map(load_data, X_bbc_val_files)))

X_techCrunch_human_posTags_train = np.array(list(map(load_data, X_techCrunch_train_files)))
X_techCrunch_human_posTags_test = np.array(list(map(load_data, X_techCrunch_test_files)))
X_techCrunch_human_posTags_val = np.array(list(map(load_data, X_techCrunch_val_files)))

X_theVerge_human_posTags_train = np.array(list(map(load_data, X_theVerge_train_files)))
X_theVerge_human_posTags_test = np.array(list(map(load_data, X_theVerge_test_files)))
X_theVerge_human_posTags_val = np.array(list(map(load_data, X_theVerge_val_files)))


In [9]:
X_human_train_posTags = np.hstack((X_bbc_human_posTags_train, X_techCrunch_human_posTags_train, X_theVerge_human_posTags_train))
print(f'Human train: {len(X_human_train_posTags)}')

X_human_test_posTags = np.hstack((X_bbc_human_posTags_test, X_techCrunch_human_posTags_test, X_theVerge_human_posTags_test))
print(f'Human test: {len(X_human_test_posTags)}')

X_human_val_posTags = np.hstack((X_bbc_human_posTags_val, X_techCrunch_human_posTags_val, X_theVerge_human_posTags_val))
print(f'Human val: {len(X_human_val_posTags)}')

Human train: 435
Human test: 95
Human val: 93


In [10]:
## -------------------- Pickle --------------------

# human_posTags= [X_human_train_posTags, X_human_test_posTags, X_human_val_posTags]

# filename = '../dataset/news_pickles/train_test_val_Human_posTags'
# file = open(filename, 'wb')
# pickle.dump(human_posTags, file)
# file.close()

## Load GPT Written News
#### Randomizing which set (Train, Test or Val) the human and GPT version of a same news will be placed.

In [11]:
# ------------------------------------ BBC ------------------------------------
bbc_news_GPT_files = os.listdir('BBC/GPT_BBC_news/')
bbc_GPT_labels = np.ones(len(bbc_news_GPT_files))

X_bbc_GPT_train_files, X_bbc_GPT_test_files, Y_bbc_GPT_train, Y_bbc_GPT_test =  train_test_split(bbc_news_GPT_files, bbc_GPT_labels, test_size=0.3)

print(X_bbc_GPT_train_files)

X_bbc_GPT_val_files = X_bbc_GPT_test_files[:len(X_bbc_GPT_test_files)//2]
X_bbc_GPT_test_files = X_bbc_GPT_test_files[len(X_bbc_GPT_test_files)//2:]
Y_bbc_GPT_val = Y_bbc_GPT_test[:len(Y_bbc_GPT_test)//2]
Y_bbc_GPT_test = Y_bbc_GPT_test[len(Y_bbc_GPT_test)//2:]

# ------------------------------------ TechCrunch ------------------------------------
techCrunch_news_GPT_files = os.listdir('TechCrunch/GPT_TechCrunch_news/')
techCrunch_GPT_labels = np.ones(len(techCrunch_news_GPT_files))

X_techCrunch_GPT_train_files, X_techCrunch_GPT_test_files, Y_techCrunch_GPT_train, Y_techCrunch_GPT_test =  train_test_split(techCrunch_news_GPT_files, techCrunch_GPT_labels, test_size=0.3)


X_techCrunch_GPT_val_files = X_techCrunch_GPT_test_files[:len(X_techCrunch_GPT_test_files)//2]
X_techCrunch_GPT_test_files = X_techCrunch_GPT_test_files[len(X_techCrunch_GPT_test_files)//2:]
Y_techCrunch_GPT_val = Y_techCrunch_GPT_test[:len(Y_techCrunch_GPT_test)//2]
Y_techCrunch_GPT_test = Y_techCrunch_GPT_test[len(Y_techCrunch_GPT_test)//2:]

# ------------------------------------ TheVerge ------------------------------------
theVerge_news_GPT_files = os.listdir('TheVerge/GPT_TheVerge_news/')
theVerge_GPT_labels = np.ones(len(theVerge_news_GPT_files))

X_theVerge_GPT_train_files, X_theVerge_GPT_test_files, Y_theVerge_GPT_train, Y_theVerge_GPT_test =  train_test_split(theVerge_news_GPT_files, theVerge_GPT_labels, test_size=0.3)


X_theVerge_GPT_val_files = X_theVerge_GPT_test_files[:len(X_theVerge_GPT_test_files)//2]
X_theVerge_GPT_test_files = X_theVerge_GPT_test_files[len(X_theVerge_GPT_test_files)//2:]
Y_theVerge_GPT_val = Y_theVerge_GPT_test[:len(Y_theVerge_GPT_test)//2]
Y_theVerge_GPT_test = Y_theVerge_GPT_test[len(Y_theVerge_GPT_test)//2:]


['205_BBC_gpt.txt', '71_BBC_gpt.txt', '98_BBC_gpt.txt', '66_BBC_gpt.txt', '166_BBC_gpt.txt', '204_BBC_gpt.txt', '225_BBC_gpt.txt', '9_BBC_gpt.txt', '16_BBC_gpt.txt', '227_BBC_gpt.txt', '183_BBC_gpt.txt', '111_BBC_gpt.txt', '46_BBC_gpt.txt', '216_BBC_gpt.txt', '192_BBC_gpt.txt', '83_BBC_gpt.txt', '30_BBC_gpt.txt', '217_BBC_gpt.txt', '228_BBC_gpt.txt', '109_BBC_gpt.txt', '65_BBC_gpt.txt', '237_BBC_gpt.txt', '158_BBC_gpt.txt', '86_BBC_gpt.txt', '219_BBC_gpt.txt', '165_BBC_gpt.txt', '156_BBC_gpt.txt', '47_BBC_gpt.txt', '44_BBC_gpt.txt', '179_BBC_gpt.txt', '211_BBC_gpt.txt', '62_BBC_gpt.txt', '26_BBC_gpt.txt', '170_BBC_gpt.txt', '129_BBC_gpt.txt', '155_BBC_gpt.txt', '148_BBC_gpt.txt', '3_BBC_gpt.txt', '19_BBC_gpt.txt', '112_BBC_gpt.txt', '57_BBC_gpt.txt', '27_BBC_gpt.txt', '41_BBC_gpt.txt', '196_BBC_gpt.txt', '157_BBC_gpt.txt', '209_BBC_gpt.txt', '7_BBC_gpt.txt', '226_BBC_gpt.txt', '6_BBC_gpt.txt', '51_BBC_gpt.txt', '147_BBC_gpt.txt', '221_BBC_gpt.txt', '37_BBC_gpt.txt', '153_BBC_gpt.txt', 

In [12]:
print(f'BBC GPT --> train_files: {len(X_bbc_GPT_train_files)}, train_labels: {len(Y_bbc_GPT_train)}, test_files: {len(X_bbc_GPT_test_files)}, test_labels: {len(Y_bbc_GPT_test)}, val_files: {len(X_bbc_GPT_val_files)}, val_labels: {len(Y_bbc_GPT_val)}')
print(f'Tech Crunch GPT --> train_files: {len(X_techCrunch_GPT_train_files)}, train_labels: {len(Y_techCrunch_GPT_train)}, test_files: {len(X_techCrunch_GPT_test_files)}, test_labels: {len(Y_techCrunch_GPT_test)}, val_files: {len(X_techCrunch_GPT_val_files)}, val_labels: {len(Y_techCrunch_GPT_val)}')
print(f'The Verge GPT--> train_files: {len(X_theVerge_GPT_train_files)}, train_labels: {len(Y_theVerge_GPT_train)}, test_files: {len(X_theVerge_GPT_test_files)}, test_labels: {len(Y_theVerge_GPT_test)}, val_files: {len(X_theVerge_GPT_val_files)}, val_labels: {len(Y_theVerge_GPT_val)}')

BBC GPT --> train_files: 142, train_labels: 142, test_files: 31, test_labels: 31, val_files: 30, val_labels: 30
Tech Crunch GPT --> train_files: 176, train_labels: 176, test_files: 38, test_labels: 38, val_files: 38, val_labels: 38
The Verge GPT--> train_files: 117, train_labels: 117, test_files: 26, test_labels: 26, val_files: 25, val_labels: 25


In [13]:
def load_data(file):
    content = None
    folder = 'BBC/GPT_BBC_news/' if 'BBC' in file else 'TechCrunch/GPT_TechCrunch_news/' if 'TechCrunch' in file else 'TheVerge/GPT_TheVerge_news/'
    with open(f'{folder}{file}', 'r', encoding="utf-8") as news:
        content = news.read()
    return content

X_bbc_gpt_train = np.array(list(map(load_data, X_bbc_GPT_train_files)))
X_bbc_gpt_test = np.array(list(map(load_data, X_bbc_GPT_test_files)))
X_bbc_gpt_val = np.array(list(map(load_data, X_bbc_GPT_val_files)))

X_techCrunch_gpt_train = np.array(list(map(load_data, X_techCrunch_GPT_train_files)))
X_techCrunch_gpt_test = np.array(list(map(load_data, X_techCrunch_GPT_test_files)))
X_techCrunch_gpt_val = np.array(list(map(load_data, X_techCrunch_GPT_val_files)))

X_theVerge_gpt_train = np.array(list(map(load_data, X_theVerge_GPT_train_files)))
X_theVerge_gpt_test = np.array(list(map(load_data, X_theVerge_GPT_test_files)))
X_theVerge_gpt_val = np.array(list(map(load_data, X_theVerge_GPT_val_files)))

## GPT Datasets

In [14]:
X_gpt_train = np.hstack((X_bbc_gpt_train, X_techCrunch_gpt_train, X_theVerge_gpt_train))
Y_gpt_train = np.ones(len(X_gpt_train))
print(f'GPT train: {len(X_gpt_train)}, {len(Y_gpt_train)}')

X_gpt_test = np.hstack((X_bbc_gpt_test, X_techCrunch_gpt_test, X_theVerge_gpt_test))
Y_gpt_test = np.ones(len(X_gpt_test))
print(f'GPT test: {len(X_gpt_test)}, {len(Y_gpt_test)}')

X_gpt_val = np.hstack((X_bbc_gpt_val, X_techCrunch_gpt_val, X_theVerge_gpt_val))
Y_gpt_val = np.ones(len(X_gpt_val))
print(f'GPT val: {len(X_gpt_val)}, {len(Y_gpt_val)}')

GPT train: 435, 435
GPT test: 95, 95
GPT val: 93, 93


In [16]:
## -------------------- Pickle --------------------

# gpt_data= [X_gpt_train, X_gpt_test, X_gpt_val]

# filename = '../dataset/news_pickles/train_test_val_GPT'
# file = open(filename, 'wb')
# pickle.dump(gpt_data, file)
# file.close()

## Load GPT PosTags Files

In [None]:
def load_data(file):
    content = None
    pos_tags = None
    folder = 'BBC/GPT_BBC_news/' if 'BBC' in file else 'TechCrunch/GPT_TechCrunch_news/' if 'TechCrunch' in file else 'TheVerge/GPT_TheVerge_news/'
    with open(f'{folder}{file}', 'r', encoding="utf-8") as news:
        content = news.read()

        tokens = nltk.word_tokenize(content)
        pos_tag_tuples = nltk.pos_tag(tokens, lang='eng')
        pos_tags = [tag[1] for tag in pos_tag_tuples]
        
    return pos_tags


X_bbc_GPT_posTags_train = np.array(list(map(load_data, X_bbc_GPT_train_files)))
X_bbc_GPT_posTags_test = np.array(list(map(load_data, X_bbc_GPT_test_files)))
X_bbc_GPT_posTags_val = np.array(list(map(load_data, X_bbc_GPT_val_files)))

X_techCrunch_GPT_posTags_train = np.array(list(map(load_data, X_techCrunch_GPT_train_files)))
X_techCrunch_GPT_posTags_test = np.array(list(map(load_data, X_techCrunch_GPT_test_files)))
X_techCrunch_GPT_posTags_val = np.array(list(map(load_data, X_techCrunch_GPT_val_files)))

X_theVerge_GPT_posTags_train = np.array(list(map(load_data, X_theVerge_GPT_train_files)))
X_theVerge_GPT_posTags_test = np.array(list(map(load_data, X_theVerge_GPT_test_files)))
X_theVerge_GPT_posTags_val = np.array(list(map(load_data, X_theVerge_GPT_val_files)))

In [18]:
X_gpt_train_posTags = np.hstack((X_bbc_GPT_posTags_train, X_techCrunch_GPT_posTags_train, X_theVerge_GPT_posTags_train))
print(f'GPT train posTags: {len(X_gpt_train_posTags)}')

X_gpt_test_posTags = np.hstack((X_bbc_GPT_posTags_test, X_techCrunch_GPT_posTags_test, X_theVerge_GPT_posTags_test))
print(f'GPT test posTags: {len(X_gpt_test_posTags)}')

X_gpt_val_posTags = np.hstack((X_bbc_GPT_posTags_val, X_techCrunch_GPT_posTags_val, X_theVerge_GPT_posTags_val))
print(f'GPT val posTags: {len(X_gpt_val_posTags)}')

GPT train posTags: 435
GPT test posTags: 95
GPT val posTags: 93


In [19]:
## -------------------- Pickle --------------------

# gpt_data_posTags= [X_gpt_train_posTags, X_gpt_test_posTags, X_gpt_val_posTags]

# filename = '../dataset/news_pickles/train_test_val_GPT_posTags'
# file = open(filename, 'wb')
# pickle.dump(gpt_data_posTags, file)
# file.close()

## Full Text Datasets

In [20]:
train_data = np.hstack((X_human_train, X_gpt_train))
train_labels = np.hstack((Y_human_train, Y_gpt_train))

test_data = np.hstack((X_human_test, X_gpt_test))
test_labels = np.hstack((Y_human_test, Y_gpt_test))

val_data = np.hstack((X_human_val, X_gpt_val))
val_labels = np.hstack((Y_human_val, Y_gpt_val))

train_data_posTag = np.hstack((X_human_train_posTags, X_gpt_train_posTags))
test_data_posTag = np.hstack((X_human_test_posTags, X_gpt_test_posTags))
val_data_posTag = np.hstack((X_human_val_posTags, X_gpt_val_posTags))

len(train_data), len(train_labels), len(train_data_posTag), len(test_data), len(test_labels), len(test_data_posTag), len(val_data), len(val_labels), len(val_data_posTag)

(870, 870, 870, 190, 190, 190, 186, 186, 186)

## Feature Extraction

In [21]:
def extract_posTag_features(text_posTag):
    adjective_tags = ['JJ', 'JJ$', 'JJ+JJ', 'JJR', 'JJR+CS', 'JJS', 'JJT', ]
    noun_tags = ['NN', 'NN$', 'NN+BEZ', 'NN+HVD', 'NN+HVZ', 'NN+IN', 'NN+MD', 'NN+NN', 'NNS', 'NNS$', 'NNS+MD', 'NP', 'NP$', 'NP+BEZ', 'NP+HVZ', 'NP+MD', 'NPS', 'NPS$', 'NR', 'NR$', 'NR+MD', 'NRS']
    pronoun_tags = ['PN', 'PN$', 'PN+BEZ', 'PN+HVD', 'PN+HVZ', 'PN+MD', 'PP$$', 'PPL', 'PPLS', 'PPO', 'PPS', 'PPS+BEZ', 'PPS+HVD', 'PPS+HVZ', 'PPS+MD', 'PPSS', 'PPSS+BEM', 'PPSS+BER', 'PPSS+BEZ', 'PPSS+BEZ*', 'PPSS+HV', 'PPSS+HVD', 'PPSS+MD', 'PPSS+VB']
    article_tags = ['AT']
    conjunction_tags = ['CC', 'CS']
    numeral_tags = ['CD', 'CD$', 'OD']
    preposition_tags = ['IN', 'IN+IN', 'IN+PPO']
    qualifier_tags = ['QL', 'QLP']
    adverb_tags = ['RB', 'RB$', 'RB+BEZ', 'RB+CS', 'RBR', 'RBR+CS', 'RBT', 'RN', 'RP', 'RP+IN']
    foreign_word = 'FW'
    w_classification_tags = ['WDT', 'WDT+BER', 'WDT+BER+PP', 'WDT+BEZ', 'WDT+DO+PPS', 'WDT+DOD', 'WDT+HVZ', 'WP$', 'WPO', 'WPS', 'WPS+BEZ', 'WPS+HVD', 'WPS+HVZ', 'WPS+MD', 'WQL', 'WRB','WRB+BER', 'WRB+BEZ', 'WRB+DO', 'WRB+DOD', 'WRB+DOD*', 'WRB+DOZ', 'WRB+IN', 'WRB+MD']
    modal_tags = ['MD', 'MD*', 'MD+HV', 'MD+PPSS', 'MD+TO']
    a_determiner_tags = ['ABL', 'ABN', 'ABX', 'AP', 'AP$', 'AP+AP']
    determiner_tags = ['DT', 'DT$', 'DT+BEZ', 'DT+MD', 'DTI', 'DTS', 'DTX']
    verb_tags = ['BE', 'BED', 'BED*', 'BEDZ', 'BEDZ*', 'BEG', 'BEM', 'BEM*', 'BEN', 'BER', 'BER*', 'BEZ', 'BEZ*', 'DO', 'DO*', 'DO+PPSS', 'DOD', 'DOD*', 'DOZ', 'DOZ*', 'HV','HV*', 'HV+TO', 'HVD', 'HVD*', 'HVG', 'HVN', 'HVZ', 'HVZ*', 'VB', 'VB+AT', 'VB+IN', 'VB+JJ', 'VB+PPO', 'VB+RP', 'VB+TO', 'VB+VB', 'VBD', 'VBG', 'VBG+TO', 'VBN','VBN+TO', 'VBZ']

    n_posTags = len(text_posTag)

    adjectives = 0
    nouns = 0
    pronouns = 0
    articles = 0
    conjunctions = 0
    numerals = 0
    prepositions = 0
    qualifiers = 0
    adverbs = 0
    foreign_words = 0
    w_classifications = 0
    modals = 0
    a_determiners = 0
    determiners = 0
    verbs = 0
    for tag in text_posTag:
        if tag in adjective_tags:
            adjectives += 1
        elif tag in noun_tags:
            nouns += 1
        elif tag in pronoun_tags:
            pronouns += 1
        elif tag in article_tags:
            articles += 1
        elif tag in conjunction_tags:
            conjunctions += 1
        elif tag in numeral_tags:
            numerals += 1
        elif tag in preposition_tags:
            prepositions += 1
        elif tag in qualifier_tags:
            qualifiers += 1
        elif tag in adverb_tags:
            adverbs += 1
        elif foreign_word in tag:
            foreign_words += 1
        elif tag in w_classification_tags:
            w_classifications += 1
        elif tag in modal_tags:
            modals += 1
        elif tag in a_determiner_tags:
            a_determiners += 1
        elif tag in determiner_tags:
            determiners += 1
        elif tag in verb_tags:
            verbs += 1

    adjectives_ratio = adjectives/n_posTags
    nouns_ratio = nouns/n_posTags
    pronouns_ratio = pronouns/n_posTags
    articles_ratio = articles/n_posTags
    conjunctions_ratio = conjunctions/n_posTags
    numerals_ratio = numerals/n_posTags
    prepositions_ratio = prepositions/n_posTags
    qualifiers_ratio = qualifiers/n_posTags
    adverbs_ratio = adverbs/n_posTags
    foreign_words_ratio = foreign_words/n_posTags
    w_classifications_ratio = w_classifications/n_posTags
    modals_ratio = modals/n_posTags
    a_determiners_ratio = a_determiners/n_posTags
    determiners_ratio = determiners/n_posTags
    verbs_ratio = verbs/n_posTags

    if adjectives_ratio > 1 or nouns_ratio > 1:
        print(f'Len_posTags: {n_posTags} --> Adjectives: {adjectives}, --> Nouns: {nouns}')

    return adjectives_ratio, nouns_ratio, conjunctions_ratio, prepositions_ratio, adverbs_ratio, w_classifications_ratio, modals_ratio, determiners_ratio, verbs_ratio


def extract_stopwords_and_ponctuation_ratio(text):

    stopwords_count = 0
    pontucation_count = 0

    stopwords_list = stopwords.words("english")
    tokens = nltk.word_tokenize(text)
    n_tokens = len(tokens)

    for token in tokens:
        if token in stopwords_list:
            stopwords_count += 1
        elif token in string.punctuation:
            pontucation_count += 1

    stopwords_ratio = stopwords_count/n_tokens
    pontucation_ratio = pontucation_count/n_tokens

    if stopwords_ratio > 1 or pontucation_ratio > 1:
        print(f'Len_tokens: {n_tokens} --> Stopwords: {stopwords_count}, --> Ponctuation: {pontucation_count}')

    return stopwords_ratio, pontucation_ratio


def extract_average_token_quantity_per_sentence(text):
    tokens_quantity = []
    sentences =  nltk.sent_tokenize(text)

    for sentence in sentences:
        sentence_tokens = nltk.word_tokenize(sentence)
        tokens_quantity.append(len(sentence_tokens))

    average_tokens_quantity_per_sentence = sum(tokens_quantity)//len(tokens_quantity)

    return average_tokens_quantity_per_sentence
    
    

def extract_average_token_length_per_text(text):

    tokens_length = []
    tokens = nltk.word_tokenize(text)

    for token in tokens:
        tokens_length.append(len(token))

    average_token_length = sum(tokens_length)//len(tokens_length)

    return average_token_length


def extract_sentence_quantity_per_text(text):
    sentences_quantity =  len(nltk.sent_tokenize(text))

    if sentences_quantity > 300:
        print(f'To many sentences: {text}')

    return sentences_quantity
    


def extract_features(text, text_posTag, label, dataset, min_word_size=1, max_word_size=43, min_n_sentences_per_text=3, max_n_sentences_per_text=116,  min_n_words_per_sentence=1, max_n_words_per_sentence=943 ):
    
    adjectives_ratio, nouns_ratio, conjunctions_ratio, prepositions_ratio, adverbs_ratio, w_classifications_ratio, modals_ratio, determiners_ratio, verbs_ratio = extract_posTag_features(text_posTag)

    stopwords_ratio, ponctuation_ratio = extract_stopwords_and_ponctuation_ratio(text)

    average_token_quantity_per_sentence = extract_average_token_quantity_per_sentence(text)

    average_token_length_per_text = extract_average_token_length_per_text(text)

    
    return [adjectives_ratio, nouns_ratio, conjunctions_ratio, prepositions_ratio, adverbs_ratio, w_classifications_ratio, modals_ratio, determiners_ratio, verbs_ratio, stopwords_ratio, ponctuation_ratio, average_token_quantity_per_sentence, average_token_length_per_text]

    

In [22]:
X_train = np.array([extract_features(text, train_data_posTag[index], train_labels[index], 'train') for index, text in enumerate(train_data)])
Y_train = train_labels

X_test = np.array([extract_features(text, test_data_posTag[index], test_labels[index], 'test') for index, text in enumerate(test_data)])
Y_test = test_labels

X_val = np.array([extract_features(text, val_data_posTag[index], val_labels[index], 'val') for index, text in enumerate(val_data)])
Y_val = val_labels

In [23]:
X_train.shape, Y_train.shape, X_test.shape, Y_test.shape, X_val.shape, Y_val.shape

((870, 13), (870,), (190, 13), (190,), (186, 13), (186,))

## Normalization

In [24]:
def normalizers(data):
    normalizers = []
    n_features = len(data[0])

    for index in range(n_features):
        features = np.array([[feature[index]] for feature in full_dataset])
        scaler = MinMaxScaler()
        scaler.fit(features)
        normalizers.append(scaler)

    return normalizers

def normalize_features(data, normalizers):
    normalized_features = []
    for index, feature in enumerate(data):
        normalized_features.append(normalizers[index].transform([[data[index]]])[0][0])

    return normalized_features


full_dataset = np.vstack((X_train, X_test, X_val))

feature_normalizers = normalizers(full_dataset)

X_train = np.array([normalize_features(data, feature_normalizers) for data in X_train])
X_test = np.array([normalize_features(data, feature_normalizers) for data in X_test])
X_val = np.array([normalize_features(data, feature_normalizers) for data in X_val])

## Save Extracted Features

In [26]:
def save_features_to_csv(data, labels, file_name, file_complement=''):
    with open(f'features_csv/{file_name}_features{file_complement}.csv', 'w', newline='', encoding="utf-8") as new_file:
        for index, sample in enumerate(data):
            sample = list(sample)
            sample.append(labels[index])
            writer = csv.writer(new_file)
            writer.writerow(sample)


In [27]:
## -------------------- Pickle --------------------

# processed_dataset= [X_train, Y_train, X_test, Y_test, X_val, Y_val]

# filename = 'processed_features'

# file = open(filename, 'wb')
# pickle.dump(processed_dataset, file)
# file.close()

In [28]:
## -------------------- CSV --------------------
# extension = '_features'

# save_features_to_csv(X_train, Y_train, 'extracted_features/train', extension)
# save_features_to_csv(X_test, Y_test, 'extracted_features/test', extension)
# save_features_to_csv(X_val, Y_val, 'extracted_features/val', extension)