In [1]:
import pandas as pd



In [198]:
df = pd.read_csv("data/essays.csv", encoding='latin-1')

In [199]:
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y


In [200]:
mapping = {'y': 1, 'n': 0}

df[['cEXT','cNEU','cAGR','cCON','cOPN']] = df[['cEXT','cNEU','cAGR','cCON','cOPN']].replace(mapping)

In [201]:
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,1997_568848.txt,I can't believe it! It's really happening! M...,1,0,1,1,0
4,1997_688160.txt,"Well, here I go with the good old stream of co...",1,0,1,0,1


In [202]:
#!python -m spacy download en

import spacy
import string
import re
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

def tokenize(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens

In [203]:
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

def remove_numbers(text):
    return re.sub('[0-9]+', '', text)

In [204]:
df['TEXT'] = df['TEXT'].str.lower()
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"well, right now i just woke up from a mid-day ...",0,1,1,0,1
1,1997_605191.txt,"well, here we go with the stream of consciousn...",0,0,1,0,0
2,1997_687252.txt,an open keyboard and buttons to push. the thin...,0,1,0,1,1
3,1997_568848.txt,i can't believe it! it's really happening! m...,1,0,1,1,0
4,1997_688160.txt,"well, here i go with the good old stream of co...",1,0,1,0,1


In [205]:
df['TEXT'] = df['TEXT'].apply(remove_punctuation)
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,well right now i just woke up from a midday na...,0,1,1,0,1
1,1997_605191.txt,well here we go with the stream of consciousne...,0,0,1,0,0
2,1997_687252.txt,an open keyboard and buttons to push the thing...,0,1,0,1,1
3,1997_568848.txt,i cant believe it its really happening my pu...,1,0,1,1,0
4,1997_688160.txt,well here i go with the good old stream of con...,1,0,1,0,1


In [206]:
df['TEXT'] = df['TEXT'].apply(remove_numbers)
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,well right now i just woke up from a midday na...,0,1,1,0,1
1,1997_605191.txt,well here we go with the stream of consciousne...,0,0,1,0,0
2,1997_687252.txt,an open keyboard and buttons to push the thing...,0,1,0,1,1
3,1997_568848.txt,i cant believe it its really happening my pu...,1,0,1,1,0
4,1997_688160.txt,well here i go with the good old stream of con...,1,0,1,0,1


In [207]:
def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if token.lower() not in STOP_WORDS]
    return filtered_tokens

def lemmatizer(text):
    document = nlp(text)
    lemmas = [token.lemma_ for token in document]
    return lemmas

In [208]:
#df['TEXT'] = df['TEXT'].apply(tokenize)
#df.head()

#df['TEXT'] = df['TEXT'].apply(lemmatizer)
#df.head()

In [209]:
#df['TEXT'] = df['TEXT'].apply(remove_stopwords)
#df.head()

In [210]:
def remove_empty(tokens):
    filtered_tokens = [token for token in tokens if token != " "]
    return filtered_tokens

In [211]:
#df['TEXT'] = df['TEXT'].apply(remove_empty)
#df.head()

In [212]:
def ngrams(sequence, n, **kwargs):
    ngrams = []
    sequence_length = len(sequence)
    for i in range(sequence_length):
        if sequence_length >= i + n:
            seq = (sequence[i])
            for k in range(n-1):
                seq = (seq, sequence[i+k+1])
            ngrams.append(seq)
    return ngrams

In [213]:
def clean_text(text):
    lines = text.split("\n")
    lines.pop(0)  
    return re.sub("[\t ]{2,}", " ", " ".join(lines))

In [214]:
def lemmatize_pipe(doc, max_len=100, ngram_min=1, ngram_max=2):
    document = nlp(doc)
    
    lemmas = []
    for token in document:
        if not token.is_stop:
            lemmas.append(token.lemma_)
    lemmas = lemmas[:max_len]
    len_lemmas = len(lemmas)

    ngrams = []
    for n in range(ngram_min, ngram_max + 1):
        for i in range(len_lemmas - n + 1):
            ngrams.append(tuple(lemmas[i:i+n]))

    return ngrams

In [215]:
df['LEMMA'] = df['TEXT'].apply(lemmatize_pipe)
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,LEMMA
0,1997_504851.txt,well right now i just woke up from a midday na...,0,1,1,0,1,"[(right,), (wake,), (midday,), (nap,), (sort,)..."
1,1997_605191.txt,well here we go with the stream of consciousne...,0,0,1,0,0,"[(stream,), (consciousness,), (essay,), (thing..."
2,1997_687252.txt,an open keyboard and buttons to push the thing...,0,1,0,1,1,"[(open,), (keyboard,), (button,), (push,), (th..."
3,1997_568848.txt,i cant believe it its really happening my pu...,1,0,1,1,0,"[(not,), (believe,), ( ,), (happen,), ( ,), (p..."
4,1997_688160.txt,well here i go with the good old stream of con...,1,0,1,0,1,"[(good,), (old,), (stream,), (consciousness,),..."


In [221]:
#!pip install scikit-learn

from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(
    df[['LEMMA', 'cEXT','cNEU','cAGR','cCON','cOPN']], test_size=0.2, random_state=42
)

In [222]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

count_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=3)
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=3)

In [223]:
#countvect + classifierchain + rfc

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import f1_score

X_train = count_vectorizer.fit_transform(df_train['LEMMA'])
y_train = df_train[['cEXT','cNEU','cAGR','cCON','cOPN']]

X_test = count_vectorizer.transform(df_test['LEMMA'])
y_test = df_test[['cEXT','cNEU','cAGR','cCON','cOPN']]

model = ClassifierChain(RandomForestClassifier())
model.fit(X_train, y_train)

prediction = model.predict(X_test)

f1_macro = f1_score(y_test, prediction, average='macro')
f1_macro



0.5836999717986353

In [224]:
accuracy = accuracy_score(y_test, prediction)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1_micro = f1_score(y_test, prediction, average='micro')

NameError: name 'precision_score' is not defined

In [225]:
#countvect + classifierchain + lr

X_train = count_vectorizer.fit_transform(df_train['LEMMA'])
y_train = df_train[['cEXT','cNEU','cAGR','cCON','cOPN']]

X_test = count_vectorizer.transform(df_test['LEMMA'])
y_test = df_test[['cEXT','cNEU','cAGR','cCON','cOPN']]

model = ClassifierChain(LogisticRegression())
model.fit(X_train, y_train)

prediction = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [226]:
f1_macro = f1_score(y_test, prediction, average='macro')
f1_macro

0.5502009071245826

In [227]:
#countvect + classifierchain + svc

X_train = count_vectorizer.fit_transform(df_train['LEMMA'])
y_train = df_train[['cEXT','cNEU','cAGR','cCON','cOPN']]

X_test = count_vectorizer.transform(df_test['LEMMA'])
y_test = df_test[['cEXT','cNEU','cAGR','cCON','cOPN']]

model = ClassifierChain(SVC())
model.fit(X_train, y_train)

prediction = model.predict(X_test)



In [228]:
f1_macro = f1_score(y_test, prediction, average='macro')
f1_macro

0.5916427667697961

In [229]:
#tfidfvect + classifierchain + rfc

X_train2 = tfidf_vectorizer.fit_transform(df_train['LEMMA'])
y_train2 = df_train[['cEXT','cNEU','cAGR','cCON','cOPN']]

X_test2 = tfidf_vectorizer.transform(df_test['LEMMA'])
y_test2 = df_test[['cEXT','cNEU','cAGR','cCON','cOPN']]

model = ClassifierChain(RandomForestClassifier())
model.fit(X_train2, y_train2)

prediction = model.predict(X_test2)
f1_macro = f1_score(y_test2, prediction, average='macro')
f1_macro



0.5778181695572828

In [230]:
#tfidfvect + classifierchain + lr

X_train2 = tfidf_vectorizer.fit_transform(df_train['LEMMA'])
y_train2 = df_train[['cEXT','cNEU','cAGR','cCON','cOPN']]

X_test2 = tfidf_vectorizer.transform(df_test['LEMMA'])
y_test2 = df_test[['cEXT','cNEU','cAGR','cCON','cOPN']]

model = ClassifierChain(LogisticRegression())
model.fit(X_train2, y_train2)

prediction = model.predict(X_test2)
f1_macro = f1_score(y_test2, prediction, average='macro')
f1_macro



0.5754314437772138

In [231]:
#tfidfvect + classifierchain + svc

X_train2 = tfidf_vectorizer.fit_transform(df_train['LEMMA'])
y_train2 = df_train[['cEXT','cNEU','cAGR','cCON','cOPN']]

X_test2 = tfidf_vectorizer.transform(df_test['LEMMA'])
y_test2 = df_test[['cEXT','cNEU','cAGR','cCON','cOPN']]

model = ClassifierChain(SVC())
model.fit(X_train2, y_train2)

prediction = model.predict(X_test2)
f1_macro = f1_score(y_test2, prediction, average='macro')
f1_macro



0.5634240984669608

In [232]:
#ovr + rfc

from sklearn.multiclass import OneVsRestClassifier
import numpy as np

models = [OneVsRestClassifier(SVC()) for _ in range(5)]
i = 0
for j in ['cEXT','cNEU','cAGR','cCON','cOPN']:
    models[i].fit(X_train, y_train[j])
    i = i + 1

predictions = [model.predict(X_test) for model in models]

predictions = np.array(predictions)
predictions = predictions.T

In [233]:
f1_macro = f1_score(y_test, predictions, average='macro')
f1_macro

0.5949874765360368

In [234]:
f1_macro = f1_score(y_test['cEXT'], predictions2[0], average='macro')
f1_macro

0.4964913531674293

In [235]:
f1_macro = f1_score(y_test['cNEU'], predictions2[1], average='macro')
f1_macro

0.540438750712058

In [236]:
f1_macro = f1_score(y_test['cAGR'], predictions2[2], average='macro')
f1_macro

0.5323956121726823

In [237]:
f1_macro = f1_score(y_test['cCON'], predictions2[3], average='macro')
f1_macro

0.5377037876175934

In [238]:
f1_macro = f1_score(y_test['cOPN'], predictions2[4], average='macro')
f1_macro

0.5826791446715045

In [239]:
#ovr + lr

models = [OneVsRestClassifier(LogisticRegression()) for _ in range(5)]
i = 0
for j in ['cEXT','cNEU','cAGR','cCON','cOPN']:
    models[i].fit(X_train, y_train[j])
    i = i + 1

predictions = [model.predict(X_test) for model in models]

predictions = np.array(predictions)
predictions = predictions.T

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [240]:
f1_macro = f1_score(y_test, predictions, average='macro')
f1_macro

0.5529538386713592

In [241]:
f1_macro = f1_score(y_test['cEXT'], predictions2[0], average='macro')
f1_macro

0.4964913531674293

In [242]:
f1_macro = f1_score(y_test['cNEU'], predictions2[1], average='macro')
f1_macro

0.540438750712058

In [243]:
f1_macro = f1_score(y_test['cAGR'], predictions2[2], average='macro')
f1_macro

0.5323956121726823

In [244]:
f1_macro = f1_score(y_test['cCON'], predictions2[3], average='macro')
f1_macro

0.5377037876175934

In [245]:
f1_macro = f1_score(y_test['cOPN'], predictions2[4], average='macro')
f1_macro

0.5826791446715045

In [246]:
#ovr + svc

models = [OneVsRestClassifier(RandomForestClassifier()) for _ in range(5)]
i = 0
for j in ['cEXT','cNEU','cAGR','cCON','cOPN']:
    models[i].fit(X_train, y_train[j])
    i = i + 1

predictions = [model.predict(X_test) for model in models]

predictions = np.array(predictions)
predictions = predictions.T

In [247]:
f1_macro = f1_score(y_test, predictions, average='macro')
f1_macro

0.5924001424528655

In [248]:
#f1_macro = f1_score(y_test['cEXT'], predictions[0], average='macro')
predictions2 = np.array(predictions)
predictions2 = predictions2.T

f1_macro = f1_score(y_test['cEXT'], predictions2[0], average='macro')
f1_macro

0.5178105376046785

In [249]:
f1_macro = f1_score(y_test['cNEU'], predictions2[1], average='macro')
f1_macro

0.5439943876232323

In [250]:
f1_macro = f1_score(y_test['cAGR'], predictions2[2], average='macro')
f1_macro

0.5181342555644232

In [251]:
f1_macro = f1_score(y_test['cCON'], predictions2[3], average='macro')
f1_macro

0.5760039752450647

In [252]:
f1_macro = f1_score(y_test['cOPN'], predictions2[4], average='macro')
f1_macro

0.5711858479893037

In [253]:
#tfidf + moc + rfc

from sklearn.multioutput import MultiOutputClassifier

X_train, X_test, y_train, y_test = train_test_split(df['TEXT'], df[['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = MultiOutputClassifier(RandomForestClassifier())

classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_macro

0.5928455628300527

In [254]:
#countvect + moc + rfc

vectorizer = CountVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = MultiOutputClassifier(RandomForestClassifier())

classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_macro

0.5898811313585257

In [255]:
#tfidf + moc + lr

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = MultiOutputClassifier(LogisticRegression())

classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_macro

0.6002012426460321

In [256]:
#countvect + moc + lr

vectorizer = CountVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = MultiOutputClassifier(LogisticRegression())

classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_macro

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.5607392780124458

In [257]:
#tfidf + moc + svc

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = MultiOutputClassifier(SVC())

classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_macro

0.5954924930854043

In [258]:
#countvect + moc + svc

vectorizer = CountVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = MultiOutputClassifier(SVC())

classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_macro

0.5672934559857881

In [259]:
# this is to check how labels are correlated !!!

# highly correlated - 
X_train, X_test, y_train, y_test = train_test_split(df['TEXT'], df[['cNEU', 'cAGR']], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = MultiOutputClassifier(LogisticRegression())

classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_macro

0.6114914971860461

In [260]:
# this is to check how labels are correlated !!!

# less correlated - 
X_train, X_test, y_train, y_test = train_test_split(df['TEXT'], df[['cOPN', 'cAGR']], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

classifier = MultiOutputClassifier(LogisticRegression())

classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)

f1_macro = f1_score(y_test, y_pred, average='macro')
f1_macro

0.6155704352474896

In [None]:
from transformers import GPT2Tokenizer, TFGPT2Model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = TFGPT2Model.from_pretrained('gpt2')

input_ids = []
for text in df_train['LEMMA']:
    encoded = tokenizer.encode(text, truncation=True, max_length=200, padding='max_length')
    input_ids.append(encoded)
input_ids = tf.constant(input_ids)

input_layer = Input(shape=(200,), dtype=tf.int32)
gpt_output = gpt_model(input_layer)[0]
output_layer = Dense(5, activation='sigmoid')(gpt_output[:, -1, :])
model = Model(inputs=input_layer, outputs=output_layer)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(input_ids, df_train[['cEXT','cNEU','cAGR','cCON','cOPN']], epochs=10, batch_size=32)

encoded_test = tokenizer.encode(df_test['LEMMA'], truncation=True, max_length=200, padding='max_length')
test_input_ids = tf.constant(encoded_test)
model.evaluate(test_input_ids, df_test[['cEXT','cNEU','cAGR','cCON','cOPN']])

new_data = X_test
encoded_new = tokenizer.encode(new_data, truncation=True, max_length=200, padding='max_length')
new_input_ids = tf.constant(encoded_new)
predictions = model.predict(new_input_ids)