In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("data/essays.csv", encoding='latin-1')

In [5]:
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y


In [6]:
mapping = {'y': 1, 'n': 0}

df[['cEXT','cNEU','cAGR','cCON','cOPN']] = df[['cEXT','cNEU','cAGR','cCON','cOPN']].replace(mapping)

In [7]:
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",0,1,1,0,1
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",0,0,1,0,0
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,0,1,0,1,1
3,1997_568848.txt,I can't believe it! It's really happening! M...,1,0,1,1,0
4,1997_688160.txt,"Well, here I go with the good old stream of co...",1,0,1,0,1


In [9]:
#!python -m spacy download en

import spacy
import string
import re
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

def remove_numbers(text):
    return re.sub('[0-9]+', '', text)

def tokenize(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens

C:\Python39\python.exe: No module named spacy


ModuleNotFoundError: No module named 'spacy'

In [8]:
df['TEXT'] = df['TEXT'].str.lower()
df.head()

Unnamed: 0,AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"well, right now i just woke up from a mid-day ...",0,1,1,0,1
1,1997_605191.txt,"well, here we go with the stream of consciousn...",0,0,1,0,0
2,1997_687252.txt,an open keyboard and buttons to push. the thin...,0,1,0,1,1
3,1997_568848.txt,i can't believe it! it's really happening! m...,1,0,1,1,0
4,1997_688160.txt,"well, here i go with the good old stream of co...",1,0,1,0,1


In [9]:
df['TEXT'] = df['TEXT'].apply(remove_punctuation)
df.head()

Unnamed: 0,AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,well right now i just woke up from a midday na...,0,1,1,0,1
1,1997_605191.txt,well here we go with the stream of consciousne...,0,0,1,0,0
2,1997_687252.txt,an open keyboard and buttons to push the thing...,0,1,0,1,1
3,1997_568848.txt,i cant believe it its really happening my pu...,1,0,1,1,0
4,1997_688160.txt,well here i go with the good old stream of con...,1,0,1,0,1


In [10]:
df['TEXT'] = df['TEXT'].apply(remove_numbers)
df.head()

Unnamed: 0,AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,well right now i just woke up from a midday na...,0,1,1,0,1
1,1997_605191.txt,well here we go with the stream of consciousne...,0,0,1,0,0
2,1997_687252.txt,an open keyboard and buttons to push the thing...,0,1,0,1,1
3,1997_568848.txt,i cant believe it its really happening my pu...,1,0,1,1,0
4,1997_688160.txt,well here i go with the good old stream of con...,1,0,1,0,1


In [11]:
def remove_stopwords(tokens):
    filtered_tokens = [token for token in tokens if token.lower() not in STOP_WORDS]
    return filtered_tokens

def lemmatizer(text):
    document = nlp(text)
    lemmas = [token.lemma_ for token in document]
    return lemmas

In [None]:
#df['TEXT'] = df['TEXT'].apply(tokenize)
#df.head()

#df['TEXT'] = df['TEXT'].apply(lemmatizer)
#df.head()

In [None]:
#df['TEXT'] = df['TEXT'].apply(remove_stopwords)
#df.head()

In [12]:
def remove_empty(tokens):
    filtered_tokens = [token for token in tokens if token != " "]
    return filtered_tokens

In [None]:
#df['TEXT'] = df['TEXT'].apply(remove_empty)
#df.head()

In [13]:
def ngrams(sequence, n, **kwargs):
    ngrams = []
    sequence_length = len(sequence)
    for i in range(sequence_length):
        if sequence_length >= i + n:
            seq = (sequence[i])
            for k in range(n-1):
                seq = (seq, sequence[i+k+1])
            ngrams.append(seq)
    return ngrams

In [14]:
def clean_text(text):
    lines = text.split("\n")
    lines.pop(0)  
    return re.sub("[\t ]{2,}", " ", " ".join(lines))

In [15]:
def lemmatize_pipe(doc, max_len=100, ngram_min=1, ngram_max=2):
    document = nlp(doc)
    
    lemmas = []
    for token in document:
        if not token.is_stop:
            lemmas.append(token.lemma_)
    lemmas = lemmas[:max_len]
    len_lemmas = len(lemmas)

    ngrams = []
    for n in range(ngram_min, ngram_max + 1):
        for i in range(len_lemmas - n + 1):
            ngrams.append(tuple(lemmas[i:i+n]))

    return ngrams

In [16]:
df['LEMMA'] = df['TEXT'].apply(lemmatize_pipe)
df.head()

Unnamed: 0,AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,LEMMA
0,1997_504851.txt,well right now i just woke up from a midday na...,0,1,1,0,1,"[(right,), (wake,), (midday,), (nap,), (sort,)..."
1,1997_605191.txt,well here we go with the stream of consciousne...,0,0,1,0,0,"[(stream,), (consciousness,), (essay,), (thing..."
2,1997_687252.txt,an open keyboard and buttons to push the thing...,0,1,0,1,1,"[(open,), (keyboard,), (button,), (push,), (th..."
3,1997_568848.txt,i cant believe it its really happening my pu...,1,0,1,1,0,"[(not,), (believe,), ( ,), (happen,), ( ,), (p..."
4,1997_688160.txt,well here i go with the good old stream of con...,1,0,1,0,1,"[(good,), (old,), (stream,), (consciousness,),..."


In [17]:
#!pip install scikit-learn

from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(
    df[['LEMMA', 'cEXT','cNEU','cAGR','cCON','cOPN']], test_size=0.2, random_state=42
)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

count_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=3)
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=3)

In [19]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

X_train = count_vectorizer.fit_transform(df_train['LEMMA'])
y_train = df_train[['cEXT','cNEU','cAGR','cCON','cOPN']]

X_test = count_vectorizer.transform(df_test['LEMMA'])
y_test = df_test[['cEXT','cNEU','cAGR','cCON','cOPN']]

model = ClassifierChain(RandomForestClassifier())
model.fit(X_train, y_train)

prediction = model.predict(X_test)



In [20]:
from sklearn.metrics import f1_score

f1_macro = f1_score(y_test, prediction, average='macro')

In [21]:
f1_macro

0.5870900023982019

In [22]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

X_train2 = tfidf_vectorizer.fit_transform(df_train['LEMMA'])
y_train2 = df_train[['cEXT','cNEU','cAGR','cCON','cOPN']]

X_test2 = tfidf_vectorizer.transform(df_test['LEMMA'])
y_test2 = df_test[['cEXT','cNEU','cAGR','cCON','cOPN']]

model = ClassifierChain(RandomForestClassifier())
model.fit(X_train2, y_train2)

prediction = model.predict(X_test2)
f1_macro = f1_score(y_test2, prediction, average='macro')
f1_macro



0.5681092401941309

In [23]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

models = [OneVsRestClassifier(LogisticRegression()) for _ in range(5)]
i = 0
for j in ['cEXT','cNEU','cAGR','cCON','cOPN']:
    models[i].fit(X_train, y_train[j])
    i = i + 1

predictions = [model.predict(X_test) for model in models]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [24]:
import numpy as np

predictions = np.array(predictions)
predictions = predictions.T

predictions.shape

(494, 5)

In [25]:
f1_macro = f1_score(y_test, predictions, average='macro')
f1_macro

0.5542403697450148

In [26]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

models = [OneVsRestClassifier(LogisticRegression()) for _ in range(5)]
i = 0
for j in ['cEXT','cNEU','cAGR','cCON','cOPN']:
    models[i].fit(X_train2, y_train2[j])
    i = i + 1

predictions = [model.predict(X_test2) for model in models]
predictions = np.array(predictions)
predictions = predictions.T

predictions.shape
f1_macro = f1_score(y_test2, predictions, average='macro')
f1_macro

0.6025840261835613

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = OneVsRestClassifier(SVC())

classifier.fit(X_train_tfidf, y_train)

y_pred = classifier.predict(X_test_tfidf)
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_macro