In [96]:
import re
import spacy

import numpy as np
import pandas as pd

from spacy.tokens import Doc, Token

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

In [43]:
Doc.set_extension("spelling_errors", default=[])

ValueError: [E090] Extension 'spelling_errors' already exists on Doc. To overwrite the existing extension, set `force=True` on `Doc.set_extension`.

In [80]:
essays_raw = pd.read_json('essays.json')

base_texts = essays_raw[:2]
essays     = essays_raw.dropna(subset = 'text') 

In [13]:
essays.columns

Index(['type', 'title', 'description', 'info', 'url', 'date', 'prompt', 'text',
       'final_score', 'criteria_scores', 'review', 'errors', 'comments'],
      dtype='object')

In [27]:
training_essays.head()

Unnamed: 0,title,text
2,[redação sem título],"No livro ''Os treze porquês"" do escritor Jay A..."
3,[redação sem título],Motivos de risada para alguns e de sofrimento ...
4,[redação sem título],O filme Karatê Kid ilustra bem o que é o bull ...
5,[redação sem título],A tempos atr s n o viamos muito se fala em bul...
6,[redação sem título],"No livro ''O Cortiço'', de Aluísio de Azevedo,..."


In [131]:
# Carregar o modelo spaCy para o português
nlp = spacy.load("pt_core_news_sm")

errors             = []
n_error            = []
tokens             = []
token_count        = []
token_nostop_count = []
lemmas             = []
lemmas_count       = []
pos                = []
pos_count          = []
sents              = []
sents_count        = []
ner                = []
ner_count          = []

n = 100#len(essays)
essays_short = essays[:n]

i=0
for essay in essays['text'][:n]:
    tokens.append(list(token.text for token in nlp(essay)))
    lemmas.append(list(token.lemma_ for token in nlp(essay)))
    pos.append(list(token.pos_ for token in nlp(essay)))
    sents.append(list(sent.text for sent in nlp(essay).sents))
    ner.append(list((ent.text, ent.label_) for ent in nlp(essay).ents))

    token_count.append(len(tokens[-1]))
    token_nostop_count.append(len([token for token in nlp(essay) if not token.is_stop]))
    lemmas_count.append(len(lemmas[-1]))
    pos_count.append(len(pos[-1]))
    sents_count.append(len(sents[-1]))
    ner_count.append(len(ner[-1]))

    i += 1

essays_short['tokens'] = tokens
essays_short['lemmas'] = lemmas
essays_short['pos'] = pos
essays_short['sents'] = sents
essays_short['ner'] = ner

essays_short['token_count'] = token_count
essays_short['tokens_nostop_count'] = token_nostop_count
essays_short['lemmas_count'] = lemmas_count
essays_short['pos_count'] = pos_count
essays_short['sents_count'] = sents_count
essays_short['ner_count'] = ner_count

"""
essays['tokens'] = tokens
essays['token_count'] = token_count
essays['tokens_nostop_count'] = tokens_nostop_count
essays['lemmas'] = lemmas
essays['pos'] = pos
essays['sents'] = sents
essays['ner'] = ner
"""

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  essays_short['tokens'] = tokens
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  essays_short['lemmas'] = lemmas
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  essays_short['pos'] = pos
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value 

"\nessays['tokens'] = tokens\nessays['token_count'] = token_count\nessays['tokens_nostop_count'] = tokens_nostop_count\nessays['lemmas'] = lemmas\nessays['pos'] = pos\nessays['sents'] = sents\nessays['ner'] = ner\n"

In [132]:
train_essays = essays_short[:int(0.8*len(essays_short))] 
train_scores = essays_short[['final_score', 'criteria_scores']][:int(0.8*len(essays_short))]

test_essays = essays_short[int(0.8*len(essays_short)):] 
test_scores = essays_short[['final_score', 'criteria_scores']][int(0.8*len(essays_short)):]

In [133]:
# Selecionar as features
features = ['token_count', 'tokens_nostop_count', 'lemmas_count', 'pos_count', 'sents_count', 'ner_count']#,'token']

# Definir variáveis de entrada (X) e variável de saída (y) para treinamento
X_train = train_essays[features]
y_train = train_scores['final_score']

# Definir variáveis de entrada (X) e variável de saída (y) para teste
X_test = test_essays[features]
y_test = test_scores['final_score']
"""
# Função para obter a média dos embeddings de palavras em uma frase
def get_sentence_embedding(sentence):
    tokens = nlp(sentence)
    return np.mean([token.vector for token in tokens if token.has_vector], axis=0)

# Aplicar a função para obter os embeddings das redações
X_train_text_embeddings = X_train['tokens'].apply(lambda tokens: get_sentence_embedding(" ".join(tokens)) if tokens else np.zeros(300))
X_test_text_embeddings = X_test['tokens'].apply(lambda tokens: get_sentence_embedding(" ".join(tokens)) if tokens else np.zeros(300))

print(X_train_text_embeddings)

# Converter os embeddings de palavras para um DataFrame
X_train_text = pd.DataFrame(X_train_text_embeddings.tolist(), columns=[f'emb_{i}' for i in range(96)])
X_test_text = pd.DataFrame(X_test_text_embeddings.tolist(), columns=[f'emb_{i}' for i in range(96)])

# Concatenar os embeddings de palavras com as outras features
X_train = pd.concat([X_train.drop(columns='tokens'), X_train_text], axis=1)
X_test = pd.concat([X_test.drop(columns='tokens'), X_test_text], axis=1)

# Substituir possíveis valores infinitos ou NaNs por 0
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)
"""
print(X_train.shape)
print(y_train)

# Treinar um modelo de regressão linear
model = LinearRegression()
model.fit(X_train, y_train)

# Fazer previsões no conjunto de teste
predictions = model.predict(X_test)

# Avaliar o desempenho do modelo
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


(80, 6)
2     300.0
3     300.0
4     400.0
5     200.0
6     850.0
      ...  
83    600.0
84    300.0
85    500.0
86    600.0
87    450.0
Name: final_score, Length: 80, dtype: object
Mean Squared Error: 46090.225792103
R-squared: -1.0973936651696476
