In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import cohen_kappa_score

In [2]:
files = []
for dirname, _, filenames in os.walk('/kaggle/input/learning-agency-lab-automated-essay-scoring-2'):
    for filename in filenames:
        print(filename)
        files.append(pd.read_csv(os.path.join(dirname,filename)))

sample_submission.csv
train.csv
test.csv


In [3]:
subb = files[0]
subb.head()

Unnamed: 0,essay_id,score
0,000d118,3
1,000fe60,3
2,001ab80,4


In [4]:
df_train = files[1]
df_train.head()

Unnamed: 0,essay_id,full_text,score
0,000d118,Many people have car where they live. The thin...,3
1,000fe60,I am a scientist at NASA that is discussing th...,3
2,001ab80,People always wish they had the same technolog...,4
3,001bdc0,"We all heard about Venus, the planet without a...",4
4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3


In [5]:
df_train.shape

(17307, 3)

In [6]:
df_test = files[2]
df_test.head()

Unnamed: 0,essay_id,full_text
0,000d118,Many people have car where they live. The thin...
1,000fe60,I am a scientist at NASA that is discussing th...
2,001ab80,People always wish they had the same technolog...


In [7]:
df_test.shape

(3, 2)

In [8]:
def removeHTML(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)

In [9]:
def preprocessing(text):
    text = text.lower()
    text = removeHTML(text)
    text = re.sub("@\w+", '', text)
    text = re.sub("'\d+|\d+", '', text)  # eliminar números
    text = re.sub("http\w+", '', text)  # eliminar URL
    text = re.sub(r"\.+", ".", text)  # reemplazar puntos consecutivos con un solo punto
    text = re.sub(r"\,+", ",", text)  # reemplazar comas consecutivas con una sola coma
    text = text.strip()  # eliminar espacios al principio y al final
    return text

In [10]:
# Inicializar una lista vacía para almacenar el texto limpio
clean_corpus = []

# Iterar sobre cada texto en la columna 'full_text' del DataFrame train
for i in df_train['full_text']:
    # Aplicar la función de preprocesamiento de datos al texto
    cleaned_text = preprocessing(i)
    # Añadir el texto limpio a la lista clean_corpus
    clean_corpus.append(cleaned_text)

In [11]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(clean_corpus)
X.shape

(17307, 63901)

In [12]:
y = df_train["score"]
y.shape

(17307,)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((13845, 63901), (13845,), (3462, 63901), (3462,))

Model

In [14]:
model = LogisticRegression(max_iter = 600)
model.fit(X_train, y_train)

In [15]:
preds = model.predict(X_test)
preds

array([2, 3, 5, ..., 3, 2, 3])

In [16]:
preds.shape

(3462,)

Score

In [17]:
score = cohen_kappa_score(y_test, preds)
score

0.3169576389746147

In [18]:
Xt = vectorizer.transform(df_test["full_text"])
Xt.shape

(3, 63901)

In [19]:
preds2 = model.predict(Xt)
preds2

array([3, 3, 4])

In [20]:
predictions_df = pd.DataFrame(preds2, columns = ["score"])
predictions_df

Unnamed: 0,score
0,3
1,3
2,4


Submission

In [21]:
predictions_df_f = pd.concat([files[0].iloc[:, 0], predictions_df], axis=1)
predictions_df_f.to_csv('submission.csv', index=False)

In [22]:
sub = pd.read_csv("submission.csv")
sub.head()

Unnamed: 0,essay_id,score
0,000d118,3
1,000fe60,3
2,001ab80,4
