# Resultados parciales y visualizaciones estáticas

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

In [12]:
# Load the data: read the train and test CSV files (paths are relative to
# the working directory), train first, then test.
train_dataset, test_dataset = (
    pd.read_csv(csv_name) for csv_name in ("clean_train.csv", "clean_test.csv")
)

# Preview the first three rows of each frame, one after the other.
print(train_dataset.head(3), test_dataset.head(3), sep="\n")

       essay_id  discourse_id discourse_type  \
0  007ACE74B050  0013cc385424           Lead   
1  007ACE74B050  9704a709b505       Position   
2  007ACE74B050  c22adee811b6          Claim   

                                      discourse_text  \
0  Hi, i'm Isaac, i'm going to be writing about h...   
1  On my perspective, I think that the face is a ...   
2  I think that the face is a natural landform be...   

                                discourse_text_clean  \
0  hi i'm isaac i'm going to be writing about how...   
1  on my perspective i think that the face is a n...   
2  i think that the face is a natural landform be...   

                                          essay_text  \
0  Hi, i'm Isaac, i'm going to be writing about h...   
1  Hi, i'm Isaac, i'm going to be writing about h...   
2  Hi, i'm Isaac, i'm going to be writing about h...   

                                    essay_text_clean discourse_effectiveness  \
0  hi i'm isaac i'm going to be writing about how...

## SVM + TF-IDF

In [14]:
# Column holding the pre-processed text. The vectorizer is fitted on this
# column, so the text used at prediction time must come from the same column;
# otherwise train and test features are built from differently prepared text.
TEXT_COLUMN = "discourse_text_clean"

X_train = train_dataset[TEXT_COLUMN]
y_train = train_dataset["label"]

# Prefer the cleaned test text; fall back to the raw text only if the test
# frame has no cleaned column (its schema is not checked anywhere earlier).
test_text_column = (
    TEXT_COLUMN if TEXT_COLUMN in test_dataset.columns else "discourse_text"
)
# Empty fields are read back from CSV as NaN, which TfidfVectorizer rejects,
# so missing test texts are treated as empty documents.
X_test = test_dataset[test_text_column].fillna("")

# TF-IDF vectorization: unigrams and bigrams, sublinear term frequency,
# English stop words removed, terms seen in fewer than 5 documents dropped.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='utf-8',
    ngram_range=(1,2),
    stop_words='english'
)

# Fit the vocabulary/IDF weights on the training text only, then reuse them
# to transform the test text.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Linear SVM classifier; fixed seed for reproducible fits.
svm = LinearSVC(random_state=42)
svm.fit(X_train_tfidf, y_train)

# Store the predictions alongside the test rows.
test_predictions = svm.predict(X_test_tfidf)
test_dataset["predicted_label"] = test_predictions

print(test_dataset[["discourse_text", "predicted_label"]].head())

                                      discourse_text  predicted_label
0  Making choices in life can be very difficult. ...                1
1  Seeking multiple opinions can help a person ma...                1
2                     it can decrease stress levels                 1
3             a great chance to learn something new                 2
4               can be very helpful and beneficial.                 1
