In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Fetch the NLTK resources used below: stop-word list, 'punkt' tokenizer models,
# and the WordNet data (plus its multilingual index) needed by the lemmatizer.
for recurso in ('omw-1.4', 'stopwords', 'punkt', 'wordnet'):
    nltk.download(recurso)

# Load the dataset. 'Data' is fed to the tokenizer as text and 'Etiqueta' is
# used as the classification target.
df = pd.read_excel('Data.xlsx')

datos = df['Data'].tolist()
etiquetas = df['Etiqueta'].tolist()

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Normalise every row: lowercase -> tokenize -> drop stop words -> lemmatize.
# Every row must yield exactly one processed string so that
# `datos_procesados` stays aligned with `df` and `etiquetas`.
datos_procesados = []
for texto in tqdm(datos):
    # Excel cells can come back as numbers (or other non-string objects);
    # coerce them to text so they can be tokenized like any other row.
    if not isinstance(texto, str):
        texto = str(texto)
    tokens = word_tokenize(texto.lower())
    tokens_filtrados = [token for token in tokens if token not in stop_words]
    tokens_lemmatized = [lemmatizer.lemmatize(token) for token in tokens_filtrados]
    datos_procesados.append(' '.join(tokens_lemmatized))

df['Columna_de_datos_procesados'] = datos_procesados

# TfidfVectorizer documents `stop_words` as 'english', a list, or None, so the
# set is converted to a (sorted, hence deterministic) list.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so IDF statistics include the test rows — confirm this
# is acceptable for the evaluation being reported.
vectorizer = TfidfVectorizer(stop_words=sorted(stop_words))
X = vectorizer.fit_transform(datos_procesados)

print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(X, etiquetas, test_size=0.2, random_state=42)

# A single fit is sufficient: each call to `fit` retrains from scratch on the
# same data with the same random_state, so repeating it yields the same model.
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train, y_train)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Fankrit\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Fankrit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Fankrit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Fankrit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Score the trained classifier on the held-out split.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the accuracy as a percentage with two decimal places.
print(
    "Precisión del modelo: {:.2f}%".format(100 * accuracy)
)