In [14]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [38]:
# Load the 20 Newsgroups corpus (training subset, which is the loader's default).
# NOTE: despite its name, `dataframe` is not a pandas DataFrame; it is the
# dict-like container returned by scikit-learn and is indexed with the
# 'data' and 'target' keys further down.
dataframe = fetch_20newsgroups(subset='train')

In [9]:
# Separate the documents from their class labels.
X, y = dataframe['data'], dataframe['target']

# Hold out 25% of the samples for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


In [23]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [22]:
# Bag-of-words features: English stop words removed, at most 10000 terms kept.
vectorizer = CountVectorizer(max_features=10000, stop_words='english')

# Candidate models, all with their default hyper-parameters, stored as
# (name, estimator) pairs in the order they will be evaluated.
# NOTE(review): none of the estimators is given a random_state, so the
# stochastic ones may score slightly differently between runs - confirm
# whether that is acceptable for this comparison.
classifiers = list({
    'naive_bayes': MultinomialNB(),
    'logistic_regression': LogisticRegression(),
    'support_vector_classifier': SVC(),
    'tree': DecisionTreeClassifier(),
    'neighbors': KNeighborsClassifier(),
    'ridge_classifier': RidgeClassifier(),
    'SGD_classifier': SGDClassifier(),
}.items())

In [25]:
# Build one text-classification pipeline per candidate model.
#
# Each pipeline receives its own unfitted copy of the vectorizer. Passing the
# same `vectorizer` object to every pipeline would make them all share one
# piece of mutable state: every `fit` call below would refit it, and so would
# any later cell that calls `vectorizer.fit(...)` directly, silently changing
# the transformer inside pipelines that were already trained.
pipelines = [
    Pipeline([
        ('vect', clone(vectorizer)),
        ('clf', estimator),
    ])
    for _name, estimator in classifiers
]

# Train every pipeline on the raw training documents.
for pl in pipelines:
    pl.fit(X_train, y_train)
    

In [26]:
# Predict the held-out documents with every fitted pipeline.
predictions = [fitted.predict(X_test) for fitted in pipelines]

# Map each classifier name to its test accuracy. The pairing is positional,
# so `classifiers` must still be the list the pipelines were built from.
accuracy = {
    name: accuracy_score(y_test, predicted)
    for (name, _estimator), predicted in zip(classifiers, predictions)
}

accuracy

{'ridge_classifier': 0.7242841993637328, 'SGD_classifier': 0.8275008837044893}

In [21]:
# Tabulate the scores, best method first.
(
    pd.DataFrame(list(accuracy.items()), columns=["method", "accuracy"])
    .sort_values(by="accuracy", ascending=False)
)

Unnamed: 0,method,accuracy
1,logistic_regression,0.87416
0,naive_bayes,0.84058
3,tree,0.653588
4,neighbors,0.479675
2,support_vector_classifier,0.086603


In [24]:
# Restrict the comparison to the two additional linear models.
# NOTE: this rebinds `classifiers`, replacing the longer list defined earlier;
# cells that read `classifiers` see whichever definition ran last.
classifiers = list(zip(
    ('ridge_classifier', 'SGD_classifier'),
    (RidgeClassifier(), SGDClassifier()),
))

In [27]:
# Same ranking table as before, rebuilt from the current `accuracy` dict.
pd.DataFrame(
    {
        "method": list(accuracy.keys()),
        "accuracy": list(accuracy.values()),
    }
).sort_values(by="accuracy", ascending=False)

Unnamed: 0,method,accuracy
1,SGD_classifier,0.827501
0,ridge_classifier,0.724284


In [28]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV

In [30]:
# Hyper-parameter candidates for SGDClassifier: every listed loss is combined
# with every listed penalty, and the search samples from those combinations.
test_params = {
    'loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'penalty': ['l1', 'l2', 'elasticnet'],
}

# Learn the vocabulary and vectorize the training documents in a single pass
# instead of a separate fit() followed by transform() over the same corpus.
Xvec_train = vectorizer.fit_transform(X_train)

# Seed the estimator as well as the search: SGD shuffles the training data, so
# with an unseeded classifier the selected parameters can change between runs
# even though the search itself is seeded.
clf = SGDClassifier(random_state=42)

search = HalvingRandomSearchCV(clf, test_params, random_state=42)
search.fit(Xvec_train, y_train)
search.best_params_



{'penalty': 'l2', 'loss': 'perceptron'}

In [31]:
# The search refits the winning configuration on the whole training matrix by
# default (refit=True), so reuse that fitted model instead of training an
# identical configuration a second time.
best_clf = search.best_estimator_

# Vectorize the held-out documents with the vocabulary learned on the training set.
Xvec_test = vectorizer.transform(X_test)

prediction = best_clf.predict(Xvec_test)

# accuracy_score expects (y_true, y_pred); keep that order, as in the other cells.
print(f"Accuracy: {accuracy_score(y_test, prediction)}")

Accuracy: 0.8342170378225522


In [37]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid for LogisticRegression: every solver is tried with and
# without L2 regularisation.
test_params = {
    'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
    'penalty': ['l2', None],
}

# Learn the vocabulary and vectorize the training documents in a single pass.
Xvec_train = vectorizer.fit_transform(X_train)

# 'sag' and 'saga' shuffle the data, so seed the estimator to make the grid
# search repeatable.
# NOTE(review): max_iter is left at its default; check the fit logs for
# convergence warnings before trusting the solver ranking.
clf = LogisticRegression(random_state=42)

search = GridSearchCV(clf, test_params)
search.fit(Xvec_train, y_train)
search.best_params_



{'penalty': 'l2', 'solver': 'lbfgs'}

In [10]:
# Baseline model: bag-of-words counts (English stop words removed, at most
# 10000 terms) feeding a multinomial Naive Bayes classifier.
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
clf = MultinomialNB()

# Chain both steps so raw text goes in and class predictions come out.
pipeline = Pipeline(steps=[('vect', vectorizer), ('clf', clf)])

# Fit on the training split only.
pipeline.fit(X_train, y_train)

# Evaluate on the held-out split: overall accuracy, then the per-class report.
predictions = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

Accuracy: 0.8405797101449275
              precision    recall  f1-score   support

           0       0.89      0.94      0.92       117
           1       0.62      0.83      0.71       138
           2       1.00      0.03      0.05       142
           3       0.53      0.81      0.64       145
           4       0.77      0.85      0.81       157
           5       0.78      0.85      0.82       136
           6       0.72      0.82      0.77       137
           7       0.88      0.90      0.89       164
           8       0.89      0.94      0.91       155
           9       0.91      0.96      0.93       141
          10       0.98      0.93      0.95       136
          11       1.00      0.88      0.94       156
          12       0.86      0.83      0.84       144
          13       0.93      0.93      0.93       145
          14       0.95      0.91      0.93       152
          15       0.93      0.91      0.92       150
          16       0.93      0.94      0.93       15

In [12]:

# Same bag-of-words features as the other experiments, this time paired with
# a logistic regression classifier left at its default settings.
vectorizer = CountVectorizer(stop_words='english', max_features=10000)
clf = LogisticRegression()

# Assemble the two-step pipeline and train it on the training split;
# fit() returns the pipeline itself, so the calls can be chained.
pipeline = Pipeline(
    steps=[
        ('vect', vectorizer),
        ('clf', clf),
    ]
).fit(X_train, y_train)

# Report accuracy and the per-class metrics on the held-out split.
predictions = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

Accuracy: 0.8741604807352421
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       117
           1       0.71      0.77      0.74       138
           2       0.78      0.82      0.80       142
           3       0.74      0.77      0.76       145
           4       0.87      0.78      0.83       157
           5       0.81      0.82      0.81       136
           6       0.83      0.83      0.83       137
           7       0.87      0.90      0.88       164
           8       0.90      0.92      0.91       155
           9       0.87      0.94      0.90       141
          10       0.94      0.92      0.93       136
          11       0.99      0.93      0.96       156
          12       0.79      0.79      0.79       144
          13       0.91      0.94      0.93       145
          14       0.99      0.91      0.95       152
          15       0.90      0.93      0.92       150
          16       0.92      0.95      0.93       15