In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pandas as pd

from sklearn.svm import LinearSVC


In [2]:
categorias = ['sci.space', 'rec.sport.baseball', 'comp.graphics', 'talk.politics.misc']
news = fetch_20newsgroups(subset='train', categories=categorias, remove=('headers', 'footers', 'quotes'))


In [3]:
df_news = pd.DataFrame({"text": news.data, "target": news.target})
print(df_news)

                                                   text  target
0                      I thought that was Sandy Koufax.       1
1     \nAnd the religious right worships engines, sm...       3
2     \nHow can a witness tell that someone in a bur...       3
3     \n\n\n\nYes, long before Star Trek.  Before Ei...       2
4     \n\nIt depends.  If you can get your old veter...       1
...                                                 ...     ...
2234  \nGreat! I'll visit the National Air and Space...       2
2235  \nIf Comet Shoemaker-Levy 1993e is in Jovian o...       2
2236  \nThere was a recession, and none of the poten...       2
2237  THE WHITE HOUSE\n\n\n                  Office ...       3
2238  G'day all,\n\nCan anybody point me at a utilit...       0

[2239 rows x 2 columns]


In [4]:
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.3, random_state=42)

In [5]:
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [6]:
model = LinearSVC()
model.fit(X_train_tfidf, y_train)

In [7]:
y_pred = model.predict(X_test_tfidf)
reporte = classification_report(y_test, y_pred, target_names=categorias, output_dict=True)

In [8]:
dataframe=pd.DataFrame(reporte).transpose()

dataframe

Unnamed: 0,precision,recall,f1-score,support
sci.space,0.898734,0.8875,0.893082,160.0
rec.sport.baseball,0.829384,0.956284,0.888325,183.0
comp.graphics,0.920904,0.791262,0.851175,206.0
talk.politics.misc,0.857143,0.878049,0.86747,123.0
accuracy,0.875,0.875,0.875,0.875
macro avg,0.876541,0.878274,0.875013,672.0
weighted avg,0.879032,0.875,0.874252,672.0


In [11]:
#from google.colab import drive
#drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
#ruta = '/content/drive/MyDrive/MeIA/PLN/noticias_texto.csv'

In [19]:
# Esta ruta apunta a un lugar específico en Google Drive
ruta = 'noticias_texto.csv'
df = pd.read_csv(ruta)

In [13]:
df = pd.read_csv(ruta)
df.head()

Unnamed: 0,text,label
0,El presidente anunció una nueva reforma. El he...,política mexicana
1,La UNAM desarrolló una vacuna nasal. Esto gene...,ciencia y salud
2,Pumas empató en el último minuto. Esto generó ...,deportes (fútbol)
3,Capturan a presunto líder de cártel en Sinaloa...,seguridad y justicia
4,Belinda lanza su nuevo sencillo. Esto generó d...,espectáculos


In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [15]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stopwords_es = stopwords.words('spanish')
vectorizer = TfidfVectorizer(max_features=3000, stop_words=stopwords_es)

#vectorizer = TfidfVectorizer(max_features=3000, stop_words='spanish')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [16]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_tfidf, y_train)

# Evaluación
y_pred = model.predict(X_test_tfidf)
reporte = classification_report(y_test, y_pred, output_dict=True)

In [17]:
dataframe=pd.DataFrame(reporte).transpose()

dataframe

Unnamed: 0,precision,recall,f1-score,support
ciencia y salud,1.0,1.0,1.0,44.0
deportes (fútbol),1.0,1.0,1.0,38.0
espectáculos,1.0,1.0,1.0,51.0
política mexicana,1.0,1.0,1.0,34.0
seguridad y justicia,1.0,1.0,1.0,33.0
accuracy,1.0,1.0,1.0,1.0
macro avg,1.0,1.0,1.0,200.0
weighted avg,1.0,1.0,1.0,200.0
