<a href="https://colab.research.google.com/github/CD-AC/MLEnginner_NPL/blob/main/MLEnginner_NPL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Dependencies

In [11]:
import nltk
nltk.download('punkt')
import random
from nltk.corpus import twitter_samples
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Descargar datos de entrenamiento de NLTK

In [12]:
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Cargar datos de entrenamiento y preprocesamiento

In [13]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Función para limpiar y preprocesar un tweet

In [14]:
def preprocess_tweet(tweet):
    tokens = word_tokenize(tweet.lower())  # Tokenización y conversión a minúsculas
    cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum()]  # Lematización y eliminación de signos de puntuación
    cleaned_tokens = [token for token in cleaned_tokens if token not in stop_words]  # Eliminación de palabras vacías
    return ' '.join(cleaned_tokens)

# Preprocesamiento de tweets positivos y negativos

In [15]:
positive_tweets_preprocessed = [preprocess_tweet(tweet) for tweet in positive_tweets]
negative_tweets_preprocessed = [preprocess_tweet(tweet) for tweet in negative_tweets]

# Crear corpus de texto y etiquetas

In [16]:
corpus = positive_tweets_preprocessed + negative_tweets_preprocessed
labels = ['Positive'] * len(positive_tweets_preprocessed) + ['Negative'] * len(negative_tweets_preprocessed)


# Extracción de características usando CountVectorizer

In [17]:
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(corpus).toarray()

# División de datos en conjuntos de entrenamiento y prueba

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

# Entrenamiento del modelo de clasificación SVM

In [19]:
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

# Predicción y evaluación del modelo

In [20]:
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    Negative       0.72      0.78      0.75       988
    Positive       0.77      0.70      0.73      1012

    accuracy                           0.74      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.74      0.74      0.74      2000

