In [1]:
import numpy as np
import pandas as pd
import re
from nltk import download
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split



In [2]:
df = pd.read_csv("hf://datasets/PrkhrAwsti/Twitter_Sentiment_3M/twitter_dataset.csv")


In [3]:
def limpiar_tweet(text):
    # Convertir a minúsculas
    text = text.lower()
    
    # Eliminar URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    
    # Eliminar menciones (@usuario)
    text = re.sub(r'@\w+', '', text)
    
    # Eliminar caracteres que no sean letras, números, espacios, o hashtags
    text = re.sub(r'[^a-z0-9# ]', ' ', text)
    
    # Eliminar letras sueltas (que suelen ser ruido)
    text = re.sub(r'\s+[a-z]\s+', ' ', text)
    
    # Reducir espacios múltiples a uno
    text = re.sub(r'\s+', ' ', text).strip()

    return text.split()

In [4]:
download("wordnet")
lemmatizer = WordNetLemmatizer()
download("stopwords")
stop_words = set(stopwords.words("english"))

def lemmatizar_text(tweets, lemmatizer=lemmatizer, stop_words=stop_words):
    
    # Lematizar los tweets
    tokens = [lemmatizer.lemmatize(tweet) for tweet in tweets]
    
    # Eliminar stopwords
    tokens = [tweet for tweet in tokens if tweet not in stop_words]
    
    # Eliminar tweets con longitud menor o igual a 3 caracteres
    tokens = [tweet for tweet in tokens if len(tweet) > 3]
    
    return tokens


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\luisC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luisC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df=df.dropna()
df=df.drop_duplicates() 
df['tweet_length'] = df['tweet'].apply(lambda x: len(x.split()))
df = df[df['tweet_length'] > 0]
df = df[df['sentiment'] != 2]
df['sentiment'] = df['sentiment'].astype(int)
df


Unnamed: 0.1,Unnamed: 0,tweet,sentiment,tweet_length
0,0,is upset that he can't update his Facebook by ...,0,21
1,1,@Kenichan I dived many times for the ball. Man...,0,18
2,2,my whole body feels itchy and like its on fire,0,10
3,3,"@nationwideclass no, it's not behaving at all....",0,21
4,4,@Kwesidei not the whole crew,0,5
...,...,...,...,...
3138695,3138695,Q3 2020 EPS Estimates for Ball Co. $BLL Increa...,1,13
3138698,3138698,#stocks back from the recovery room: https://t...,1,17
3138699,3138699,RT @MacroCharts: Breadth – expanding last week...,1,23
3138701,3138701,$AAPL $QQQ Top may now be in. https://t.co/iNK...,1,8


In [6]:
df['pro_Tweet']=df['tweet'].apply(limpiar_tweet)
df

Unnamed: 0.1,Unnamed: 0,tweet,sentiment,tweet_length,pro_Tweet
0,0,is upset that he can't update his Facebook by ...,0,21,"[is, upset, that, he, can, update, his, facebo..."
1,1,@Kenichan I dived many times for the ball. Man...,0,18,"[dived, many, times, for, the, ball, managed, ..."
2,2,my whole body feels itchy and like its on fire,0,10,"[my, whole, body, feels, itchy, and, like, its..."
3,3,"@nationwideclass no, it's not behaving at all....",0,21,"[no, it, not, behaving, at, all, m, mad, why, ..."
4,4,@Kwesidei not the whole crew,0,5,"[not, the, whole, crew]"
...,...,...,...,...,...
3138695,3138695,Q3 2020 EPS Estimates for Ball Co. $BLL Increa...,1,13,"[q3, 2020, eps, estimates, for, ball, co, bll,..."
3138698,3138698,#stocks back from the recovery room: https://t...,1,17,"[#stocks, back, from, the, recovery, room, fai..."
3138699,3138699,RT @MacroCharts: Breadth – expanding last week...,1,23,"[rt, breadth, expanding, last, week, discussio..."
3138701,3138701,$AAPL $QQQ Top may now be in. https://t.co/iNK...,1,8,"[aapl, qqq, top, may, now, be, in]"


In [7]:
df['lem_tweet']=df['pro_Tweet'].apply(lemmatizar_text)
df

Unnamed: 0.1,Unnamed: 0,tweet,sentiment,tweet_length,pro_Tweet,lem_tweet
0,0,is upset that he can't update his Facebook by ...,0,21,"[is, upset, that, he, can, update, his, facebo...","[upset, update, facebook, texting, might, resu..."
1,1,@Kenichan I dived many times for the ball. Man...,0,18,"[dived, many, times, for, the, ball, managed, ...","[dived, many, time, ball, managed, save, rest,..."
2,2,my whole body feels itchy and like its on fire,0,10,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feel, itchy, like, fire]"
3,3,"@nationwideclass no, it's not behaving at all....",0,21,"[no, it, not, behaving, at, all, m, mad, why, ...",[behaving]
4,4,@Kwesidei not the whole crew,0,5,"[not, the, whole, crew]","[whole, crew]"
...,...,...,...,...,...,...
3138695,3138695,Q3 2020 EPS Estimates for Ball Co. $BLL Increa...,1,13,"[q3, 2020, eps, estimates, for, ball, co, bll,...","[2020, estimate, ball, increased, analyst, #st..."
3138698,3138698,#stocks back from the recovery room: https://t...,1,17,"[#stocks, back, from, the, recovery, room, fai...","[#stocks, back, recovery, room, fair, value, f..."
3138699,3138699,RT @MacroCharts: Breadth – expanding last week...,1,23,"[rt, breadth, expanding, last, week, discussio...","[breadth, expanding, last, week, discussion, h..."
3138701,3138701,$AAPL $QQQ Top may now be in. https://t.co/iNK...,1,8,"[aapl, qqq, top, may, now, be, in]",[aapl]


In [8]:
lista_tweets=df['lem_tweet']                                           
lista_tweets = [" ".join(tweet) for tweet in lista_tweets]
lista_tweets
X_train, X_test, y_train, y_test = train_test_split(lista_tweets, df["sentiment"], test_size = 0.2, random_state = 42)
vectorizer = TfidfVectorizer(max_features = 5000, max_df = 0.8, min_df = 5)
X_train_vectorized = vectorizer.fit_transform(X_train)