## Import dependencies

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import string

In [2]:
# import kaggle API key
from google.colab import files

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"primegodmusic","key":"8c53c6426b91a19f449a59d2932a69f3"}'}

In [3]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [4]:
# download the kaggle dataset
!kaggle datasets download jp797498e/twitter-entity-sentiment-analysis

Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
License(s): CC0-1.0
Downloading twitter-entity-sentiment-analysis.zip to /content
  0% 0.00/1.99M [00:00<?, ?B/s]
100% 1.99M/1.99M [00:00<00:00, 397MB/s]


In [5]:
# unzip the dataset
!unzip twitter-entity-sentiment-analysis.zip -d /content/data

Archive:  twitter-entity-sentiment-analysis.zip
  inflating: /content/data/twitter_training.csv  
  inflating: /content/data/twitter_validation.csv  


In [6]:
# read the csv file
df = pd.read_csv('/content/data/twitter_training.csv')

nltk.download('stopwords') # Stopwords are common words (like 'the', 'a', 'is', 'in') that are typically filtered out from text before processing
                           # Purpose: Removes common, low-information words to improve text analysis efficiency and accuracy.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
# print first 5 rows form the dataframe
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [8]:
# change columns names
df = df.rename(columns={'Positive': 'sentiment', 'im getting on borderlands and i will murder you all ,': 'text'})

In [9]:
# check for null values
df.isnull().sum()

Unnamed: 0,0
2401,0
Borderlands,0
sentiment,0
text,686


In [10]:
# erase rows with null values
df = df.dropna()

In [11]:
df.isnull().sum()

Unnamed: 0,0
2401,0
Borderlands,0
sentiment,0
text,0


In [12]:
df['sentiment'].value_counts() # we have 4 classes in the target column

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Negative,22358
Positive,20654
Neutral,18108
Irrelevant,12875


### drop 'Irrelevant' and 'Neutral' from de data frame to leave only the positive and negative results

In [13]:
drop_irrelevant = df[df['sentiment'] == 'Irrelevant']
drop_neutral = df[df['sentiment'] == 'Neutral']

df = df.drop(drop_irrelevant.index)
df = df.drop(drop_neutral.index)

df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Negative,22358
Positive,20654


###Replace the positive and negative values ​​with the numerical values ​​1 and 0

In [14]:
df['sentiment'] = df['sentiment'].replace('Negative', 0)
df['sentiment'] = df['sentiment'].replace('Positive', 1)

  df['sentiment'] = df['sentiment'].replace('Positive', 1)


In [15]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
0,22358
1,20654


In [16]:
stop_words = set(stopwords.words('english')) # creates a Python set containing common English stopwords from the NLTK library

### Process the text to be used by TfidfVectorizer

In [17]:
def process_text(text):
  text = text.lower()
  text = ''.join([char for char in text if char not in string.punctuation])
  words = text.split()
  words = [word for word in words if word not in stop_words]
  return ' '.join(words)

In [18]:
# create new colum with the text processed and cleaned
df['text_cleaned'] = df['text'].apply(process_text)

print(df[['text', 'text_cleaned']].head())

                                                text  \
0  I am coming to the borders and I will kill you...   
1  im getting on borderlands and i will kill you ...   
2  im coming on borderlands and i will murder you...   
3  im getting on borderlands 2 and i will murder ...   
4  im getting into borderlands and i can murder y...   

                      text_cleaned  
0              coming borders kill  
1      im getting borderlands kill  
2     im coming borderlands murder  
3  im getting borderlands 2 murder  
4    im getting borderlands murder  


In [19]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text_cleaned'])
y = df['sentiment']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Datos de entrenamiento: {X_train.shape[0]} ejemplos")
print(f"Datos de prueba: {X_test.shape[0]} ejemplos")

Datos de entrenamiento: 34409 ejemplos
Datos de prueba: 8603 ejemplos


In [21]:
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

print("\nModelo entrenado exitosamente!")

predictions = model.predict(X_test)
precition = accuracy_score(y_test, predictions)
print(f"Precisión del modelo: {precition:.2f}")


Modelo entrenado exitosamente!
Precisión del modelo: 0.89


Testing

In [27]:
text = "I dont like this product"
text_cleaned = process_text(text)

vectorized_text = vectorizer.transform([text_cleaned])

prediction = model.predict(vectorized_text)[0]

In [28]:
if prediction == 1:
  print("The feeling is positive")
else:
  print("The feeling is negative")

The feeling is negative
