# Sentiment Analysis Notebook

Sentiment analysis notebook with a simple RNN by Basel.

## 1. Includes

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re as reg
import string
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Download the NLTK data packages, one call per package, in the original order.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split, cross_val_score


# Eval
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report



# Notebook-wide display settings.
plt.style.use('ggplot')                      # plot theme for every figure
pd.set_option('display.max_colwidth', 100)   # show up to 100 characters per cell

## 2. Dataset info

In [None]:

# CSV file name for each split of the Hugging Face dataset; only the train
# split is loaded in this cell.
splits = {'train': 'train_df.csv', 'validation': 'val_df.csv', 'test': 'test_df.csv'}
train_csv = "hf://datasets/Sp1786/multiclass-sentiment-analysis-dataset/" + splits["train"]
df = pd.read_csv(train_csv)

print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
df.head()

In [None]:
# Missing-value count for every column.
print("nulls:")
df.isna().sum()

In [None]:
# Preview rows whose text matches the pattern 'https?' (i.e. contains "http").
print("has links?")
link_mask = df['text'].str.contains('https?')
df.loc[link_mask].head()

In [None]:
# Preview rows whose text contains at least one character outside the ASCII range.
print("all english letters?")
non_ascii_mask = df['text'].str.contains('[^\x00-\x7F]')
df.loc[non_ascii_mask].head()

## 3. Cleaning and stemming

In [None]:
def cleaner(text):
    """Normalize a raw text entry to lowercase, letters-only, single-spaced text.

    Steps, in order: lowercase; remove URLs (``http...`` / ``www...``);
    remove ``@mentions``; remove every character that is not an ASCII letter
    or whitespace; collapse whitespace runs and trim both ends.

    Args:
        text: Raw text. Any non-string value (for example the float NaN that
            pandas uses for a missing cell, or None) is treated as empty text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # A missing cell reaches .apply() as float NaN, which has no .lower();
    # return empty text instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # 'http\S+' also matches https links, so no separate https alternative is needed.
    text = reg.sub(r'http\S+|www\S+', '', text)
    text = reg.sub(r'@\w+', '', text)
    # Digits, punctuation and non-ASCII characters are deleted, not replaced by a space.
    text = reg.sub(r'[^a-zA-Z\s]', '', text)
    return reg.sub(r'\s+', ' ', text).strip()
# Store the cleaned text in a new column, then preview any rows where a
# non-ASCII character is still present after cleaning.
df['cleaned_text'] = df['text'].apply(cleaner)
print("any stragglers?")
straggler_mask = df['cleaned_text'].str.contains('[^\x00-\x7F]')
df.loc[straggler_mask].head()


In [None]:
# Built once when this cell runs. Re-reading the stop-word corpus and creating
# a new stemmer inside the function would repeat that work for every row the
# function is applied to.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocesser(text):
    """Tokenize text, drop English stop words, and Porter-stem what remains.

    Args:
        text: The string to process.

    Returns:
        The stemmed, non-stop-word tokens joined by single spaces.
    """
    tokens = word_tokenize(text)
    # The stop-word test is an exact, case-sensitive match against the NLTK list.
    return ' '.join(
        _STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS
    )

# Run tokenizing, stop-word removal and stemming over every cleaned row,
# store the result as a new column, and preview it.
processed_column = df['cleaned_text'].apply(preprocesser)
df['processed_text'] = processed_column
df['processed_text'].head()

## 4. Bag-of-words vectorizing and stuff

In [None]:
# Bag of Words features via CountVectorizer, with the vocabulary capped at
# 5000 terms.
# NOTE(review): the vectorizer is fitted on the full frame before the
# train/test split, so the vocabulary is also learned from rows that later end
# up in the test set — confirm this is acceptable.
count_vectorizer = CountVectorizer(max_features=5000)
X = X_count = count_vectorizer.fit_transform(df['processed_text'])
y = df['label']

print("cbow shape:", X_count.shape)

## 5. Divorcing the dataset

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

## 6. IT'S ALIVE, sike not yet gotta pad


In [None]:


# pad_sequences expects one list of integer token ids per document. X_train is
# the bag-of-words count matrix (one column per vocabulary term, no word
# order), so it cannot be padded or truncated as a sequence. Rebuild ordered
# token-id sequences for the training rows from the processed text instead.
MAX_SEQUENCE_LENGTH = 50

# Reuse the fitted vectorizer so tokenization and vocabulary match the
# bag-of-words features built earlier.
analyzer = count_vectorizer.build_analyzer()
vocabulary = count_vectorizer.vocabulary_   # term -> 0-based column index

# y_train is a slice of df['label'], so its index labels select the training rows.
train_texts = df.loc[y_train.index, 'processed_text']

# Ids are shifted by +1 so that 0 stays reserved for padding; tokens that are
# not in the vocabulary are skipped. A downstream Embedding layer therefore
# needs input_dim = len(vocabulary) + 1.
train_sequences = [
    [vocabulary[token] + 1 for token in analyzer(text) if token in vocabulary]
    for text in train_texts
]

padded_sequences = pad_sequences(train_sequences,
                                 maxlen=MAX_SEQUENCE_LENGTH,
                                 padding='post',
                                 truncating='post')

print("Shape of padded sequences:", padded_sequences.shape)
print("Example of padded sequence:", padded_sequences[0])


NameError: name 'X_train' is not defined