In [1]:
# imports
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import re  
import emoji


In [2]:
# load datasets
train_df = pd.read_csv('../Datasets/twitter_training.csv')
val_df =pd.read_csv('../Datasets/twitter_validation.csv')

# Basic exploration
print("Training set shape:", train_df.shape)
print("Validation set shape:", val_df.shape)
print("\nTraining set info:")
print(train_df.info())
print("\nClass distribution in training:")
# First, let's see what columns you actually have
print("Column names in training set:")
print(train_df.columns.tolist())
print("\nFirst few rows of training set:")
print(train_df.head())

print("\nColumn names in validation set:")
print(val_df.columns.tolist())
print("\nFirst few rows of validation set:")
print(val_df.head())


Training set shape: (74681, 4)
Validation set shape: (999, 4)

Training set info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB
None

Class distribution in training:
Column names in training set:
['2401', 'Borderlands', 'Positive', 'im getting on borderlands and i will murder you all ,']

First few rows of training set:
   2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   


In [3]:
# Check for missing values
print("\nMissing values in training set:")
print(train_df.isnull().sum())

print("\nMissing values in validation set:")
print(val_df.isnull().sum())

# Check data types
print("\nTraining set dtypes:")
print(train_df.dtypes)


Missing values in training set:
2401                                                       0
Borderlands                                                0
Positive                                                   0
im getting on borderlands and i will murder you all ,    686
dtype: int64

Missing values in validation set:
3364                                                                                                                                                                                                                                                  0
Facebook                                                                                                                                                                                                                                              0
Irrelevant                                                                                                                                                                         

In [4]:
# Rename columns for consistency
train_df.columns = ['id', 'topic', 'sentiment', 'text']
val_df.columns = ['id', 'topic', 'sentiment', 'text']

print("Training set after renaming:")
print(train_df[['sentiment', 'text']].head())

print("\nValidation set after renaming:")
print(val_df[['sentiment', 'text']].head())

# Check unique sentiment values
print("\nUnique sentiments in training:")
print(train_df['sentiment'].value_counts())

print("\nUnique sentiments in validation:")
print(val_df['sentiment'].value_counts())

Training set after renaming:
  sentiment                                               text
0  Positive  I am coming to the borders and I will kill you...
1  Positive  im getting on borderlands and i will kill you ...
2  Positive  im coming on borderlands and i will murder you...
3  Positive  im getting on borderlands 2 and i will murder ...
4  Positive  im getting into borderlands and i can murder y...

Validation set after renaming:
  sentiment                                               text
0   Neutral  BBC News - Amazon boss Jeff Bezos rejects clai...
1  Negative  @Microsoft Why do I pay for WORD when it funct...
2  Negative  CSGO matchmaking is so full of closet hacking,...
3   Neutral  Now the President is slapping Americans in the...
4  Negative  Hi @EAHelp I’ve had Madeleine McCann in my cel...

Unique sentiments in training:
sentiment
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

Unique sentiments in validation:
s

In [5]:
# Remove rows with missing text
print(f"Training set before removing missing: {len(train_df)}")
train_df = train_df.dropna(subset=['text'])
print(f"Training set after removing missing: {len(train_df)}")

print(f"Validation set before removing missing: {len(val_df)}")
val_df = val_df.dropna(subset=['text'])
print(f"Validation set after removing missing: {len(val_df)}")

Training set before removing missing: 74681
Training set after removing missing: 73995
Validation set before removing missing: 999
Validation set after removing missing: 999


In [6]:
# Twitter text cleaning function
def clean_twitter_text(text):
    if pd.isna(text):
        return ""
    
    text = str(text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    
    # Remove user mentions but keep text
    text = re.sub(r'@\w+', '', text)
    
    # Handle emojis - convert to text description
    text = emoji.demojize(text)
    
    # Remove special characters but keep basic punctuation
    text = re.sub(r'[^\w\s!?,.]', '', text)
    
    # Handle repeated characters (e.g., "loooove" -> "love")
    text = re.sub(r'(.)\1+', r'\1\1', text)
    
    return text.strip().lower()

# Apply cleaning
print("Cleaning texts...")
train_texts = train_df['text'].apply(clean_twitter_text)
val_texts = val_df['text'].apply(clean_twitter_text)

# Prepare labels for 4-class classification
le = LabelEncoder()
le.fit(train_df['sentiment'])
y_train = le.transform(train_df['sentiment'])
y_val = le.transform(val_df['sentiment'])

print(f"Label mapping: {dict(zip(le.classes_, range(len(le.classes_))))}")
print(f"Training labels distribution: {np.bincount(y_train)}")
print(f"Validation labels distribution: {np.bincount(y_val)}")

Cleaning texts...
Label mapping: {'Irrelevant': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3}
Training labels distribution: [12875 22358 18108 20654]
Validation labels distribution: [171 266 285 277]


In [7]:
# Tokenization with larger vocabulary for Twitter data
tokenizer = Tokenizer(num_words=15000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convert to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
val_sequences = tokenizer.texts_to_sequences(val_texts)

# Analyze sequence lengths
train_lens = [len(seq) for seq in train_sequences]
print(f"Sequence length analysis:")
print(f"Max length: {max(train_lens)}")
print(f"Average length: {np.mean(train_lens):.2f}")
print(f"95th percentile: {np.percentile(train_lens, 95)}")

# Set max length based on analysis
max_length = 60  # Covers most tweets
X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_val = pad_sequences(val_sequences, maxlen=max_length, padding='post')

print(f"\nFinal data shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"Vocabulary size: {len(tokenizer.word_index)}")

Sequence length analysis:
Max length: 166
Average length: 18.48
95th percentile: 47.0

Final data shapes:
X_train: (73995, 60), y_train: (73995,)
X_val: (999, 60), y_val: (999,)
Vocabulary size: 37107
