In [1]:
pip install pandas numpy nltk scikit-learn contractions textblob joblib

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

In [2]:
# Mount Google Drive at /content/drive; DATA_DIR (defined further down in this
# cell) points inside this mount, so the mount has to happen first.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import contractions
from textblob import TextBlob
import logging
import joblib
from pathlib import Path

# Download required NLTK data.
# Each resource is looked up on its own, so a single missing package is
# fetched without re-downloading the ones that are already installed, and
# every package the notebook needs is actually checked.
# Maps the downloader package id to the path nltk.data.find() resolves.
NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    # The notebook downloads punkt_tab right before tokenizing, so it is
    # provisioned here as well.
    # NOTE(review): presumably required by word_tokenize in newer NLTK
    # releases; confirm against the installed version.
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'omw-1.4': 'corpora/omw-1.4',
}

for nltk_package, nltk_resource_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(nltk_resource_path)
    except LookupError:
        nltk.download(nltk_package)

# Setup logging
# basicConfig() does nothing when the root logger already has handlers, which
# would leave the level at its default and hide every logger.info() call in
# this notebook. force=True removes any pre-existing root handlers first so the
# INFO level configured here really takes effect.
# NOTE(review): hosted notebook runtimes reportedly pre-install a root handler;
# confirm for the runtime in use.
logging.basicConfig(level=logging.INFO, force=True)
logger = logging.getLogger(__name__)

# Configuration
# Folder on the mounted Google Drive that holds the raw and the processed data.
DATA_DIR = Path('/content/drive/MyDrive/Colab Notebooks/Data_RO')
# Raw input CSV, read by TwitterDataPreprocessor.load_twitter_data().
TWITTER_DATA_PATH = DATA_DIR / 'training.1600000.processed.noemoticon.csv'
# Destination of the fully processed dataframe (written with DataFrame.to_csv).
PROCESSED_DATA_PATH = DATA_DIR / 'processed_twitter_data.csv'
# Seed passed to DataFrame.sample() and train_test_split() for reproducibility.
RANDOM_STATE = 42
# Fraction of the data kept for train + validation together; the test set
# receives the remaining 1 - TRAIN_TEST_SPLIT (i.e. 20%).
TRAIN_TEST_SPLIT = 0.8
# Fraction of the *whole* dataset used for validation. It is carved out of the
# train + validation part, so the effective split is 70% / 10% / 20%.
VALIDATION_SPLIT = 0.1

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [3]:
class TwitterDataPreprocessor:
    """
    Comprehensive data preprocessing pipeline for Twitter sentiment analysis.

    Typical usage chains the stages in this order: ``load_twitter_data`` ->
    ``preprocess_dataset`` -> ``get_text_statistics`` /
    ``create_train_test_split``.

    Attributes:
        lemmatizer: WordNetLemmatizer used by ``tokenize_and_lemmatize``.
        stop_words (set): English stop words removed during tokenization.
    """

    # Explicit http(s) links and "www." hosts only. The "www" branch is
    # anchored on a word boundary and a literal dot so that ordinary words
    # which merely contain "www" (e.g. "awwww") are not destroyed.
    _URL_PATTERN = re.compile(r'https?://\S+|\bwww\.\S+')
    _MENTION_HASHTAG_PATTERN = re.compile(r'@\w+|#\w+')
    _NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
    _WHITESPACE_PATTERN = re.compile(r'\s+')

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def load_twitter_data(self, file_path):
        """
        Load Twitter dataset from CSV file

        The file is read without a header row; the six columns are named
        ``sentiment, id, date, query, user, text``. Labels are remapped
        (0 -> 0, 4 -> 1); rows with any other label or with a missing value
        are dropped and the number of dropped rows is logged as a warning.

        Args:
            file_path (str): Path to the Twitter CSV file

        Returns:
            pd.DataFrame: Frame with an integer ``sentiment`` column (0/1)
            and the raw ``text`` column.

        Raises:
            Exception: Any error raised while reading or transforming the
            file is logged and re-raised unchanged.
        """
        logger.info(f"Loading Twitter data from {file_path}")

        # Column names based on the dataset structure
        column_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']

        try:
            # Load with Latin-1 encoding (common for this dataset)
            raw = pd.read_csv(file_path, encoding='latin-1', names=column_names)
            n_loaded = len(raw)
            logger.info(f"Successfully loaded {n_loaded} tweets")

            # Convert sentiment labels (0 -> 0, 4 -> 1); anything else -> NaN
            raw['sentiment'] = raw['sentiment'].map({0: 0, 4: 1})

            # Keep only sentiment and text columns, drop incomplete rows.
            # Unmapped labels make the column float, so cast it back to int
            # to give downstream code a consistent 0/1 integer label.
            df = (
                raw[['sentiment', 'text']]
                .dropna()
                .astype({'sentiment': int})
            )

            n_dropped = n_loaded - len(df)
            if n_dropped:
                logger.warning(
                    f"Dropped {n_dropped} rows with missing values or unmapped sentiment labels"
                )

            logger.info(f"After cleaning: {len(df)} tweets")
            logger.info(f"Sentiment distribution:\n{df['sentiment'].value_counts()}")

            return df

        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            raise

    def clean_text(self, text):
        """
        Clean and preprocess a single text string

        Steps, in order: lowercase, expand contractions, strip URLs, strip
        @mentions and #hashtags, replace every non-letter with a space, then
        collapse whitespace.

        Args:
            text (str): Raw text to clean

        Returns:
            str: Cleaned text; the empty string when ``text`` is not a str.
        """
        if not isinstance(text, str):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Expand contractions (don't -> do not)
        text = contractions.fix(text)

        # Remove URLs
        text = self._URL_PATTERN.sub(' ', text)

        # Remove user mentions and hashtags
        text = self._MENTION_HASHTAG_PATTERN.sub(' ', text)

        # Replace special characters and digits with a space (not with
        # nothing) so that words separated only by punctuation, such as
        # "sad...i", stay two separate words instead of fusing into "sadi".
        text = self._NON_LETTER_PATTERN.sub(' ', text)

        # Remove extra whitespace
        return self._WHITESPACE_PATTERN.sub(' ', text).strip()

    def tokenize_and_lemmatize(self, text):
        """
        Tokenize text and apply lemmatization

        Tokens that are stop words or have at most two characters are
        discarded before lemmatization.

        NOTE(review): NLTK's English stop-word list appears to contain
        negations such as "not"/"no"; dropping them can flip the meaning of a
        tweet for sentiment analysis. Confirm this is intended.

        Args:
            text (str): Text to tokenize

        Returns:
            list: List of lemmatized tokens
        """
        # Tokenize
        tokens = word_tokenize(text)

        # Remove stopwords and lemmatize
        return [
            self.lemmatizer.lemmatize(token)
            for token in tokens
            if token not in self.stop_words and len(token) > 2
        ]

    def preprocess_dataset(self, df, sample_size=None, min_processed_length=10):
        """
        Apply full preprocessing pipeline to the dataset

        The input frame is never modified; all derived columns
        (``cleaned_text``, ``tokens``, ``processed_text``, ``text_length``,
        ``word_count``) are added to a new frame.

        Args:
            df (pd.DataFrame): Input dataframe with a ``text`` column
            sample_size (int): Optional sample size for faster processing;
                ignored when falsy or not smaller than ``len(df)``
            min_processed_length (int): Rows whose ``processed_text`` has at
                most this many characters are dropped (default 10)

        Returns:
            pd.DataFrame: Preprocessed dataset
        """
        logger.info("Starting dataset preprocessing...")

        # Sample data if requested (for faster experimentation)
        if sample_size and sample_size < len(df):
            work = df.sample(n=sample_size, random_state=RANDOM_STATE)
            logger.info(f"Sampled {sample_size} tweets for processing")
        else:
            work = df

        # Clean text. assign() returns a new frame, so the caller's frame is
        # left untouched and no assignment ever happens on a filtered slice.
        logger.info("Cleaning text...")
        work = work.assign(cleaned_text=work['text'].apply(self.clean_text))

        # Remove empty texts
        work = work[work['cleaned_text'].str.len() > 0]

        # Tokenize and create processed text
        logger.info("Tokenizing and lemmatizing...")
        tokens = work['cleaned_text'].apply(self.tokenize_and_lemmatize)
        work = work.assign(tokens=tokens, processed_text=tokens.apply(' '.join))

        # Remove texts that are too short after processing
        work = work[work['processed_text'].str.len() > min_processed_length]

        # Add text length features
        work = work.assign(
            text_length=work['cleaned_text'].str.len(),
            word_count=work['processed_text'].str.split().str.len(),
        )

        logger.info(f"Preprocessing complete. Final dataset size: {len(work)}")

        return work

    def create_train_test_split(self, df):
        """
        Create train/validation/test splits

        Both splits are stratified on the sentiment label and seeded with
        ``RANDOM_STATE``. The test share is ``1 - TRAIN_TEST_SPLIT``; the
        validation share is ``VALIDATION_SPLIT`` of the whole dataset.

        Args:
            df (pd.DataFrame): Preprocessed dataset with ``processed_text``
                and ``sentiment`` columns

        Returns:
            tuple: (X_train, X_val, X_test, y_train, y_val, y_test)
        """
        logger.info("Creating train/test splits...")

        X = df['processed_text'].values
        y = df['sentiment'].values

        # First split: train+val vs test
        X_temp, X_test, y_temp, y_test = train_test_split(
            X, y,
            test_size=1-TRAIN_TEST_SPLIT,
            random_state=RANDOM_STATE,
            stratify=y
        )

        # Second split: train vs validation. The validation fraction is
        # rescaled because it is now taken from the train+val part only.
        val_size = VALIDATION_SPLIT / TRAIN_TEST_SPLIT
        X_train, X_val, y_train, y_val = train_test_split(
            X_temp, y_temp,
            test_size=val_size,
            random_state=RANDOM_STATE,
            stratify=y_temp
        )

        logger.info(f"Train set: {len(X_train)} samples")
        logger.info(f"Validation set: {len(X_val)} samples")
        logger.info(f"Test set: {len(X_test)} samples")

        return X_train, X_val, X_test, y_train, y_val, y_test

    def get_text_statistics(self, df):
        """
        Generate statistics about the text data

        Args:
            df (pd.DataFrame): Dataset to analyze; must contain the
                ``text_length``, ``word_count`` and ``sentiment`` columns

        Returns:
            dict: Statistics dictionary with the keys ``total_samples``,
            ``avg_text_length``, ``avg_word_count``,
            ``sentiment_distribution``, ``max_text_length`` and
            ``min_text_length``
        """
        text_lengths = df['text_length']

        return {
            'total_samples': len(df),
            'avg_text_length': text_lengths.mean(),
            'avg_word_count': df['word_count'].mean(),
            'sentiment_distribution': df['sentiment'].value_counts().to_dict(),
            'max_text_length': text_lengths.max(),
            'min_text_length': text_lengths.min()
        }

In [6]:
# Number of tweets to preprocess. For initial testing a smaller sample is
# used; set to None to process the full dataset.
SAMPLE_SIZE = 50000

# Initialize preprocessor
preprocessor = TwitterDataPreprocessor()

# Fetch the punkt_tab tokenizer tables before any tokenization happens
# (nltk itself is already imported in the setup cell).
# NOTE(review): presumably required by word_tokenize in newer NLTK releases;
# confirm against the installed version.
nltk.download('punkt_tab')

# Check if data file exists
if not TWITTER_DATA_PATH.exists():
    logger.error(f"Data file not found: {TWITTER_DATA_PATH}")
    # Name the directory that is actually configured instead of a fixed
    # folder name that does not match DATA_DIR.
    logger.info(f"Please upload {TWITTER_DATA_PATH.name} to {DATA_DIR} on your Google Drive")
else:
    # Load and preprocess data
    df = preprocessor.load_twitter_data(TWITTER_DATA_PATH)
    df_processed = preprocessor.preprocess_dataset(df, sample_size=SAMPLE_SIZE)

    # Generate statistics
    stats = preprocessor.get_text_statistics(df_processed)
    logger.info(f"Dataset statistics: {stats}")

    # Create train/test splits
    splits = preprocessor.create_train_test_split(df_processed)
    X_train, X_val, X_test, y_train, y_val, y_test = splits

    # Save processed data (the six split arrays plus the statistics dict)
    processed_data = {
        'X_train': X_train,
        'X_val': X_val,
        'X_test': X_test,
        'y_train': y_train,
        'y_val': y_val,
        'y_test': y_test,
        'stats': stats
    }

    joblib.dump(processed_data, DATA_DIR / 'processed_splits.pkl')

    # Also save the full processed dataframe.
    # NOTE(review): the 'tokens' column holds Python lists, which CSV stores
    # as their string representation; confirm that consumers expect that.
    df_processed.to_csv(PROCESSED_DATA_PATH, index=False)

    logger.info("Data preprocessing completed successfully!")
    logger.info(f"Processed data saved to {PROCESSED_DATA_PATH}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
