In [6]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer 

# Fetch the NLTK resources this notebook relies on. The log printed under this
# cell reports packages that are already present as "already up-to-date".
# NOTE(review): 'punkt' (tokenizer models) does not appear to be used by the
# visible cells, which tokenize with str.split() — confirm it is still needed.
nltk.download('punkt')
# Word lists read by stopwords.words('english') in preprocess_text.
nltk.download('stopwords')
# Corpus used by WordNetLemmatizer in preprocess_text.
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/richardph911/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/richardph911/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/richardph911/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Raw data
label : 1 for a negative review and 2 for a positive review (remapped to 0 and 1 when the data is loaded)

title : review heading

text : review body

In [7]:
# Load the dataset
def load_data(train_path, test_path):
    """Load the train/test review CSVs and convert their labels to 0/1.

    Both files are read as header-less CSVs whose three columns are named
    ``label``, ``title`` and ``text``. Raw labels are remapped so that
    1 (negative) becomes 0 and 2 (positive) becomes 1; any other raw label
    value becomes NaN because it has no entry in the mapping.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training CSV.
    test_path : str or path-like
        Location of the test CSV.

    Returns
    -------
    tuple
        ``(train_df, test_df)`` as pandas DataFrames, or ``(None, None)``
        when either file cannot be found (a message is printed in that case).
    """
    columns = ['label', 'title', 'text']
    label_map = {1: 0, 2: 1}

    try:
        # Read each file exactly once. An earlier version parsed both files a
        # second time without column names and discarded the result, which
        # doubled the load time for no benefit.
        train_df = pd.read_csv(train_path, names=columns)
        test_df = pd.read_csv(test_path, names=columns)
    except FileNotFoundError:
        print("File not found. Please check the file path.")
        return None, None

    print("Datasets loaded successfully!")

    train_df['label'] = train_df['label'].map(label_map)
    test_df['label'] = test_df['label'].map(label_map)
    return train_df, test_df

In [8]:
# Text preprocessing
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives O(1) membership tests; the corpus list is read once
        # instead of once per token.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Clean one review string and return it as space-joined, lemmatized tokens.

    Steps, in order:
      1. Drop URLs (``http://``, ``https://`` or ``www.`` prefixes).
      2. Drop e-mail-like tokens (any whitespace-free run containing ``@``).
      3. Remove every character that is not an ASCII letter, digit or whitespace.
      4. Lower-case and split on whitespace.
      5. Discard English stop words.
      6. Lemmatize the remaining tokens with WordNet.

    Parameters
    ----------
    text : str
        Raw review text. ``None`` and NaN are treated as empty text; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (``''`` if nothing is left).
    """
    # Missing values show up as None or float NaN (NaN is the only value that
    # is not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # URLs and e-mails must be removed BEFORE punctuation is stripped:
    # once ':', '/', '.' and '@' are gone these patterns can no longer match
    # and the address would survive as a junk token such as "httpexamplecom".
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    tokens = text.lower().split()
    stop_words, lemmatizer = _get_preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )


In [None]:
def vectorize_text(train_text, test_text, n_features=2**18):
    """Turn the train and test documents into hashed feature matrices.

    A single ``HashingVectorizer`` (with ``alternate_sign=False``) is applied
    to both collections so that they share the same feature space.

    Parameters
    ----------
    train_text : iterable of str
        Training documents.
    test_text : iterable of str
        Test documents.
    n_features : int, default 2**18
        Number of columns in the hashed feature space. The default matches
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as returned by ``HashingVectorizer.transform``.
    """
    # Use HashingVectorizer for memory efficiency.
    vectorizer = HashingVectorizer(n_features=n_features, alternate_sign=False)

    # The hashing vectorizer is stateless (it learns no vocabulary), so
    # transform() is valid without a preceding fit().
    X_train = vectorizer.transform(train_text)
    X_test = vectorizer.transform(test_text)

    return X_train, X_test

In [9]:
if __name__ == "__main__":
    # NOTE(review): these are absolute, machine-specific paths; consider a
    # configurable data directory so the notebook runs on other machines.
    train_path = '/Users/richardph911/Downloads/archive/train.csv'
    test_path = '/Users/richardph911/Downloads/archive/test.csv'
    train_df, test_df = load_data(train_path, test_path)

    # load_data returns (None, None) when a file is missing; skip the
    # pipeline in that case.
    if train_df is not None and test_df is not None:
        # Show one raw sample per split as a quick sanity check.
        print("train text", train_df['text'].iloc[0])
        print("train label : ", train_df['label'].iloc[0])
        print("test text", test_df['text'].iloc[0])
        print("test label : ", test_df['label'].iloc[0])

        # Preprocess text for both datasets. pandas parses empty fields as
        # NaN (a float), which the regex-based cleaning cannot handle, so
        # missing values are replaced with '' and everything is coerced to
        # str before cleaning.
        train_df['text'] = [
            preprocess_text(text)
            for text in train_df['text'].fillna('').astype(str)
        ]
        test_df['text'] = [
            preprocess_text(text)
            for text in test_df['text'].fillna('').astype(str)
        ]

        # Memory-efficient vectorization
        X_train, X_test = vectorize_text(train_df['text'], test_df['text'])
        y_train = train_df['label'].values
        y_test = test_df['label'].values

        # Hold out 20% of the training rows for validation, stratified on the
        # label; the fixed random_state keeps the split reproducible.
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
        )

        print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
        print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")



Datasets loaded successfully!
