# Step 1: Data Acquisition and Exploration

This notebook covers the first step of our sentiment analysis project:
- Setting up the environment
- Creating a small sample movie-review dataset (a stand-in for the full IMDB dataset)
- Basic data exploration
- Understanding the dataset structure

## Import Required Libraries

In [None]:
# Import essential libraries
# Standard library: text processing and filesystem paths
import re
import string
from pathlib import Path

# Third-party: data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# NLTK for natural language processing
import nltk

# Notebook-wide configuration values, named so they are easy to find and tune
RANDOM_SEED = 42
PLOT_STYLE = 'default'
COLOR_PALETTE = "husl"

# Seed NumPy's global random generator so NumPy-based randomness repeats across runs
np.random.seed(RANDOM_SEED)

# Apply the matplotlib style sheet, then the seaborn colour palette
plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

print("Libraries imported successfully!")

## Download NLTK Data

In [None]:
# Download required NLTK data.
# nltk.download() signals failure (e.g. no network access) through its return
# value instead of raising, so each result is checked before success is
# reported; previously the success message was printed unconditionally.
NLTK_RESOURCES = ('punkt', 'stopwords', 'vader_lexicon')

# Resources are requested in the listed order; names whose download failed are kept
failed_resources = [name for name in NLTK_RESOURCES if not nltk.download(name)]

if failed_resources:
    print(f"WARNING: could not download NLTK data: {', '.join(failed_resources)}")
else:
    print("NLTK data downloaded successfully!")

## Load and Explore IMDB Dataset

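scikit-learn does not ship an IMDB dataset, so for this demonstration we build a small hand-written sample of ten labelled movie reviews (1 = positive, 0 = negative) that stands in for the real data.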

In [None]:
# Build a small, hand-written sample dataset for demonstration.
# Reviews alternate positive / negative, giving a perfectly balanced sample.

# Sample movie reviews data
sample_reviews = [
    "This movie was absolutely fantastic! Great acting and storyline.",
    "Terrible movie, waste of time. Poor acting and boring plot.",
    "Amazing film with incredible visuals and outstanding performances.",
    "Worst movie I've ever seen. Complete disaster.",
    "Loved every minute of it! Highly recommended for everyone.",
    "Disappointing film with weak characters and poor direction.",
    "Excellent movie with great cinematography and compelling story.",
    "Boring and predictable. Not worth watching.",
    "Outstanding performances by all actors. Must watch!",
    "Complete waste of money. Avoid at all costs."
]

# One label per review, in the same order as sample_reviews
sample_labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]  # 1 = positive, 0 = negative

# Create DataFrame: one row per review, with its text and sentiment label
df = pd.DataFrame({
    'review': sample_reviews,
    'sentiment': sample_labels
})

print("Sample dataset created successfully!")
print(f"Dataset shape: {df.shape}")

## Dataset Overview

In [None]:
# Summarise the frame's dimensions, column names and dtypes, then preview it
n_reviews, n_features = df.shape

print("Dataset Info:")
print(f"Number of reviews: {n_reviews}")
print(f"Number of features: {n_features}")
print("\nColumn names:")
print(list(df.columns))
print("\nData types:")
print(df.dtypes)
print("\nFirst few rows:")

# Bare last expression so the notebook renders the preview as the cell output
df.head()

In [None]:
# Class balance: absolute counts first, then the same breakdown as percentages
print("Sentiment Distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)
print("\nSentiment Distribution (%):")
print(df['sentiment'].value_counts(normalize=True).mul(100))

# Bar chart of the counts, drawn through the explicit Figure/Axes interface
fig, ax = plt.subplots(figsize=(8, 6))
sentiment_counts.plot.bar(ax=ax, rot=0)  # rot=0 keeps the tick labels horizontal
ax.set_title('Distribution of Sentiments')
ax.set_xlabel('Sentiment (0=Negative, 1=Positive)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Pull up to three example reviews for each sentiment class
positive_reviews = df.loc[df['sentiment'] == 1, 'review'].head(3)
negative_reviews = df.loc[df['sentiment'] == 0, 'review'].head(3)

# Print each group under its heading as a numbered list (numbering starts at 1)
review_groups = (
    ("Sample Positive Reviews:", positive_reviews),
    ("\nSample Negative Reviews:", negative_reviews),
)
for heading, examples in review_groups:
    print(heading)
    for position, text in enumerate(examples, start=1):
        print(f"{position}. {text}")

In [None]:
# Save the dataset for future use.
# The target directory may not exist yet (e.g. on a fresh checkout), and
# DataFrame.to_csv does not create missing directories, so create it first.
OUTPUT_CSV = '../data/sample_reviews.csv'
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file
df.to_csv(OUTPUT_CSV, index=False)
print(f"Dataset saved to '{OUTPUT_CSV}'")

# Display dataset summary
print("\n=== STEP 1 COMPLETED ===")
print("✓ Environment setup complete")
print("✓ Sample dataset created and loaded")
print("✓ Basic data exploration completed")
print("✓ Dataset saved for next steps")
print("\nNext: Data preprocessing and text cleaning")