# Laptop Price Prediction - Exploratory Data Analysis

This notebook explores the laptop dataset and performs feature engineering for price prediction.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add the parent directory to the module search path so the project-local
# `src` package imported below can be resolved. '..' is relative to the current
# working directory — presumably the notebook's own folder (TODO confirm).
sys.path.append('..')

from src.utils import load_data, save_data
from src.data_preprocessing import DataPreprocessor

# Set style.
# The 'seaborn-v0_8-*' style sheets only exist in matplotlib >= 3.6; older
# releases ship the same sheet as 'seaborn-darkgrid'. Use whichever name the
# installed matplotlib knows (falling back to 'default') so this cell does not
# raise OSError on a different environment.
preferred_styles = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
plot_style = next(
    (style for style in preferred_styles if style in plt.style.available),
    'default',
)
plt.style.use(plot_style)
sns.set_palette('husl')

# Display settings: show every column, cap printed rows at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Load Data

In [None]:
# Load the cached CSV when it exists; otherwise obtain the dataset from the
# project helper `load_data()` and cache it at `data_path` for later runs.
data_path = '../data/laptop_data.csv'
if os.path.exists(data_path):
    df = pd.read_csv(data_path)
else:
    df = load_data()
    # Derive the cache directory from data_path instead of repeating the
    # literal, so changing the path above cannot leave makedirs pointing at a
    # different folder. A bare filename has no directory to create.
    data_dir = os.path.dirname(data_path)
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)
    save_data(df, data_path)

print(f"Dataset shape: {df.shape}")
df.head()

## 2. Data Overview

In [None]:
# Basic info: print the frame's column names, dtypes, non-null counts and
# memory usage (DataFrame.info writes to stdout and returns None).
df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max). With pandas'
# defaults only numeric columns are summarised when the frame has mixed dtypes.
df.describe()

In [None]:
# Count missing entries per column (isna is the canonical spelling of isnull).
df.isna().sum()

## 3. Price Distribution Analysis

In [None]:
# Price distribution: histogram with mean/median markers beside a box plot,
# followed by a printed summary of the same statistics.
#
# Missing prices are dropped once up front. Matplotlib's boxplot does not skip
# NaN (a single missing value yields NaN quartiles and an empty box), while the
# pandas statistics below skip NaN by default, so they are unchanged.
price = df['price'].dropna()
price_mean = price.mean()
price_median = price.median()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, box_ax = axes

# Histogram
hist_ax.hist(price, bins=30, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Price ($)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Price Distribution')
hist_ax.axvline(price_mean, color='red', linestyle='--', label=f"Mean: ${price_mean:.2f}")
hist_ax.axvline(price_median, color='green', linestyle='--', label=f"Median: ${price_median:.2f}")
hist_ax.legend()

# Box plot (vertical is the default orientation, so no `vert` argument is
# passed; newer matplotlib releases are phasing that parameter out).
box_ax.boxplot(price)
box_ax.set_ylabel('Price ($)')
box_ax.set_title('Price Box Plot')

plt.tight_layout()
plt.show()

print("Price Statistics:")
print(f"  Mean: ${price_mean:.2f}")
print(f"  Median: ${price_median:.2f}")
print(f"  Std Dev: ${price.std():.2f}")
print(f"  Min: ${price.min():.2f}")
print(f"  Max: ${price.max():.2f}")

## 4. Brand Analysis

In [None]:
# Brand distribution and prices: listing count per brand (left) and mean price
# per brand, most expensive first (right).
brand_counts = df['brand'].value_counts()
brand_avg_price = df.groupby('brand')['price'].mean().sort_values(ascending=False)

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One (axes, data, y-label, title) tuple per panel; both share the same layout.
brand_panels = [
    (axes[0], brand_counts, 'Count', 'Laptop Count by Brand'),
    (axes[1], brand_avg_price, 'Average Price ($)', 'Average Price by Brand'),
]
for panel_ax, panel_data, y_label, panel_title in brand_panels:
    panel_ax.bar(panel_data.index, panel_data.values)
    panel_ax.set_xlabel('Brand')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Specifications Analysis

In [None]:
# RAM and Storage analysis
#
# The raw `ram` / `storage` columns appear to hold text labels (the engineered
# numeric versions are `ram_gb` / `storage_gb`) — TODO confirm. A plain
# sort_index() orders such labels lexicographically ('16GB' before '4GB'), so
# the bars are ordered by parsed capacity instead. For purely numeric labels
# the resulting order is the same as before.


def _capacity_in_gb(labels):
    """Sort key: map an Index of capacity labels to numeric sizes in GB.

    The first number in each label is scaled by an optional TB/GB/MB unit
    ('1TB' -> 1024, '512GB SSD' -> 512, 8 -> 8). Labels without a number map
    to NaN, which sort_index places last.
    """
    text = pd.Series(labels.astype(str)).str.lower()
    parts = text.str.extract(r'(\d+(?:\.\d+)?)\s*(tb|gb|mb)?')
    size = pd.to_numeric(parts[0], errors='coerce')
    unit_scale = {'tb': 1024.0, 'gb': 1.0, 'mb': 1.0 / 1024.0}
    scale = parts[1].map(unit_scale).fillna(1.0).astype(float)
    return pd.Index(size * scale)


def _by_capacity(series):
    """Return `series` with its index ordered by capacity.

    The plain sort runs first and the capacity sort is stable (mergesort), so
    labels with equal capacity keep a deterministic alphabetical order.
    """
    return series.sort_index().sort_index(key=_capacity_in_gb, kind='mergesort')


fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# RAM distribution
ram_counts = _by_capacity(df['ram'].value_counts())
axes[0, 0].bar(ram_counts.index, ram_counts.values)
axes[0, 0].set_xlabel('RAM')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_title('RAM Distribution')

# Price by RAM
ram_price = _by_capacity(df.groupby('ram')['price'].mean())
axes[0, 1].bar(ram_price.index, ram_price.values)
axes[0, 1].set_xlabel('RAM')
axes[0, 1].set_ylabel('Average Price ($)')
axes[0, 1].set_title('Average Price by RAM')

# Storage distribution (bars at integer positions, labels applied as ticks)
storage_counts = _by_capacity(df['storage'].value_counts())
axes[1, 0].bar(range(len(storage_counts)), storage_counts.values)
axes[1, 0].set_xticks(range(len(storage_counts)))
axes[1, 0].set_xticklabels(storage_counts.index, rotation=45)
axes[1, 0].set_xlabel('Storage')
axes[1, 0].set_ylabel('Count')
axes[1, 0].set_title('Storage Distribution')

# Price by Storage
storage_price = _by_capacity(df.groupby('storage')['price'].mean())
axes[1, 1].bar(range(len(storage_price)), storage_price.values)
axes[1, 1].set_xticks(range(len(storage_price)))
axes[1, 1].set_xticklabels(storage_price.index, rotation=45)
axes[1, 1].set_xlabel('Storage')
axes[1, 1].set_ylabel('Average Price ($)')
axes[1, 1].set_title('Average Price by Storage')

plt.tight_layout()
plt.show()

## 6. Feature Engineering

In [None]:
# Apply preprocessing.
# The preprocessor receives a copy: later cells still read the raw columns of
# `df` (brand, processor, gpu, ram, storage, screen_size), and whether
# `preprocess_data` modifies its argument is not visible from this notebook.
# Copying also keeps this cell idempotent when it is re-run.
preprocessor = DataPreprocessor()
df_processed = preprocessor.preprocess_data(df.copy())

print("Processed Features:")
print(df_processed.columns.tolist())

df_processed.head()

## 7. Correlation Analysis

In [None]:
# Select the engineered features to correlate; only those present in the
# processed frame are used.
numeric_features = ['ram_gb', 'storage_gb', 'screen_inches', 'processor_gen', 'price']
available_features = [f for f in numeric_features if f in df_processed.columns]

# Keep numeric dtypes only: since pandas 2.0, DataFrame.corr() raises on
# non-numeric columns instead of silently dropping them.
corr_input = df_processed[available_features].select_dtypes(include='number')

if corr_input.shape[1] > 1:
    # Correlation matrix (Pearson, the pandas default)
    corr_matrix = corr_input.corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                square=True, linewidths=1, cbar_kws={"shrink": 0.8})
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Correlation with price — checked against the matrix itself, because
    # 'price' may have been filtered out above if it was not numeric.
    if 'price' in corr_matrix.columns:
        price_corr = corr_matrix['price'].sort_values(ascending=False)
        print("\nCorrelation with Price:")
        print(price_corr)
else:
    # Say why nothing was drawn instead of leaving an empty cell output.
    print("Skipping correlation analysis: fewer than two numeric features available.")

## 8. Processor and GPU Analysis

In [None]:
# Processor and GPU analysis: mean price per processor (ten most expensive)
# and per GPU, as horizontal bar charts.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Top processors by average price
processor_price = df.groupby('processor')['price'].mean().sort_values(ascending=False).head(10)
axes[0].barh(processor_price.index, processor_price.values)
# barh draws the first row at the bottom; flip the axis so the most expensive
# entry is at the top and the ranking reads top-down.
axes[0].invert_yaxis()
axes[0].set_xlabel('Average Price ($)')
axes[0].set_ylabel('Processor')
axes[0].set_title('Top 10 Processors by Average Price')

# GPU analysis
gpu_price = df.groupby('gpu')['price'].mean().sort_values(ascending=False)
axes[1].barh(gpu_price.index, gpu_price.values)
axes[1].invert_yaxis()  # same top-down ordering as the processor panel
axes[1].set_xlabel('Average Price ($)')
axes[1].set_ylabel('GPU')
axes[1].set_title('Average Price by GPU')

plt.tight_layout()
plt.show()

## 9. Scatter Plots - Price Relationships

In [None]:
# Scatter plots of price against each engineered numeric feature.
# Each entry is (column in df_processed, x-axis label, panel title); the
# order fixes the panel position in the 2x2 grid (row-major).
scatter_specs = [
    ('ram_gb', 'RAM (GB)', 'Price vs RAM'),
    ('storage_gb', 'Storage (GB)', 'Price vs Storage'),
    ('screen_inches', 'Screen Size (inches)', 'Price vs Screen Size'),
    ('processor_gen', 'Processor Generation', 'Price vs Processor Generation'),
]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

for ax, (column, x_label, title) in zip(axes.flat, scatter_specs):
    if column not in df_processed.columns:
        # Feature was not produced by preprocessing: hide the panel rather
        # than leaving an empty, unlabeled frame in the figure.
        ax.set_visible(False)
        continue
    ax.scatter(df_processed[column], df_processed['price'], alpha=0.6)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Price ($)')
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 10. Summary Statistics

In [None]:
# Text summary of the raw dataset: size, category cardinalities, price range
# and the most frequent value of each headline specification.
divider = "="*50
print("\n" + divider)
print("DATASET SUMMARY")
print(divider)

laptop_total = len(df)
print(f"\nTotal Laptops: {laptop_total}")
print(f"Number of Brands: {df['brand'].nunique()}")
print(f"Number of Processors: {df['processor'].nunique()}")
print(f"Number of GPUs: {df['gpu'].nunique()}")

price_column = df['price']
print(f"\nPrice Range: ${price_column.min():.2f} - ${price_column.max():.2f}")
print(f"Average Price: ${price_column.mean():.2f}")
print(f"Median Price: ${price_column.median():.2f}")

# mode() returns every tied value in sorted order; [0] takes the first one.
print("\nMost Common Specifications:")
print(f"  RAM: {df['ram'].mode()[0]}")
print(f"  Storage: {df['storage'].mode()[0]}")
print(f"  Screen Size: {df['screen_size'].mode()[0]}")
print(f"  Brand: {df['brand'].mode()[0]}")