In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import sys
# Use seaborn's "whitegrid" style for the figures drawn in this notebook.
sns.set_style("whitegrid")

In [27]:
# Load the raw laptop listings; '?' entries in the CSV are read as missing (NaN).
try:
    df = pd.read_csv('laptopData.csv', na_values='?')
except FileNotFoundError:
    print("Error: 'laptopData.csv' not found.", file=sys.stderr)
    print("Please make sure the script is in the same folder as the CSV file.", file=sys.stderr)
    # Re-raise so execution stops at this cell. Swallowing the error would leave
    # `df` undefined (or stale from an earlier run) and make every later cell
    # fail with a misleading NameError, or silently analyse old data.
    raise
else:
    print("Successfully loaded laptopData.csv")


Successfully loaded laptopData.csv


In [28]:
# Remove the 'Unnamed: 0' column when the file contains one.
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns='Unnamed: 0')

# Rows without a 'Company' value are treated as empty records and removed.
initial_rows = df.shape[0]
df = df.dropna(subset=['Company'])
print(f"Dropped {initial_rows - df.shape[0]} empty rows.")

# 'Ram': strip the 'GB' suffix, coerce to numbers, fill the gaps with the
# median and store the result as integers.
ram_numeric = pd.to_numeric(
    df['Ram'].astype(str).str.replace('GB', '', regex=False),
    errors='coerce',
)
missing_ram_count = ram_numeric.isna().sum()
ram_median = ram_numeric.median()
df['Ram'] = ram_numeric.fillna(ram_median).astype(int)
if missing_ram_count > 0:
    print(f"Filled {missing_ram_count} missing 'Ram' values with median: {ram_median}GB")

# 'Weight': same treatment with the 'kg' suffix; values stay floating point.
weight_numeric = pd.to_numeric(
    df['Weight'].astype(str).str.replace('kg', '', regex=False),
    errors='coerce',
)
missing_weight_count = weight_numeric.isna().sum()
weight_median = weight_numeric.median()
df['Weight'] = weight_numeric.fillna(weight_median)
if missing_weight_count > 0:
    print(f"Filled {missing_weight_count} missing 'Weight' values with median: {weight_median}kg")

# Remaining columns: fill missing entries with the most frequent value.
for col in ['Inches', 'TypeName', 'Memory', 'OpSys', 'Gpu']:
    missing_count = df[col].isna().sum()
    if missing_count == 0:
        continue  # nothing to impute for this column
    mode_val = df[col].mode()[0]
    df[col] = df[col].fillna(mode_val)
    print(f"Filled {missing_count} missing '{col}' values with mode: {mode_val}")

# 'Inches' must be numeric; anything unparseable becomes the median size.
inches_numeric = pd.to_numeric(df['Inches'], errors='coerce')
df['Inches'] = inches_numeric.fillna(inches_numeric.median())

# Collapse the two spellings of Apple's operating system into a single label.
df['OpSys'] = df['OpSys'].replace(['macOS', 'Mac OS X'], 'Mac')
print("Unified 'macOS' and 'Mac OS X' to 'Mac'")

# Keep prices at two decimal places.
df['Price'] = df['Price'].round(decimals=2)
print("Data Cleaning Complete.")


Dropped 30 empty rows.
Filled 14 missing 'Ram' values with median: 8.0GB
Filled 16 missing 'Weight' values with median: 2.04kg
Filled 54 missing 'Inches' values with mode: 15.6
Filled 20 missing 'TypeName' values with mode: Notebook
Filled 1 missing 'Memory' values with mode: 256GB SSD
Unified 'macOS' and 'Mac OS X' to 'Mac'
Data Cleaning Complete.


In [29]:
# Binary flags (1 = present, 0 = absent) derived from the screen description text.
df['Touchscreen'] = [int('Touchscreen' in str(text)) for text in df['ScreenResolution']]
df['IPS_Panel'] = [int('IPS' in str(text)) for text in df['ScreenResolution']]

def extract_resolution(res_str, default=(1920, 1080)):
    """Pull the pixel dimensions out of a screen description.

    Args:
        res_str: Screen description such as 'IPS Panel 2560x1600'. Any value
            is accepted because it is converted with ``str()`` first.
        default: ``(width, height)`` pair returned when the text contains no
            ``<digits>x<digits>`` pattern.

    Returns:
        A ``(width, height)`` tuple of ints.
    """
    match = re.search(r'(\d+)x(\d+)', str(res_str))
    if match:
        return int(match.group(1)), int(match.group(2))
    # The fallback is an explicit value rather than a median looked up on a
    # global DataFrame, so the function works on its own and does not depend
    # on any columns having been created beforehand.
    return int(default[0]), int(default[1])

# Start both columns at 1920x1080; they are overwritten with the parsed values below.
df['Screen_Width'] = 1920
df['Screen_Height'] = 1080
# Parse every description once and build the two-column frame in a single step.
parsed_pairs = [extract_resolution(text) for text in df['ScreenResolution']]
res_split = pd.DataFrame(parsed_pairs, index=df.index, columns=['Screen_Width', 'Screen_Height'])
df['Screen_Width'] = res_split['Screen_Width'].astype(int)
df['Screen_Height'] = res_split['Screen_Height'].astype(int)

# CPU brand: the first whitespace-separated token of the CPU description.
df['Cpu_Brand'] = [str(cpu).split()[0] for cpu in df['Cpu']]
def extract_speed(cpu_str):
    """Return the clock speed that precedes 'GHz' in a CPU description.

    The value is returned as a float; ``np.nan`` is returned when the text
    contains no '<number>GHz' token.
    """
    hit = re.search(r'(\d+\.?\d*)GHz', str(cpu_str))
    return float(hit.group(1)) if hit else np.nan
# Parse every CPU description, then fill unparseable entries with the median speed.
cpu_speeds = df['Cpu'].map(extract_speed)
df['Cpu_Speed_GHz'] = cpu_speeds.fillna(cpu_speeds.median())

# Parse memory into SSD, HDD, Flash, Hybrid
def parse_memory(mem_str):
    """Split a storage description into per-type capacities in GB.

    Every '<size><GB|TB> <type>' component in the text is read in a single
    pass, so multi-drive strings such as '128GB SSD +  1TB HDD' are summed per
    type. Sizes may be written with a decimal point ('1.0TB Hybrid'), and
    1 TB is counted as 1000 GB.

    Args:
        mem_str: Storage description. Non-string values (e.g. NaN) are
            converted with ``str()`` and therefore yield all zeros instead of
            raising.

    Returns:
        A tuple ``(ssd, hdd, flash, hybrid)`` of ints in GB. Components whose
        type is not SSD, HDD, Hybrid or a word containing 'Flash' are ignored.
    """
    totals = {'SSD': 0, 'HDD': 0, 'Flash': 0, 'Hybrid': 0}
    # The optional fractional part matters: matching only the digits directly
    # in front of the unit would read '1.0TB' as 0 TB.
    components = re.findall(r'(\d+(?:\.\d+)?)\s*(GB|TB)\s+(\w+)', str(mem_str))
    for amount, unit, kind in components:
        if 'Flash' in kind:
            key = 'Flash'
        elif kind in totals:
            key = kind
        else:
            continue  # unknown storage type
        size_gb = float(amount) * (1000 if unit == 'TB' else 1)
        totals[key] += int(round(size_gb))
    return totals['SSD'], totals['HDD'], totals['Flash'], totals['Hybrid']

# Expand 'Memory' into one capacity column (in GB) per storage type.
mem_cols = ['SSD_GB', 'HDD_GB', 'Flash_Storage_GB', 'Hybrid_GB']
mem_df = pd.DataFrame(
    [parse_memory(mem) for mem in df['Memory']],
    index=df.index,
    columns=mem_cols,
)
# Drop storage columns left over from an earlier run of this cell before
# attaching the fresh ones. A plain concat would append duplicate column names
# on every re-run, and those duplicates would then leak into df_cleaned.
df = pd.concat([df.drop(columns=mem_cols, errors='ignore'), mem_df], axis=1)

# GPU brand: the first whitespace-separated token of the GPU description.
df['Gpu_Brand'] = [str(gpu).split()[0] for gpu in df['Gpu']]

# Drop the raw text columns now that their information lives in derived features.
df_cleaned = df.drop(columns=['ScreenResolution', 'Cpu', 'Memory', 'Gpu'])

print("Feature Engineering Complete.")
print(df_cleaned[['Touchscreen', 'IPS_Panel', 'Screen_Width', 'Cpu_Brand', 'SSD_GB', 'HDD_GB', 'Gpu_Brand']].head())


Feature Engineering Complete.
   Touchscreen  IPS_Panel  Screen_Width Cpu_Brand  SSD_GB  HDD_GB Gpu_Brand
0            0          1          2560     Intel     128       0     Intel
1            0          0          1440     Intel       0       0     Intel
2            0          0          1920     Intel     256       0     Intel
3            0          1          2880     Intel     512       0       AMD
4            0          1          2560     Intel     256       0     Intel


In [30]:
# Univariate views of the cleaned data. Each figure is written to a PNG file
# and then closed.

# 1. Price distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_cleaned['Price'], kde=True, bins=40, ax=ax)
ax.set_title('Distribution of Laptop Prices', fontsize=16)
ax.set_xlabel('Price', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
fig.savefig('1_price_distribution.png')
plt.close(fig)

# 2. Listings per company, most frequent first
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(y='Company', data=df_cleaned, order=df_cleaned['Company'].value_counts().index, ax=ax)
ax.set_title('Number of Laptops by Company', fontsize=16)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Company', fontsize=12)
fig.tight_layout()
fig.savefig('2_company_distribution.png')
plt.close(fig)

# 3. Listings per laptop type, most frequent first
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='TypeName', data=df_cleaned, order=df_cleaned['TypeName'].value_counts().index, ax=ax)
ax.set_title('Number of Laptops by Type', fontsize=16)
ax.set_xlabel('Laptop Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
fig.savefig('3_typename_distribution.png')
plt.close(fig)

# 4. Listings per RAM size, ordered by size
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Ram', data=df_cleaned, order=df_cleaned['Ram'].value_counts().sort_index().index, ax=ax)
ax.set_title('Number of Laptops by RAM', fontsize=16)
ax.set_xlabel('RAM (GB)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
fig.savefig('4_ram_distribution.png')
plt.close(fig)

# 5. Listings per operating system, most frequent first
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='OpSys', data=df_cleaned, order=df_cleaned['OpSys'].value_counts().index, ax=ax)
ax.set_title('Number of Laptops by Operating System', fontsize=16)
ax.set_xlabel('Operating System', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('5_opsys_distribution.png')
plt.close(fig)

# 6. Weight distribution
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df_cleaned['Weight'], kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Laptop Weights', fontsize=16)
ax.set_xlabel('Weight (kg)', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
fig.savefig('6_weight_distribution.png')
plt.close(fig)


In [31]:
# Relationships between price and the other features. Each figure is written
# to a PNG file and then closed.

# 7. Price by company, ordered by median price
fig, ax = plt.subplots(figsize=(12, 8))
company_order = df_cleaned.groupby('Company')['Price'].median().sort_values().index
sns.boxplot(y='Company', x='Price', data=df_cleaned, order=company_order, ax=ax)
ax.set_title('Laptop Price by Company', fontsize=16)
ax.set_xlabel('Price', fontsize=12)
ax.set_ylabel('Company', fontsize=12)
fig.tight_layout()
fig.savefig('7_price_vs_company.png')
plt.close(fig)

# 8. Price by RAM size
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Ram', y='Price', data=df_cleaned, order=sorted(df_cleaned['Ram'].unique()), ax=ax)
ax.set_title('Laptop Price by RAM', fontsize=16)
ax.set_xlabel('RAM (GB)', fontsize=12)
ax.set_ylabel('Price', fontsize=12)
fig.savefig('8_price_vs_ram.png')
plt.close(fig)

# 9. Price by laptop type
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='TypeName', y='Price', data=df_cleaned, ax=ax)
ax.set_title('Laptop Price by Type', fontsize=16)
ax.set_xlabel('Laptop Type', fontsize=12)
ax.set_ylabel('Price', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
fig.savefig('9_price_vs_type.png')
plt.close(fig)

# 10. Price by GPU brand
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Gpu_Brand', y='Price', data=df_cleaned, ax=ax)
ax.set_title('Laptop Price by GPU Brand', fontsize=16)
ax.set_xlabel('GPU Brand', fontsize=12)
ax.set_ylabel('Price', fontsize=12)
fig.savefig('10_price_vs_gpu.png')
plt.close(fig)

# 11. Price against weight
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='Weight', y='Price', data=df_cleaned, alpha=0.5, ax=ax)
ax.set_title('Price vs. Weight', fontsize=16)
ax.set_xlabel('Weight (kg)', fontsize=12)
ax.set_ylabel('Price', fontsize=12)
fig.savefig('11_price_vs_weight.png')
plt.close(fig)

# 12. Screen size against weight
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='Inches', y='Weight', data=df_cleaned, alpha=0.5, ax=ax)
ax.set_title('Screen Size vs. Weight', fontsize=16)
ax.set_xlabel('Screen Size (Inches)', fontsize=12)
ax.set_ylabel('Weight (kg)', fontsize=12)
fig.savefig('12_screensize_vs_weight.png')
plt.close(fig)

# 13. Correlation heatmap over all numeric columns
fig, ax = plt.subplots(figsize=(16, 12))
numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
corr_matrix = df_cleaned[numeric_cols].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Numeric Features', fontsize=16)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
fig.savefig('13_correlation_heatmap.png')
plt.close(fig)

# 14. Price against screen size, coloured by the touchscreen flag
fig, ax = plt.subplots(figsize=(12, 7))
sns.scatterplot(x='Inches', y='Price', hue='Touchscreen', data=df_cleaned, alpha=0.7, ax=ax)
ax.set_title('Price vs. Screen Size (Colored by Touchscreen)', fontsize=16)
ax.set_xlabel('Screen Size (Inches)', fontsize=12)
ax.set_ylabel('Price', fontsize=12)
ax.legend(title='Touchscreen', loc='upper left')
fig.savefig('14_price_vs_inches_vs_touchscreen.png')
plt.close(fig)
print("All visualizations have been saved as .png files.")


All visualizations have been saved as .png files.


In [32]:
# Banner for the written interpretation.
print("=" * 50, "--- Task 5: Analysis and Interpretation (Sample) ---", "=" * 50, sep="\n")

# The summary is fixed text: the figures quoted in it are literals and are
# not recomputed from the data.
analysis_summary = """
1. Price Distribution:
   - The laptop prices are right-skewed; most laptops are budget to mid-range.

2. Market Landscape:
   - Dell, Lenovo, and HP dominate in number of models.
   - Premium brands include Razer, LG, MSI, Apple.
   - Budget brands include Acer, Lenovo.

3. Key Price Drivers:
   - RAM positively correlates with price (~+0.74)
   - SSD size correlates strongly with price (+0.67)
   - CPU speed moderately correlates with price.
   - Weight shows some positive correlation due to gaming/workstation laptops.

4. Feature Impact:
   - Gaming and Ultrabooks categories are most expensive.
   - Nvidia and AMD GPUs command higher prices than integrated Intel GPUs.
   - Touchscreen and IPS panel features also positively influence price.

Recommendations:
- For budget consumers: prioritize at least 8GB RAM and SSD storage.
- For manufacturers: high RAM, large SSDs, and powerful GPUs increase premium pricing.
"""
print(analysis_summary)


--- Task 5: Analysis and Interpretation (Sample) ---

1. Price Distribution:
   - The laptop prices are right-skewed; most laptops are budget to mid-range.

2. Market Landscape:
   - Dell, Lenovo, and HP dominate in number of models.
   - Premium brands include Razer, LG, MSI, Apple.
   - Budget brands include Acer, Lenovo.

3. Key Price Drivers:
   - RAM positively correlates with price (~+0.74)
   - SSD size correlates strongly with price (+0.67)
   - CPU speed moderately correlates with price.
   - Weight shows some positive correlation due to gaming/workstation laptops.

4. Feature Impact:
   - Gaming and Ultrabooks categories are most expensive.
   - Nvidia and AMD GPUs command higher prices than integrated Intel GPUs.
   - Touchscreen and IPS panel features also positively influence price.

Recommendations:
- For budget consumers: prioritize at least 8GB RAM and SSD storage.
- For manufacturers: high RAM, large SSDs, and powerful GPUs increase premium pricing.

