In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from scipy import stats

# 1. Load the dataset
# Load the CSV file into a pandas DataFrame (the file is read as ISO-8859-1).
df = pd.read_csv('laptop_price.csv', encoding='ISO-8859-1')

# Strip leading/trailing whitespace from the column names right away, so the
# inspection output below and every later column lookup use the same names.
df.columns = df.columns.str.strip()

# 2. Inspect the first ten rows
print("First 10 rows of the dataset:")
print(df.head(10))

# 3. Get the shape of the dataset
print(f"Dataset Shape (rows, columns): {df.shape}")

# 4. Get dataset information
print("Dataset Info:")
df.info()  # info() prints its report itself and returns None

# 5. Identify missing values
print("Missing Values per Column:")
print(df.isnull().sum())

# 6. Handle missing values
# Nothing is imputed or dropped at this point. Entries such as 'No OS' are
# treated as a valid category rather than as missing data.
# If explicit NaNs need handling, the usual options are:
#   - df.ffill() to forward fill,
#   - df.dropna() to drop incomplete rows,
#   - filling with the mean/median (numerical) or the mode (categorical).

# 7. Address outliers in numerical features
# 'Weight' and 'Price_euros' are the numerical columns treated for outliers;
# both are cleaned and converted to float first.

# Remember how many values are already missing so that values lost during
# conversion can be reported instead of disappearing silently.
missing_before = {
    col: int(df[col].isna().sum()) for col in ('Weight', 'Price_euros')
}

# Clean 'Weight': drop a trailing 'kg'/'kgs' unit and convert to float.
# Casting to str first keeps this step working when the column is already
# numeric; unparseable entries become NaN, exactly like 'Price_euros' below,
# instead of aborting the whole script.
weight_text = df['Weight'].astype(str).str.strip()
weight_text = weight_text.str.replace(r'\s*kgs?$', '', case=False, regex=True)
df['Weight'] = pd.to_numeric(weight_text, errors='coerce')

# Ensure 'Price_euros' is float; any non-numeric entry is coerced to NaN.
df['Price_euros'] = pd.to_numeric(df['Price_euros'], errors='coerce')

# Report values that could not be parsed (only printed when there are any).
for col, n_before in missing_before.items():
    n_coerced = int(df[col].isna().sum()) - n_before
    if n_coerced:
        print(f"Warning: {n_coerced} unparseable value(s) in '{col}' were set to NaN.")

# Let's check for outliers in 'Price_euros' and 'Weight' using the IQR method
def handle_outliers_iqr(df, column):
    """Cap the values of ``column`` at the 1.5 * IQR fences.

    Values above ``Q3 + 1.5 * IQR`` are replaced by that upper fence and
    values below ``Q1 - 1.5 * IQR`` by the lower fence; rows are never
    removed. The column is overwritten inside ``df`` itself.

    Args:
        df: DataFrame holding the column to treat.
        column: Name of a numeric column in ``df``.

    Returns:
        The same ``df`` object, with ``column`` capped.
    """
    series = df[column]
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    floor = first_quartile - 1.5 * spread
    ceiling = third_quartile + 1.5 * spread

    # Cap the high side first, then the low side, and write back once.
    capped_high = np.where(series > ceiling, ceiling, series)
    df[column] = np.where(capped_high < floor, floor, capped_high)

    return df

# Handle outliers for Price and Weight
df = handle_outliers_iqr(df, 'Price_euros')
df = handle_outliers_iqr(df, 'Weight')

print("Outliers in 'Price_euros' and 'Weight' have been capped using IQR method.")

# 8. Convert categorical features into numerical representations

# Identify categorical columns (reported for information only)
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical Columns: {categorical_columns}")

# --- Encoding Strategies ---
# a. Label Encoding for ordinal data (if any) or when the number of categories
#    is so high that one column per category would be impractical.
#    It is used for 'TypeName', which may have an implicit order
#    (e.g., Notebook < Ultrabook < Gaming < Workstation), though this is
#    debatable, and for 'ScreenResolution' because of its high cardinality.
#
# b. One-Hot Encoding for nominal data where no order exists and cardinality
#    is low to moderate. This creates one indicator column per category.
#    It is used for 'Company' and 'OpSys'.
#
# c. 'laptop_ID' and 'Product' are dropped as likely unique identifiers.
#    'Cpu', 'Memory' and 'Gpu' are dropped for now because parsing them is
#    complex for this task. 'Inches' is converted to a number directly.
#
# NOTE: 'Ram' is neither dropped nor encoded here, so it is still a string
# column (e.g. '8GB') in df_processed and must be cleaned before modelling.

# Drop high cardinality or complex columns
columns_to_drop = ['laptop_ID', 'Product', 'Cpu', 'Memory', 'Gpu']
# errors='ignore' skips absent columns, so report only what was really dropped.
dropped_columns = [col for col in columns_to_drop if col in df.columns]
df_processed = df.drop(columns=columns_to_drop, errors='ignore')
print(f"Dropped columns: {dropped_columns}")

# Convert 'Inches' to numerical; non-numeric entries become NaN
df_processed['Inches'] = pd.to_numeric(df_processed['Inches'], errors='coerce')

# Drop rows with missing values (e.g. introduced by the numeric coercions)
# BEFORE encoding, so that the encoders are fitted only on rows that are kept
# and a missing value is never turned into a category of its own.
df_processed = df_processed.dropna()

# One-Hot Encoding for 'Company' and 'OpSys'
df_processed = pd.get_dummies(df_processed, columns=['Company', 'OpSys'], prefix=['Company', 'OS'])

# Label Encoding for 'TypeName' and 'ScreenResolution'
# The fitted encoders are kept so the integer codes can be mapped back later.
label_encoders = {}
for col in ['TypeName', 'ScreenResolution']:
    le = LabelEncoder()
    df_processed[col + '_encoded'] = le.fit_transform(df_processed[col])
    label_encoders[col] = le
    df_processed = df_processed.drop(columns=[col])  # Drop original column after encoding

print("Categorical features have been encoded:")
print("- 'Company' and 'OpSys' used One-Hot Encoding.")
print("- 'TypeName' and 'ScreenResolution' used Label Encoding.")
print("- 'laptop_ID', 'Product', 'Cpu', 'Memory', 'Gpu' were dropped due to high cardinality/complexity.")

# 9. Final check
print("Final Processed Dataset Info:")
# info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df_processed.info()
print("\nFirst 5 rows of processed data:")
print(df_processed.head())
print("\nShape of processed data:", df_processed.shape)

# Optional: Feature Scaling (Standardization)
# This is often useful for algorithms like SVM, KNN, Neural Networks.
# Only numerical features are scaled. 'Inches' and a numeric 'Ram' (after
# removing 'GB') would be further candidates; for this example only 'Weight'
# and 'Price_euros' are scaled, into new '*_scaled' columns.
scaler = StandardScaler()
df_processed[['Weight_scaled', 'Price_euros_scaled']] = scaler.fit_transform(df_processed[['Weight', 'Price_euros']])
print("\nNumerical features 'Weight' and 'Price_euros' have been scaled (StandardScaler).")


First 10 rows of the dataset:
   laptop_ID Company          Product   TypeName  Inches  \
0          1   Apple      MacBook Pro  Ultrabook    13.3   
1          2   Apple      Macbook Air  Ultrabook    13.3   
2          3      HP           250 G6   Notebook    15.6   
3          4   Apple      MacBook Pro  Ultrabook    15.4   
4          5   Apple      MacBook Pro  Ultrabook    13.3   
5          6    Acer         Aspire 3   Notebook    15.6   
6          7   Apple      MacBook Pro  Ultrabook    15.4   
7          8   Apple      Macbook Air  Ultrabook    13.3   
8          9    Asus  ZenBook UX430UN  Ultrabook    14.0   
9         10    Acer          Swift 3  Ultrabook    14.0   

                     ScreenResolution                         Cpu   Ram  \
0  IPS Panel Retina Display 2560x1600        Intel Core i5 2.3GHz   8GB   
1                            1440x900        Intel Core i5 1.8GHz   8GB   
2                   Full HD 1920x1080  Intel Core i5 7200U 2.5GHz   8GB   
3  IPS Pa