In [2]:
import pandas as pd

In [3]:
# Load the raw CSV, report how many rows it has, and preview the first rows.
# The count is taken before any cleaning, hence "including duplicates".
raw_df = pd.read_csv('laptopData.csv')
print(f"Rows: {len(raw_df)} (including duplicates)")
raw_df.head()

Rows: 770 (including duplicates)


Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
0,0.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,71378.6832
1,1.0,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,47895.5232
2,2.0,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,30636.0
3,3.0,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,135195.336
4,4.0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,96095.808


In [4]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw laptop table.

    Steps, in order:

    1. Drop every row that contains a missing value.
    2. Drop the 'Unnamed: 0' column when it is present.
    3. Drop duplicate rows. This deliberately runs *after* step 2: while
       'Unnamed: 0' is still in the frame, two rows that agree on every
       other column but carry different 'Unnamed: 0' values are not
       considered duplicates and would both survive.
    4. Add 'Storage_Type' ('SSD', 'HDD' or 'Flash'), derived from 'Memory'.
    5. Truncate 'Price' to an integer, divide by 10, and truncate again.

    Args:
        df: Raw frame. Must contain 'Memory' (strings) and 'Price'
            (numeric) columns; 'Unnamed: 0' is optional.

    Returns:
        A new DataFrame; ``df`` itself is not modified. Surviving rows keep
        their original index labels.

    Raises:
        KeyError: If 'Memory' or 'Price' is missing from ``df``.
    """

    def get_storage_type(memory: str) -> str:
        # 'SSD' is tested first, so a value mentioning both SSD and HDD is
        # labelled 'SSD'. Anything that mentions neither falls back to 'Flash'.
        if 'SSD' in memory:
            return 'SSD'
        if 'HDD' in memory:
            return 'HDD'
        return 'Flash'

    # Missing values are filtered on the full frame (including 'Unnamed: 0').
    complete_rows = df.dropna()

    # NOTE(review): 'Unnamed: 0' looks like a row index saved by an earlier
    # export -- confirm. It must be gone before de-duplicating (see docstring).
    without_row_id = complete_rows.drop(columns=['Unnamed: 0'], errors='ignore')
    rows = without_row_id.drop_duplicates()

    # Truncate toward zero first, then scale, then truncate again.
    # NOTE(review): the notebook does not say why the price is divided by 10
    # -- confirm the intended unit.
    whole_price = rows['Price'].astype(int)

    # assign() builds a fresh frame instead of writing into a filtered one,
    # which avoids pandas' SettingWithCopyWarning.
    return rows.assign(
        Storage_Type=rows['Memory'].apply(get_storage_type),
        Price=(whole_price / 10).astype(int),
    )

# Clean the raw frame loaded earlier and preview the first rows of the result.
clean_df = clean_data(raw_df)
clean_df.head()


Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Storage_Type
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,7137,SSD
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,4789,Flash
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,3063,SSD
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,13519,SSD
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,9609,SSD


# طريقة أخرى أسهل

In [5]:
def clean_laptop_data(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw laptop table and return the result as a new frame.

    Rows with any missing value are removed, the 'Unnamed: 0' column is
    dropped when present, duplicate rows are removed, a 'Storage_Type'
    column ('SSD', 'HDD' or 'Flash') is derived from 'Memory', and 'Price'
    is truncated to an integer, divided by 10, and truncated again.

    Duplicates are removed only after 'Unnamed: 0' has been dropped.
    Otherwise rows that differ solely in that column would not count as
    duplicates and would remain in the output.

    Args:
        df: Raw frame. Must contain 'Memory' (strings) and 'Price'
            (numeric) columns; 'Unnamed: 0' is optional.

    Returns:
        A new DataFrame; ``df`` itself is not modified. Surviving rows keep
        their original index labels.

    Raises:
        KeyError: If 'Memory' or 'Price' is missing from ``df``.
    """
    # Remove rows with missing values (checked on all columns of the input).
    complete = df.dropna()

    # Drop the 'Unnamed: 0' column if it exists.
    # NOTE(review): presumably a row index saved by an earlier export -- confirm.
    if 'Unnamed: 0' in complete.columns:
        complete = complete.drop(columns=['Unnamed: 0'])

    # Remove duplicate rows now that the per-row column is gone. The explicit
    # copy() lets the column assignments below write to an independent frame
    # without triggering pandas' SettingWithCopyWarning.
    result = complete.drop_duplicates().copy()

    # Add the Storage_Type column based on the content of the Memory column.
    def get_storage_type(x):
        # 'SSD' wins when both SSD and HDD appear; 'Flash' is the fallback.
        if 'SSD' in x:
            return 'SSD'
        elif 'HDD' in x:
            return 'HDD'
        else:
            return 'Flash'

    result['Storage_Type'] = result['Memory'].apply(get_storage_type)

    # Adjust the price column: truncate, divide by 10, truncate again.
    # NOTE(review): the reason for dividing by 10 is not stated -- confirm.
    result['Price'] = result['Price'].astype(int)
    result['Price'] = (result['Price'] / 10).astype(int)

    return result

In [6]:
# Write the cleaned frame to disk. The path lives in one variable so the
# confirmation message always names the file that was actually written
# (the message previously reported 'clean_sales.csv').
clean_csv_path = 'clean_laptop.csv'
clean_df.to_csv(clean_csv_path, index=False)
print(f"Clean file written: {clean_csv_path}")
print(f"Clean rows: {len(clean_df)}")

Clean file written: clean_sales.csv
Clean rows: 744


In [7]:
def run_etl(input_path: str = 'laptopData.csv',
            output_path: str = 'clean_laptop.csv',
            verbose: bool = True) -> pd.DataFrame:
    """Run the extract / transform / load steps end to end.

    Reads ``input_path`` as CSV, cleans it with ``clean_data``, writes the
    cleaned frame to ``output_path`` (without the index), and returns it.

    Args:
        input_path: CSV file to read.
        output_path: CSV file to write the cleaned data to.
        verbose: When True, print a progress line for each stage and a
            final completion message.

    Returns:
        The cleaned DataFrame that was written to ``output_path``.
    """

    def report(message: str, **print_options) -> None:
        # Single gate for all progress output.
        if verbose:
            print(message, **print_options)

    # Each stage prints its label without a newline, then 'Done' once finished.
    report('Extract ▶ ', end='')
    extracted = pd.read_csv(input_path)
    report('Done')

    report('Transform ▶ ', end='')
    transformed = clean_data(extracted)
    report('Done')

    report('Load ▶ ', end='')
    transformed.to_csv(output_path, index=False)
    report('Done')

    report('Pipeline completed successfully!')

    return transformed

# Run the whole pipeline with its default input/output paths and preview the result.
df = run_etl()
df.head()


Extract ▶ Done
Transform ▶ Done
Load ▶ Done
Pipeline completed successfully!


Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Storage_Type
0,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 2.3GHz,8GB,128GB SSD,Intel Iris Plus Graphics 640,macOS,1.37kg,7137,SSD
1,Apple,Ultrabook,13.3,1440x900,Intel Core i5 1.8GHz,8GB,128GB Flash Storage,Intel HD Graphics 6000,macOS,1.34kg,4789,Flash
2,HP,Notebook,15.6,Full HD 1920x1080,Intel Core i5 7200U 2.5GHz,8GB,256GB SSD,Intel HD Graphics 620,No OS,1.86kg,3063,SSD
3,Apple,Ultrabook,15.4,IPS Panel Retina Display 2880x1800,Intel Core i7 2.7GHz,16GB,512GB SSD,AMD Radeon Pro 455,macOS,1.83kg,13519,SSD
4,Apple,Ultrabook,13.3,IPS Panel Retina Display 2560x1600,Intel Core i5 3.1GHz,8GB,256GB SSD,Intel Iris Plus Graphics 650,macOS,1.37kg,9609,SSD
