In [None]:
import pandas as pd
import os


# Directory holding the raw CSV exports; later cells reuse this name.
folder = "dataset"

# Load both raw exports and report their sizes.
df1 = pd.read_csv(os.path.join(folder, "data.csv"))
df2 = pd.read_csv(os.path.join(folder, "ouedkniss_informatique_ordinateur_portable_20250811_180006.csv"))

print("Dataset 1 shape:", df1.shape)
print("Dataset 2 shape:", df2.shape)

# The "description" column is not carried into the merge; drop it if it exists.
df2 = df2.drop(columns=["description"], errors="ignore")

# Column-name translation tables. Keys that are absent from a frame are simply
# ignored by DataFrame.rename, so each map may list more names than its frame has.
rename_map_1 = {
    "DEDICATED_GPU": "gpu_dedicated",
    "CPU": "cpu",
    "RAM_SIZE": "ram_size",
    "RAM_TYPE": "ram_type",
    "SSD_SIZE": "ssd_size",
    "HDD_SIZE": "hdd_size",
    "SCREEN_SIZE": "screen_size",
    "SCREEN_FREQUENCY": "screen_frequency",
    "SCREEN_RESOLUTION": "screen_resolution",
    "spec_Etat": "condition",
    "model_name": "model_name"
}

rename_map_2 = {
    "spec_Carte graphique": "gpu_general",
    "spec_Reférence carte graphique dédiée": "gpu_dedicated",
    "spec_Reférence carte graphique integrée": "gpu_integrated",
    "spec_Processeur": "cpu",
    "spec_RAM": "ram_size",
    "spec_Marque": "brand",
    "spec_Taille du disque": "storage_size",
    "spec_Type disque": "storage_type",
    "spec_Taille écran": "screen_size",
    "spec_Etat": "condition"
}

df1 = df1.rename(columns=rename_map_1)
df2 = df2.rename(columns=rename_map_2)

# Give both frames the same (sorted union) set of columns; columns a frame
# does not have are created as all-missing by reindex.
all_features = df1.columns.union(df2.columns)
df1_aligned, df2_aligned = (
    frame.reindex(columns=all_features) for frame in (df1, df2)
)

# Stack the aligned frames with a fresh 0..n-1 index.
final_df = pd.concat((df1_aligned, df2_aligned), ignore_index=True)

# Replace missing values with explicit placeholders:
# text (object) columns get 'NeedToBeFilled' ...
cat_cols = final_df.select_dtypes(include='object').columns
final_df[cat_cols] = final_df[cat_cols].fillna('NeedToBeFilled')

# ... and numeric columns get -1.
num_cols = final_df.select_dtypes(include='number').columns
final_df[num_cols] = final_df[num_cols].fillna(-1)

# Persist the merged table and summarise it.
final_df.to_csv("final_dataset.csv", index=False)
print("\nFinal merged dataset saved as 'final_dataset.csv'")
print("Shape:", final_df.shape)
print("Columns:", list(final_df.columns))


Dataset 1 shape: (16406, 14)
Dataset 2 shape: (20819, 16)

Final merged dataset saved as 'final_dataset.csv'
Shape: (37225, 21)
Columns: ['brand', 'city', 'condition', 'cpu', 'created_at', 'gpu_dedicated', 'gpu_general', 'gpu_integrated', 'hdd_size', 'model_name', 'price_preview', 'ram_size', 'ram_type', 'reference', 'screen_frequency', 'screen_resolution', 'screen_size', 'ssd_size', 'storage_size', 'storage_type', 'title']


In [None]:
import pandas as pd


# Mapping from the merged frame's column names to the upper-case names used
# by the cleaned Kaggle-style dataset. Its key order also defines the column
# order of the output.
kaggle_style_names = {
    'price_preview': 'PRICE',
    'condition': 'LAPTOP_CONDITION',
    'brand': 'LAPTOP_BRAND',
    'model_name': 'LAPTOP_MODEL',
    'gpu_dedicated': 'DEDICATED_GPU',
    'gpu_general': 'GPU_GENERAL',
    'gpu_integrated': 'GPU_INTEGRATED',
    'cpu': 'CPU',
    'ram_size': 'RAM_SIZE',
    'ram_type': 'RAM_TYPE',
    'ssd_size': 'SSD_SIZE',
    'hdd_size': 'HDD_SIZE',
    'storage_size': 'STORAGE_SIZE',
    'storage_type': 'STORAGE_TYPE',
    'screen_size': 'SCREEN_SIZE',
    'screen_frequency': 'SCREEN_FREQUENCY',
    'screen_resolution': 'SCREEN_RESOLUTION',
    'city': 'CITY'
}

# Columns taken from final_df: every renamed feature plus the raw timestamp,
# which is only needed to derive the posting year and month.
selected_cols = list(kaggle_style_names) + ['created_at']

ml_df = final_df[selected_cols].copy()

# Pull the timestamp column out of the frame and parse it. Values that cannot
# be parsed as dates (including any placeholder strings) become NaT, which is
# then encoded as -1 in the derived integer columns.
posted_at = pd.to_datetime(ml_df.pop('created_at'), errors='coerce')
ml_df['POST_YEAR'] = posted_at.dt.year.fillna(-1).astype(int)
ml_df['POST_MONTH'] = posted_at.dt.month.fillna(-1).astype(int)

# Rename columns to match cleaned Kaggle style
ml_df = ml_df.rename(columns=kaggle_style_names)

# Fill any remaining gaps: text columns with 'NeedToBeFilled', numeric with -1.
cat_cols = ml_df.select_dtypes(include='object').columns
num_cols = ml_df.select_dtypes(include='number').columns

ml_df[cat_cols] = ml_df[cat_cols].fillna('NeedToBeFilled')
ml_df[num_cols] = ml_df[num_cols].fillna(-1)

# Write the result to disk and summarise it.
ml_df.to_csv("merged_price_prediction_ready.csv", index=False)

print("✅ ML-ready dataset saved as 'merged_price_prediction_ready.csv'")
print("Shape:", ml_df.shape)
print("Columns:", list(ml_df.columns))


✅ ML-ready dataset saved as 'merged_price_prediction_ready.csv'
Shape: (37225, 20)
Columns: ['PRICE', 'LAPTOP_CONDITION', 'LAPTOP_BRAND', 'LAPTOP_MODEL', 'DEDICATED_GPU', 'GPU_GENERAL', 'GPU_INTEGRATED', 'CPU', 'RAM_SIZE', 'RAM_TYPE', 'SSD_SIZE', 'HDD_SIZE', 'STORAGE_SIZE', 'STORAGE_TYPE', 'SCREEN_SIZE', 'SCREEN_FREQUENCY', 'SCREEN_RESOLUTION', 'CITY', 'POST_YEAR', 'POST_MONTH']


In [None]:
import pandas as pd


# Reload the dataset written by the previous step and load the cleaned Kaggle
# file. `os` and `folder` come from the first cell of the notebook.
my_data = pd.read_csv("merged_price_prediction_ready.csv")
kaggle_data = pd.read_csv(os.path.join(folder, "laptop_price_prediction_cleaned.csv"))

# Reindex both frames onto the sorted union of their columns so they can be
# stacked; columns missing from a frame are created as all-missing.
all_cols = my_data.columns.union(kaggle_data.columns)
my_data_aligned, kaggle_data_aligned = (
    source.reindex(columns=all_cols) for source in (my_data, kaggle_data)
)

# Stack the two sources with a fresh 0..n-1 index.
full_dataset = pd.concat([my_data_aligned, kaggle_data_aligned], ignore_index=True)

# Determine both column groups up front, then fill each with its placeholder:
# 'NeedToBeFilled' for text (object) columns, -1 for numeric columns.
cat_cols = full_dataset.select_dtypes(include='object').columns
num_cols = full_dataset.select_dtypes(include='number').columns

for _cols, _placeholder in ((cat_cols, 'NeedToBeFilled'), (num_cols, -1)):
    full_dataset[_cols] = full_dataset[_cols].fillna(_placeholder)

# Write the combined table to disk and summarise it.
full_dataset.to_csv("full_merged_dataset.csv", index=False)

print("✅ Full merged dataset saved as 'full_merged_dataset.csv'")
print("Shape:", full_dataset.shape)
print("Columns:", list(full_dataset.columns))


✅ Full merged dataset saved as 'full_merged_dataset.csv'
Shape: (53445, 20)
Columns: ['CITY', 'CPU', 'DEDICATED_GPU', 'GPU_GENERAL', 'GPU_INTEGRATED', 'HDD_SIZE', 'LAPTOP_BRAND', 'LAPTOP_CONDITION', 'LAPTOP_MODEL', 'POST_MONTH', 'POST_YEAR', 'PRICE', 'RAM_SIZE', 'RAM_TYPE', 'SCREEN_FREQUENCY', 'SCREEN_RESOLUTION', 'SCREEN_SIZE', 'SSD_SIZE', 'STORAGE_SIZE', 'STORAGE_TYPE']
