In [1]:
!pip install pandas
!pip install numpy
!pip install matplot
!pip install requests

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.23.2 (from pandas)
  Using cached numpy-2.2.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
Using cached numpy-2.2.1-cp311-cp311-win_amd64.whl (12.9 MB)
Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.2.1 pandas-2.2.3 pytz-2024.2 tzdata-2024.2
Collecting matplot
  Using cached matplot-0.1.9-py2.py3-none-any.whl.metadata (241 bytes)
Collecting pyloco>=0.0.134 (from matplot)
  Using cached pyloco-0.0.139-py2.py3-none-any.whl.metadata (1.1 kB)
Collecting matplotlib>

In [2]:
import pandas as pd

import requests
import zipfile
import os

import numpy as np

import matplotlib.pyplot as plt


pd.set_option('display.max_columns', None)  # or 1000
pd.set_option('display.max_rows', 5)  # or 1000
pd.set_option('display.max_colwidth', None)  # or 199


In [3]:
def clean_dataset(df):

    df = df.loc[:, ~df.columns.str.contains('Unnamed:')]

    na_pourcentage = (df.isna().sum()/len(df))*100

    print(na_pourcentage)

    drop_col = na_pourcentage[na_pourcentage > 90].index

    df = df.drop(columns=drop_col)

    return df

In [4]:

# Paths to the datasets
paths = {
    "https://www.kaggle.com/api/v1/datasets/download/berkayeserr/phone-prices",
    "https://www.kaggle.com/api/v1/datasets/download/rkiattisak/mobile-phone-price",
    "https://www.kaggle.com/api/v1/datasets/download/ganjerlawrence/mobile-phone-price-prediction-cleaned-dataset",
    "https://www.kaggle.com/api/v1/datasets/download/pratikgarai/mobile-phone-specifications-and-prices",
    #"https://www.kaggle.com/api/v1/datasets/download/shreyasur965/phone-search-dataset",
    "https://www.kaggle.com/api/v1/datasets/download/artempozdniakov/ukrainian-market-mobile-phones-data",
    "https://www.kaggle.com/api/v1/datasets/download/shaikmohammedzubair/mobile-phone-prices"
}

# Directory to save the extracted CSVs
output_directory = './datasets/'

# Ensure the directory exists
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Array to hold all CSV file paths
csv_files = []

for value in paths:
    url = value
    output_path = output_directory + value.split('/')[-1] + '.zip'

    print(f"Downloading {url} to {output_path}...")

    # Download the zip file
    response = requests.get(url, allow_redirects=True)
    
    with open(output_path, "wb") as file:
        file.write(response.content)

    print(f"Download completed for {url}!")

    # Extract the ZIP file
    with zipfile.ZipFile(output_path, 'r') as zip_ref:
        zip_ref.extractall(output_directory)

    print(f"Extraction completed for {output_path}!")



    # Optionally, remove the zip file after extraction to save space
    os.remove(output_path)
    print(f"Removed the zip file: {output_path}")

# Add all CSV files to the csv_files array
for extracted_file in os.listdir(output_directory):
    if extracted_file.endswith('.csv'):
        csv_file_path = os.path.join(output_directory, extracted_file)
        csv_files.append(csv_file_path)
        print(f"CSV file extracted: {csv_file_path}")

Downloading https://www.kaggle.com/api/v1/datasets/download/pratikgarai/mobile-phone-specifications-and-prices to ./datasets/mobile-phone-specifications-and-prices.zip...
Download completed for https://www.kaggle.com/api/v1/datasets/download/pratikgarai/mobile-phone-specifications-and-prices!
Extraction completed for ./datasets/mobile-phone-specifications-and-prices.zip!
Removed the zip file: ./datasets/mobile-phone-specifications-and-prices.zip
Downloading https://www.kaggle.com/api/v1/datasets/download/berkayeserr/phone-prices to ./datasets/phone-prices.zip...
Download completed for https://www.kaggle.com/api/v1/datasets/download/berkayeserr/phone-prices!
Extraction completed for ./datasets/phone-prices.zip!
Removed the zip file: ./datasets/phone-prices.zip
Downloading https://www.kaggle.com/api/v1/datasets/download/shaikmohammedzubair/mobile-phone-prices to ./datasets/mobile-phone-prices.zip...
Download completed for https://www.kaggle.com/api/v1/datasets/download/shaikmohammedzubai

In [5]:
df = pd.read_csv(csv_files[0])
df2 = pd.read_csv(csv_files[1])
df3 = pd.read_csv(csv_files[2])
df4 = pd.read_csv(csv_files[3])
df5 = pd.read_csv(csv_files[4])
df6 = pd.read_csv(csv_files[5])

In [6]:
print(df.keys().to_list())
print(df2.keys().to_list())
print(df3.keys().to_list())
print(df4.keys().to_list())
print(df5.keys().to_list())
print(df6.keys().to_list())

['title', 'rating', 'image', 'price', 'touch', 'quick_charge', 'gpu', 'processor_brand', 'ram', 'hd', 'battery', 'battery_type', 'warranty_time', 'java', 'brand', 'headset', 'charger', 'warranty', 'cable', 'pro', 'model', 'color1', 'color2', 'sim', 'display', 'height', 'width', 'os', 'core', 'clock_speed', 'storage', 'expandable', 'camera', 'secondary_camera', 'video', 'network']
['phone_name', 'brand', 'os', 'inches', 'resolution', 'battery', 'battery_type', 'ram(GB)', 'announcement_date', 'weight(g)', 'storage(GB)', 'video_720p', 'video_1080p', 'video_4K', 'video_8K', 'video_30fps', 'video_60fps', 'video_120fps', 'video_240fps', 'video_480fps', 'video_960fps', 'price(USD)']
['Brand', 'Model', 'Storage ', 'RAM ', 'Screen Size (inches)', 'Camera (MP)', 'Battery Capacity (mAh)', 'Price ($)']
['Ratings', 'RAM', 'ROM', 'Mobile_Size', 'Primary_Cam', 'Selfi_Cam', 'Battery_Power', 'Price']
['Unnamed: 0', 'Name', 'Brand', 'Model', 'Battery capacity (mAh)', 'Screen size (inches)', 'Touchscreen

In [7]:
df['currency'] = "INR"
df2['currency'] = "USD"
df3['currency'] = "USD"
df4['currency'] = "INR"
df5['currency'] = "INR"
df6['currency'] = "UAH"

In [8]:
df_cleaned  = (clean_dataset(df[['price','brand','currency']]))
df2_cleaned = (clean_dataset(df2[['price(USD)','brand','currency']]))
df3_cleaned = (clean_dataset(df3[['Price ($)','Brand','currency']]))
df4_cleaned = (clean_dataset(df4[['Price','currency']]))
df5_cleaned = (clean_dataset(df5[['Price','Brand','currency' ]]))
df6_cleaned = (clean_dataset(df6[['best_price','brand_name','currency']]))


price       0.0
brand       0.0
currency    0.0
dtype: float64
price(USD)    0.0
brand         0.0
currency      0.0
dtype: float64
Price ($)    0.0
Brand        0.0
currency     0.0
dtype: float64
Price       0.0
currency    0.0
dtype: float64
Price       0.0
Brand       0.0
currency    0.0
dtype: float64
best_price    0.0
brand_name    0.0
currency      0.0
dtype: float64


In [9]:
df_cleaned  = df_cleaned.rename(columns={'price': 'price', 'brand': 'brand'})
df2_cleaned = df2_cleaned.rename(columns={'price(USD)': 'price', 'brand': 'brand'})
df3_cleaned = df3_cleaned.rename(columns={'Price ($)': 'price', 'Brand': 'brand'})
df4_cleaned = df4_cleaned.rename(columns={'Price': 'price'})
df5_cleaned = df5_cleaned.rename(columns={'Price': 'price', 'Brand': 'brand'})
df6_cleaned = df6_cleaned.rename(columns={'best_price': 'price', 'brand_name': 'brand'})


In [10]:
df_combined = pd.concat([df_cleaned, df2_cleaned, df3_cleaned, df4_cleaned, df5_cleaned,df6_cleaned], axis=0, ignore_index=True)


In [11]:
df_combined['brand'] = df_combined['brand'].fillna(method='ffill')

df_combined['price'] = pd.to_numeric(df_combined['price'], errors='coerce')

mean_col2 = df_combined['price'].mean()

random_factors = np.random.uniform(-1, 1, size=df_combined['price'].isna().sum())

nan_indices = df_combined['price'].isna()

df_combined.loc[nan_indices, 'price'] = mean_col2 + random_factors

  df_combined['brand'] = df_combined['brand'].fillna(method='ffill')


In [12]:
exchange_rates = {
    'INR': 0.011,  
    'UAH': 0.027,
    'USD': 0.98,   
}

def convert_to_eur(row):
    return row['price'] * exchange_rates.get(row['currency'], 1)

# Ajouter la colonne 'price_euro'
df_combined['price_euro'] = df_combined.apply(convert_to_eur, axis=1)

# Réorganiser les colonnes pour mettre 'price_euro' en premier
df_combined = df_combined[['price_euro'] + [col for col in df_combined.columns if col != 'price_euro']]



In [13]:
df_combined

Unnamed: 0,price_euro,price,brand,currency
0,142.989,12999.0,POCO,INR
1,153.989,13999.0,POCO,INR
...,...,...,...,...
5857,506.385,18755.0,ZTE,UAH
5858,24.489,907.0,Sigma mobile,UAH


In [14]:
pourcentage = df['rating'].round().value_counts(normalize=True) * 100
print(pourcentage)

rating
4.0    93.272727
5.0     4.000000
3.0     2.727273
Name: proportion, dtype: float64


In [15]:
df_combined['price_euro'] = round(df_combined['price_euro'],1)
df_combined['price'] = round(df_combined['price'],1)

In [16]:
df = df_combined

def add_gaussian_noise(data, noise_level=0.02):
    noise = np.random.normal(0, noise_level * data, data.shape)
    return data + noise

df_augmented = df.copy()
df_augmented["price_euro"] = add_gaussian_noise(df["price_euro"])
df_augmented["price"] = add_gaussian_noise(df["price"])

df_combined = pd.concat([df, df_augmented], ignore_index=True)

df_combined = df_combined.round()

df_combined.head()


Unnamed: 0,price_euro,price,brand,currency
0,143.0,12999.0,POCO,INR
1,154.0,13999.0,POCO,INR
2,165.0,14999.0,Motorola,INR
3,187.0,16999.0,POCO,INR
4,165.0,14999.0,Motorola,INR


In [17]:
ratings = [4.0, 5.0, 3.0]
probabilities = [0.9327, 0.04, 0.0273]
categories = {5.0: "Neuf", 4.0: "Bon état", 3.0: "Occasion"}

np.random.seed(42)
assigned_ratings = np.random.choice(ratings, size=df_combined.shape[0], p=probabilities)

df_combined["rating"] = assigned_ratings
df_combined["condition"] = df_combined["rating"].map(categories)


In [18]:
def adjust_price(row):
    if row['condition'] == "Neuf":
        factor = np.random.uniform(1.0, 1.15)
    elif row['condition'] == "Bon état":
        factor = np.random.uniform(0.5, 0.9)
    elif row['condition'] == "Occasion":
        factor = np.random.uniform(0.2, 0.4)
    else:
        factor = 1
    return round(row['price_euro'] * factor)

In [19]:
df_combined['price_adjuted'] = df_combined.apply(adjust_price, axis=1)

In [20]:
df_combined

Unnamed: 0,price_euro,price,brand,currency,rating,condition,price_adjuted
0,143.0,12999.0,POCO,INR,4.0,Bon état,86
1,154.0,13999.0,POCO,INR,5.0,Neuf,161
...,...,...,...,...,...,...,...
11716,490.0,18818.0,ZTE,UAH,4.0,Bon état,327
11717,24.0,885.0,Sigma mobile,UAH,4.0,Bon état,18


In [21]:
df_combined.to_csv('dataset.csv',index=False)