In [1]:
with open('smartprix_mobiles.html','r',encoding='utf-8') as f:
    html = f.read()

In [3]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [4]:
soup = BeautifulSoup(html,'lxml')

In [5]:
containers = soup.find_all('div',{'class':'sm-product has-tag has-features has-actions'})

In [14]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

# Initialize lists to store extracted data
names = []
prices = []
ratings = []
spec_scores = []  # List for specs scores
sim = []
processor = []
ram = []
battery = []
display = []
camera = []
card = []
os = []

# Iterate through each mobile product
for i in soup.find_all('div', {'class': 'sm-product has-tag has-features has-actions'}):
    # Default values for each product
    name = np.nan
    price = np.nan
    rating = np.nan
    current_spec_score = np.nan  # Initialize the specs score
    current_sim = np.nan
    current_processor = np.nan
    current_ram = np.nan
    current_battery = np.nan
    current_display = np.nan
    current_camera = np.nan
    current_card = np.nan
    current_os = np.nan

    # Extract mobile name
    name_tag = i.find('h2')  # Ensure we're finding the <h2> tag for the mobile name
    if name_tag:
        name = name_tag.text.strip()  # Clean up the text by stripping unnecessary whitespace

    # Extract price
    price_tag = i.find('span', {'class': 'price'})
    if price_tag:
        price = price_tag.text.strip()

    # Extract rating from the score divs
    rating_div = i.find('div', {'class': 'rating'})
    if rating_div:
        star_rating_span = rating_div.find('span', {'class': 'sm-rating'})
        if star_rating_span and 'style' in star_rating_span.attrs:
            rating = star_rating_span['style'].split(':')[-1].strip().replace(';', '')

    # Extract specs score (from class="score rank-1-bg" to "score rank-5-bg")
    score_div = i.find('div', {'class': 'score'})
    if score_div:
        b_tag = score_div.find('b')  # Look for the <b> tag that contains the score text
        if b_tag:
            current_spec_score = b_tag.text.strip()  # Extract the actual spec score from <b>

    # Extract specs
    specs_ul = i.find('ul', {'class': 'sm-feat specs'})
    if specs_ul:
        li_items = specs_ul.find_all('li')
        for li in li_items:
            li_text = li.text.lower()

            # Check for keywords in the spec text to categorize them
            if 'sim' in li_text:
                current_sim = li_text
            elif 'processor' in li_text:
                current_processor = li_text
            # Updated check for RAM: now checks for both 'ram' and 'inbuilt'
            elif ('ram' in li_text or 'inbuilt' in li_text) and 'memory card' not in li_text:
                # Include both formats for RAM
                if 'ram' in li_text:
                    current_ram = li_text  # e.g., "8 gb ram, 128 gb inbuilt"
                elif 'inbuilt' in li_text:
                    # If it doesn't mention 'ram' but contains 'inbuilt', use it as RAM
                    current_ram = li_text  # e.g., "128 gb inbuilt"
            elif 'mah' in li_text:
                current_battery = li_text
            elif 'inch' in li_text or 'px' in li_text:
                current_display = li_text
            elif 'mp' in li_text:
                current_camera = li_text
            elif 'card' in li_text:
                current_card = li_text
            elif any(os_keyword in li_text for os_keyword in ['android', 'ios', 'linux']):
                current_os = li_text

    # Append collected data
    names.append(name)
    prices.append(price)
    ratings.append(rating)
    spec_scores.append(current_spec_score)  # Add the specs score
    sim.append(current_sim)
    processor.append(current_processor)
    ram.append(current_ram)
    battery.append(current_battery)
    display.append(current_display)
    camera.append(current_camera)
    card.append(current_card)
    os.append(current_os)

# To store the results in a DataFrame
data = {
    'Name': names,
    'Price': prices,
    'Rating': ratings,
    'Specs Score': spec_scores,  # Include the specs score in the DataFrame
    'SIM': sim,
    'Processor': processor,
    'RAM': ram,
    'Battery': battery,
    'Display': display,
    'Camera': camera,
    'Card': card,
    'OS': os
}

# Create DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
df

Unnamed: 0,Name,Price,Rating,Specs Score,SIM,Processor,RAM,Battery,Display,Camera,Card,OS
0,Motorola Edge 50 Fusion,"₹20,999",4.65,85,"dual sim, 3g, 4g, 5g, volte, vo5g, wi-fi, nfc","snapdragon 7s gen2, octa core, 2.4 ghz processor","8 gb ram, 128 gb inbuilt",5000 mah battery with 68w fast charging,"6.67 inches, 1080 x 2400 px, 144 hz display wi...",50 mp + 13 mp dual rear & 32 mp front camera,memory card not supported,android v14
1,Motorola Moto G85 5G,"₹16,999",4.05,80,"dual sim, 3g, 4g, 5g, volte, wi-fi","snapdragon 6s gen3, octa core, 2.3 ghz processor","8 gb ram, 128 gb inbuilt",5000 mah battery with 33w fast charging,"6.67 inches, 1080 x 2400 px, 120 hz display wi...",50 mp + 8 mp dual rear & 32 mp front camera,memory card not supported,android v14
2,Motorola Moto G45 5G,"₹9,999",4.05,76,"dual sim, 3g, 4g, 5g, volte, wi-fi","snapdragon 6s gen3, octa core, 2.3 ghz processor","4 gb ram, 128 gb inbuilt",5000 mah battery with 20w fast charging,"6.5 inches, 720 x 1600 px, 120 hz display with...",50 mp + 2 mp dual rear & 16 mp front camera,"memory card (hybrid), upto 1 tb",android v14
3,Vivo V40 5G,"₹32,300",4.35,86,"dual sim, 3g, 4g, 5g, volte, wi-fi","snapdragon 7 gen3, octa core, 2.63 ghz processor","8 gb ram, 128 gb inbuilt",5500 mah battery with 80w fast charging,"6.78 inches, 1260 x 2800 px, 120 hz display wi...",50 mp + 50 mp dual rear & 50 mp front camera,memory card not supported,android v14
4,Samsung Galaxy M55s,"₹19,999",4.05,85,"dual sim, 3g, 4g, 5g, volte, wi-fi, nfc","snapdragon 7 gen1, octa core, 2.4 ghz processor","8 gb ram, 128 gb inbuilt",5000 mah battery with 45w fast charging,"6.7 inches, 1080 x 2400 px, 120 hz display wit...",50 mp + 8 mp + 2 mp triple rear & 50 mp front ...,"memory card (hybrid), upto 1 tb",android v14
...,...,...,...,...,...,...,...,...,...,...,...,...
1015,Samsung Galaxy S21 Plus,"₹81,999",4,88,"dual sim, 3g, 4g, 5g, volte, wi-fi, nfc","exynos 2100, octa core, 2.9 ghz processor","8 gb ram, 128 gb inbuilt",4800 mah battery with 25w fast charging,"6.7 inches, 1080 x 2400 px, 120 hz display wit...",64 mp + 12 mp + 12 mp triple rear & 10 mp fron...,,android v10
1016,Realme X3 SuperZoom Edition (8GB RAM + 256GB),"₹30,049",4.05,82,"dual sim, 3g, 4g, volte, wi-fi","snapdragon 855+, octa core, 2.96 ghz processor","8 gb ram, 256 gb inbuilt",4200 mah battery with 30w fast charging,"6.57 inches, 1080 x 2400 px, 120 hz display wi...",64 mp quad rear & 32 mp + 8 mp dual front camera,memory card not supported,android v10
1017,Micromax IN Note 1 (4GB RAM + 128GB),"₹8,990",4.55,75,"dual sim, 3g, 4g, volte, wi-fi","helio g85, octa core, 2 ghz processor","4 gb ram, 128 gb inbuilt",5000 mah battery with 18w fast charging,"6.67 inches, 1080 x 2400 px display with punch...",48 mp quad rear & 16 mp front camera,"memory card supported, upto 256 gb",android v10
1018,Samsung Galaxy F12,"₹10,490",4.65,73,"dual sim, 3g, 4g, volte, wi-fi","exynos 850, octa core, 2 ghz processor","4 gb ram, 64 gb inbuilt",6000 mah battery with 15w fast charging,"6.5 inches, 720 x 1600 px, 90 hz display with ...",48 mp quad rear & 8 mp front camera,"memory card supported, upto 512 gb",android v11


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1020 entries, 0 to 1019
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Name         1020 non-null   object
 1   Price        1020 non-null   object
 2   Rating       1020 non-null   object
 3   Specs Score  1020 non-null   object
 4   SIM          1019 non-null   object
 5   Processor    971 non-null    object
 6   RAM          1018 non-null   object
 7   Battery      1017 non-null   object
 8   Display      1020 non-null   object
 9   Camera       995 non-null    object
 10  Card         813 non-null    object
 11  OS           925 non-null    object
dtypes: object(12)
memory usage: 95.8+ KB


In [16]:
df.isnull().sum()

Name             0
Price            0
Rating           0
Specs Score      0
SIM              1
Processor       49
RAM              2
Battery          3
Display          0
Camera          25
Card           207
OS              95
dtype: int64

In [17]:
# ccode to check null processor values and also if it is feature phones then we will remove from dataframe
df[df['Processor'].isnull()].head()

Unnamed: 0,Name,Price,Rating,Specs Score,SIM,Processor,RAM,Battery,Display,Camera,Card,OS
287,Jio JioPhone Prima 2 4G,"₹2,799",4.35,25,"single sim, 4g, volte, wi-fi",,"512 mb ram, 4 gb inbuilt",2000 mah battery,"2.4 inches, 240 x 320 px display",0.3 mp rear & 0.3 mp front camera,"memory card supported, upto 128 gb",kaios v2.5.3
334,Jio Bharat J1 4G,"₹1,799",4.35,23,"single sim, 3g, 4g, volte",,"512 mb ram, 4 gb inbuilt",2500 mah battery,"2.8 inches, 240 x 320 px display",0.3 mp rear camera,"memory card supported, upto 128 gb",
350,Jio Bharat V2,₹999,4.5,22,"single sim, 3g, 4g, volte",,"512 mb ram, 4 gb inbuilt",1000 mah battery,"1.77 inches, 240 x 320 px display",0.3 mp rear camera,"memory card supported, upto 128 gb",
390,Nokia 220 4G (2024),"₹3,605",4.2,19,"dual sim, 3g, 4g",,"64 mb ram, 128 mb inbuilt",1450 mah battery,"2.8 inches, 240 x 320 px display",,"memory card supported, upto 32 gb",
421,iKall Z16 Plus,"₹5,999",4.4,55,"dual sim, 3g, 4g, volte, wi-fi",,"4 gb ram, 64 gb inbuilt",5000 mah battery,"6.53 inches, 720 x 1600 px display with punch ...",13 mp rear & 8 mp front camera,"memory card (hybrid), upto 128 gb",android v12


In [18]:
# all above processor null values are for feature phones so we have to remvoe it from the dataframe

# removing feature phones form the dataframe
df.drop([287, 334, 350, 390, 421, 425, 433, 435, 475, 485, 489, 546, 553, 579,
        591, 614, 617, 621, 637, 651, 662, 676, 679, 682, 684, 689, 697, 725,
        733, 737, 773, 784, 797, 800, 802, 812, 830, 880, 881, 886, 891, 898,
        899, 904, 909, 916, 922, 945, 996],inplace=True)

In [19]:
df[df['Processor'].isnull()]

Unnamed: 0,Name,Price,Rating,Specs Score,SIM,Processor,RAM,Battery,Display,Camera,Card,OS


In [22]:
# checking for null camera values if these phones are feature phones then will be removed
df[df['Camera'].isnull()]

Unnamed: 0,Name,Price,Rating,Specs Score,SIM,Processor,RAM,Battery,Display,Camera,Card,OS
730,Motorola Moto A10G,"₹1,275",4.1,13,"dual sim, 3g",1.5 mhz processor,"4 mb ram, 32 mb inbuilt",800 mah battery,"1.8 inches, 160 x 128 px display",,"memory card supported, upto 64 gb",
888,Lava A1 Clear,"₹1,085",4.25,14,dual sim,"single core, 1.5 mhz processor","4 mb ram, 32 mb inbuilt",1000 mah battery,"1.77 inches, 720 x 1280 px display",,"memory card supported, upto 32 gb",
889,itel it2165s,₹944,4.6,9,dual sim,"single core, 1.4 mhz processor","4 mb ram, 4 mb inbuilt",1200 mah battery,"2 inches, 160 x 128 px display",,"memory card supported, upto 32 gb",
913,Itel Ace 2 Power,"₹1,099",4.4,14,dual sim,1.2 mhz processor,"32 mb ram, 32 mb inbuilt",2500 mah battery,"1.8 inches, 240 x 120 px display",,,
914,itel Ace2 Lite,₹849,4.15,12,dual sim,1.2 mhz processor,"32 mb ram, 32 mb inbuilt",1000 mah battery,"1.8 inches, 240 x 120 px display",,,


In [23]:
# remvoing camrea null values as thees are feature phones
df.drop([730, 888, 889, 913, 914],inplace=True)

In [24]:
df.isnull().sum()

Name             0
Price            0
Rating           0
Specs Score      0
SIM              0
Processor        0
RAM              0
Battery          3
Display          0
Camera           0
Card           201
OS              44
dtype: int64

now the df is a little bit cleaned as the there is info which are missing is it is not present in the site info

In [43]:
df.to_csv('smartprix_mobile_data.csv', index=False)

In [194]:
# making copy of my original df
df_copy = df.copy()

### splitting sim column into multiple columns

In [195]:
# SIM Information
df_copy['Dual Sim'] = df_copy['sim'].apply(lambda x: 'Dual Sim' in x)

# Create a new column for each technology, considering hierarchy
df_copy['5G'] = df_copy['sim'].apply(lambda x: '5G' in x)
df_copy['4G'] = df_copy['sim'].apply(lambda x: '4G' in x and '5G' not in x)
df_copy['3G'] = df_copy['sim'].apply(lambda x: '3G' in x and '5G' not in x and '4G' not in x)

# Only include 2G if there's no support for higher technologies
df_copy['2G'] = df_copy['sim'].apply(lambda x: '2G' in x and '5G' not in x and '4G' not in x and '3G' not in x)

# Additional features
df_copy['VoLTE'] = df_copy['sim'].apply(lambda x: 'VoLTE' in x)
df_copy['Vo5G'] = df_copy['sim'].apply(lambda x: 'Vo5G' in x)
df_copy['Wi-Fi'] = df_copy['sim'].apply(lambda x: 'Wi-Fi' in x)
df_copy['NFC'] = df_copy['sim'].apply(lambda x: 'NFC' in x)

# Drop the original 'sim' column
df_copy.drop(columns=['sim'], inplace=True)


KeyError: 'sim'

### Splitting ram into propper ram and rom columns

In [14]:
# Split RAM and ROM into separate columns
df_copy['RAM'] = df_copy['ram'].str.extract(r'(\d+ GB RAM)')  # Extract RAM info
df_copy['ROM'] = df_copy['ram'].str.extract(r'(\d+ GB inbuilt)')  # Extract ROM info

# Remove unnecessary units (GB) and handle NaN values
df_copy['RAM'] = df_copy['RAM'].str.replace(' GB RAM', '').fillna('0').astype(int).astype(str) + ' GB'
df_copy['ROM'] = df_copy['ROM'].str.replace(' GB inbuilt', '').fillna('0').astype(int).astype(str) + ' GB'

# Drop the old 'ram' column if you don't need it anymore
df_copy.drop(columns=['ram'], inplace=True)


### Splitting camera column into front and rear camera column

In [15]:
# Split Camera into front and rear camera columns using the '&' symbol
df_copy['Rear Camera'] = df_copy['camera'].str.extract(r'(.+?)&')  # Extract the part before '&' for rear camera
df_copy['Front Camera'] = df_copy['camera'].str.extract(r'&(.+)')  # Extract the part after '&' for front camera

# Clean the data
df_copy['Rear Camera'] = df_copy['Rear Camera'].str.strip()  # Clean any extra spaces for rear camera
df_copy['Front Camera'] = df_copy['Front Camera'].str.strip().replace(' MP Front Camera', 'MP')  # Clean and format the front camera

# Drop the old 'camera' column if you don't need it anymore
df_copy.drop(columns=['camera'], inplace=True)


### Splitting display size and display resolution

In [16]:
# Assuming your DataFrame is df

# Function to extract display information
def extract_display_info(display_str):
    if pd.isna(display_str):
        return [np.nan, np.nan, np.nan]  # Return NaN if the input is NaN
    
    # Split by comma and strip any whitespace
    display_info = [info.strip() for info in display_str.split(',')]
    
    # Initialize variables
    screen_size = resolution = refresh_rate = np.nan

    # Assign values based on the known format
    for info in display_info:
        if 'inches' in info:
            screen_size = info  # e.g., '6.67 inches'
        elif 'x' in info:
            resolution = info  # e.g., '1080 x 2400 px'
        elif 'Hz' in info:
            refresh_rate = info  # e.g., '144 Hz Display'

    return screen_size, resolution, refresh_rate

# Apply the function to the display column
df_copy[['Screen Size', 'Resolution', 'Refresh Rate']] = df_copy['display'].apply(extract_display_info).apply(pd.Series)

# Drop the old 'display' column if you don't need it anymore
df_copy.drop(columns=['display'], inplace=True)


### Splitting battery column inot batteyr and charging column

In [17]:
# Split Battery and Charging information into separate columns
# Split at 'with' and take the first part for battery and the second for charging
df_copy[['Battery_Info', 'Charging_Info']] = df_copy['battery'].str.split(' with ', expand=True)

# Extract Battery and Charging information separately
df_copy['Battery'] = df_copy['Battery_Info'].str.extract(r'(\d+\s*mAh)')  # Extract Battery info
df_copy['Charging'] = df_copy['Charging_Info'].str.extract(r'(\d+)\s*W(?:att)?')  # Extract Charging info without 'W'

# Clean and convert data types
df_copy['Battery'] = df_copy['Battery'].str.replace(' mAh', '', regex=False)  # Removing ' mAh'
df_copy['Charging'] = df_copy['Charging'].fillna('0')  # Fill NaN in Charging with '0'

# Convert to integer, handling NaN values
df_copy['Battery'] = pd.to_numeric(df_copy['Battery'], errors='coerce').fillna(0).astype(int)  # Convert to int
df_copy['Charging'] = pd.to_numeric(df_copy['Charging'], errors='coerce').fillna(0).astype(int)  # Convert to int

# Add units back to the Battery and Charging columns
df_copy['Battery'] = df_copy['Battery'].astype(str) + ' mAh'  # Adding 'mAh' back
df_copy['Charging'] = df_copy['Charging'].astype(str) + ' Watt'  # Adding 'Watt' back

# Drop the old battery-related columns if you don't need them anymore
df_copy.drop(columns=['battery', 'Battery_Info', 'Charging_Info'], inplace=True)



In [18]:
df_copy

Unnamed: 0,model,price,specs_score,ratings,processor,card,os,Dual Sim,5G,4G,...,NFC,RAM,ROM,Rear Camera,Front Camera,Screen Size,Resolution,Refresh Rate,Battery,Charging
0,Motorola Edge 50 Fusion,72866.53,85,4.65,"Snapdragon 7s Gen2, Octa Core, 2.4 GHz Processor",Memory Card Not Supported,Android v14,True,True,False,...,True,8 GB,128 GB,50 MP + 13 MP Dual Rear,32 MP Front Camera,6.67 inches,1080 x 2400 px,144 Hz Display with Punch Hole,5000 mAh,68 Watt
1,Motorola Moto G85 5G,58986.53,80,4.05,"Snapdragon 6s Gen3, Octa Core, 2.3 GHz Processor",Memory Card Not Supported,Android v14,True,True,False,...,False,8 GB,128 GB,50 MP + 8 MP Dual Rear,32 MP Front Camera,6.67 inches,1080 x 2400 px,120 Hz Display with Punch Hole,5000 mAh,33 Watt
2,Motorola Moto G45 5G,34696.53,76,4.05,"Snapdragon 6s Gen3, Octa Core, 2.3 GHz Processor","Memory Card (Hybrid), upto 1 TB",Android v14,True,True,False,...,False,4 GB,128 GB,50 MP + 2 MP Dual Rear,16 MP Front Camera,6.5 inches,720 x 1600 px,120 Hz Display with Punch Hole,5000 mAh,20 Watt
3,Vivo V40 5G,112081.00,86,4.35,"Snapdragon 7 Gen3, Octa Core, 2.63 GHz Processor",Memory Card Not Supported,Android v14,True,True,False,...,False,8 GB,128 GB,50 MP + 50 MP Dual Rear,50 MP Front Camera,6.78 inches,1260 x 2800 px,120 Hz Display with Punch Hole,5500 mAh,80 Watt
4,Samsung Galaxy M55s,69396.53,85,4.05,"Snapdragon 7 Gen1, Octa Core, 2.4 GHz Processor","Memory Card (Hybrid), upto 1 TB",Android v14,True,True,False,...,True,8 GB,128 GB,50 MP + 8 MP + 2 MP Triple Rear,50 MP Front Camera,6.7 inches,1080 x 2400 px,120 Hz Display with Punch Hole,5000 mAh,45 Watt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,Samsung Galaxy S21 Plus,284536.53,88,4,"Exynos 2100, Octa Core, 2.9 GHz Processor",Android v10,No FM Radio,True,True,False,...,True,8 GB,128 GB,64 MP + 12 MP + 12 MP Triple Rear,10 MP Front Camera,6.7 inches,1080 x 2400 px,120 Hz Display with Punch Hole,4800 mAh,25 Watt
1016,Realme X3 SuperZoom Edition (8GB RAM + 256GB),104270.03,82,4.05,"Snapdragon 855+, Octa Core, 2.96 GHz Processor",Memory Card Not Supported,Android v10,True,False,True,...,False,8 GB,256 GB,64 MP Quad Rear,32 MP + 8 MP Dual Front Camera,6.57 inches,1080 x 2400 px,120 Hz Display with Dual Punch Hole,4200 mAh,30 Watt
1017,Micromax IN Note 1 (4GB RAM + 128GB),31195.30,75,4.55,"Helio G85, Octa Core, 2 GHz Processor","Memory Card Supported, upto 256 GB",Android v10,True,False,True,...,False,4 GB,128 GB,48 MP Quad Rear,16 MP Front Camera,6.67 inches,1080 x 2400 px Display with Punch Hole,,5000 mAh,18 Watt
1018,Samsung Galaxy F12,36400.30,73,4.65,"Exynos 850, Octa Core, 2 GHz Processor","Memory Card Supported, upto 512 GB",Android v11,True,False,True,...,False,4 GB,64 GB,48 MP Quad Rear,8 MP Front Camera,6.5 inches,720 x 1600 px,90 Hz Display with Water Drop Notch,6000 mAh,15 Watt


In [20]:
df_copy.to_csv('processed_mobiles_data.csv', index=False)


In [50]:
# Create a new dataframe with only 'os' and 'card' columns
df_os_card = df[['os', 'card']].copy()

# Display the new dataframe
df_os_card


Unnamed: 0,os,card
0,Android v14,Memory Card Not Supported
1,Android v14,Memory Card Not Supported
2,Android v14,"Memory Card (Hybrid), upto 1 TB"
3,Android v14,Memory Card Not Supported
4,Android v14,"Memory Card (Hybrid), upto 1 TB"
...,...,...
1015,No FM Radio,Android v10
1016,Android v10,Memory Card Not Supported
1017,Android v10,"Memory Card Supported, upto 256 GB"
1018,Android v11,"Memory Card Supported, upto 512 GB"


In [64]:
# List of valid OS strings
valid_os = ['android', 'ios', 'linux', 'windows', 'macos', 'ubuntu', 'chromeos']

# Function to check if the value in 'os' column is a valid OS and swap if necessary
def check_and_swap(row):
    os_value = row['os'].lower() if pd.notna(row['os']) else ''
    card_value = row['card']  # Get card value

    # If os_value is not a valid OS, swap it with the card value
    if not any(valid in os_value for valid in valid_os):
        row['os'], row['card'] = card_value, row['os']  # Swap the values

    return row

# Apply the function row by row
df_os_card = df_os_card.apply(check_and_swap, axis=1)

# Display the updated dataframe
df_os_card

Unnamed: 0,os,card
0,Android v14,Memory Card Not Supported
1,Android v14,Memory Card Not Supported
2,Android v14,"Memory Card (Hybrid), upto 1 TB"
3,Android v14,Memory Card Not Supported
4,Android v14,"Memory Card (Hybrid), upto 1 TB"
...,...,...
1015,Android v10,No FM Radio
1016,Android v10,Memory Card Not Supported
1017,Android v10,"Memory Card Supported, upto 256 GB"
1018,Android v11,"Memory Card Supported, upto 512 GB"


In [65]:
# List of valid OS strings
valid_os = ['android', 'ios', 'linux', 'windows', 'macos', 'ubuntu', 'chromeos']

# Initialize counters
android_count = 0
ios_count = 0
linux_count = 0
other_os_count = 0
invalid_os_count = 0  # For strings not related to any OS

# Iterate through the 'os' column
for os_value in df_os_card['os'].dropna():  # Drop NaN values if any
    os_value_lower = os_value.lower()  # Convert to lowercase for case-insensitive matching

    if 'android' in os_value_lower:
        android_count += 1
    elif 'ios' in os_value_lower:
        ios_count += 1
    elif 'linux' in os_value_lower:
        linux_count += 1
    elif any(valid in os_value_lower for valid in valid_os):  # Other valid OS
        other_os_count += 1
    else:
        invalid_os_count += 1  # For non-OS strings

# Print the results
print(f"Android count: {android_count}")
print(f"iOS count: {ios_count}")
print(f"Linux count: {linux_count}")
print(f"Other OS count: {other_os_count}")
print(f"Invalid OS count (non-OS strings): {invalid_os_count}")


Android count: 884
iOS count: 41
Linux count: 0
Other OS count: 0
Invalid OS count (non-OS strings): 83


In [75]:
import pandas as pd

# List of valid OS strings
valid_os = ['android', 'ios', 'linux', 'windows', 'macos', 'ubuntu', 'chromeos']

# Initialize a list for invalid OS values
invalid_os_values = []

# Iterate through the 'os' column
for os_value in df_os_card['os'].dropna():  # Drop NaN values if any
    os_value_lower = os_value.lower()  # Convert to lowercase for case-insensitive matching

    if 'android' in os_value_lower or \
       'ios' in os_value_lower or \
       'linux' in os_value_lower or \
       any(valid in os_value_lower for valid in valid_os):
        continue  # Skip valid OS values
    else:
        invalid_os_values.append(os_value)  # Collect invalid (non-OS) values

# Convert the list of invalid OS values to a Pandas Series
invalid_os_series = pd.Series(invalid_os_values)

# Optionally, convert to a DataFrame
invalid_os_df = pd.DataFrame(invalid_os_series, columns=['Invalid OS Values'])

# Print the Series or DataFrame
# print("Invalid OS values as Series:")
# print(invalid_os_series)

print("\nInvalid OS values as DataFrame:")
invalid_os_df.head(40)



Invalid OS values as DataFrame:


Unnamed: 0,Invalid OS Values
0,50 MP + 8 MP + 8 MP Triple Rear & 16 MP Front ...
1,50 MP + 8 MP + 8 MP Triple Rear & 16 MP Front ...
2,64 MP + 50 MP + 50 MP Triple Rear & 32 MP + 32...
3,Bluetooth
4,2 MP Rear Camera
5,"Memory Card Supported, upto 128 GB"
6,64 MP + 48 MP + 48 MP Triple Rear & 32 MP + 20...
7,50 MP + 12 MP Dual Rear & 10 MP Front Camera
8,13 MP + 12 MP Dual Rear & 32 MP Front Camera
9,No Rear Camera


In [53]:
# List of valid OS strings
valid_os = ['android', 'ios', 'linux', 'windows', 'macos', 'ubuntu', 'chromeos']

# Function to check if the value in 'os' column is not a valid OS
def check_invalid_os(os_value):
    if pd.notna(os_value):
        os_value = os_value.lower()  # Convert to lowercase for case-insensitive comparison
        # If the value does not contain any valid OS string, return True
        if not any(valid in os_value for valid in valid_os):
            return True
    return False

# Apply the function to the 'os' column and filter rows with invalid OS
invalid_os_rows = df_os_card[df_os_card['os'].apply(check_invalid_os)]

# Display the rows with invalid OS values
print(invalid_os_rows)

                                    os  \
19           Memory Card Not Supported   
123          Memory Card Not Supported   
257          Memory Card Not Supported   
338              Memory Card Supported   
350                          Bluetooth   
..                                 ...   
897                        No FM Radio   
913                     No Rear Camera   
914                     No Rear Camera   
945  Memory Card Supported, upto 32 GB   
996                          Bluetooth   

                                                  card  
19   50 MP + 8 MP + 8 MP Triple Rear & 16 MP Front ...  
123  50 MP + 8 MP + 8 MP Triple Rear & 16 MP Front ...  
257  64 MP + 50 MP + 50 MP Triple Rear & 32 MP + 32...  
338                                   2 MP Rear Camera  
350                 Memory Card Supported, upto 128 GB  
..                                                 ...  
897                              Memory Card Supported  
913                   1.8 inches, 240 x

In [398]:

# Function to swap values between 'os' and 'card'
for index, row in df_copy.iterrows():
    os_value = row['os'] if pd.notna(row['os']) else ''
    card_value = row['card'] if pd.notna(row['card']) else ''
    
    if not any(word in os_value.lower() for word in ['android', 'ios']):
        # If 'os' does not contain 'Android' or 'iOS', check 'card'
        if any(word in card_value.lower() for word in ['android', 'ios']):
            # Temporarily store the original value of 'os'
            original_os = os_value
            # Set 'os' to the value in 'card'
            df_copy.at[index, 'os'] = card_value
            # Set 'card' to the original value in 'os'
            df_copy.at[index, 'card'] = original_os

# Display the updated DataFrame
df_copy.reset_index()



Unnamed: 0,index,model,price,specs_score,ratings,processor,card,os,Dual Sim,5G,...,NFC,RAM,ROM,Rear Camera,Front Camera,Screen Size,Resolution,Refresh Rate,Battery,Charging
0,0,Motorola Edge 50 Fusion,69716.68,85,85,"Snapdragon 7s Gen2, Octa Core, 2.4 GHz Processor",Memory Card Not Supported,Android v14,True,True,...,True,8 GB,128 GB,50 MP + 13 MP Dual Rear,32 MP Front Camera,6.67 inches,1080 x 2400 px,144 Hz Display with Punch Hole,5000 mAh,68 Watt
1,1,Motorola Moto G85 5G,56436.68,80,80,"Snapdragon 6s Gen3, Octa Core, 2.3 GHz Processor",Memory Card Not Supported,Android v14,True,True,...,False,8 GB,128 GB,50 MP + 8 MP Dual Rear,32 MP Front Camera,6.67 inches,1080 x 2400 px,120 Hz Display with Punch Hole,5000 mAh,33 Watt
2,2,Motorola Moto G45 5G,33196.68,76,76,"Snapdragon 6s Gen3, Octa Core, 2.3 GHz Processor","Memory Card (Hybrid), upto 1 TB",Android v14,True,True,...,False,4 GB,128 GB,50 MP + 2 MP Dual Rear,16 MP Front Camera,6.5 inches,720 x 1600 px,120 Hz Display with Punch Hole,5000 mAh,20 Watt
3,3,Vivo V40 5G,107236.00,86,86,"Snapdragon 7 Gen3, Octa Core, 2.63 GHz Processor",Memory Card Not Supported,Android v14,True,True,...,False,8 GB,128 GB,50 MP + 50 MP Dual Rear,50 MP Front Camera,6.78 inches,1260 x 2800 px,120 Hz Display with Punch Hole,5500 mAh,80 Watt
4,4,Samsung Galaxy M55s,66396.68,85,85,"Snapdragon 7 Gen1, Octa Core, 2.4 GHz Processor","Memory Card (Hybrid), upto 1 TB",Android v14,True,True,...,True,8 GB,128 GB,50 MP + 8 MP + 2 MP Triple Rear,50 MP Front Camera,6.7 inches,1080 x 2400 px,120 Hz Display with Punch Hole,5000 mAh,45 Watt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015,1015,Samsung Galaxy S21 Plus,272236.68,88,88,"Exynos 2100, Octa Core, 2.9 GHz Processor",No FM Radio,Android v10,True,True,...,True,8 GB,128 GB,64 MP + 12 MP + 12 MP Triple Rear,10 MP Front Camera,6.7 inches,1080 x 2400 px,120 Hz Display with Punch Hole,4800 mAh,25 Watt
1016,1016,Realme X3 SuperZoom Edition (8GB RAM + 256GB),99762.68,82,82,"Snapdragon 855+, Octa Core, 2.96 GHz Processor",Memory Card Not Supported,Android v10,True,False,...,False,8 GB,256 GB,64 MP Quad Rear,32 MP + 8 MP Dual Front Camera,6.57 inches,1080 x 2400 px,120 Hz Display with Dual Punch Hole,4200 mAh,30 Watt
1017,1017,Micromax IN Note 1 (4GB RAM + 128GB),29846.80,75,75,"Helio G85, Octa Core, 2 GHz Processor","Memory Card Supported, upto 256 GB",Android v10,True,False,...,False,4 GB,128 GB,48 MP Quad Rear,16 MP Front Camera,6.67 inches,1080 x 2400 px Display with Punch Hole,,5000 mAh,18 Watt
1018,1018,Samsung Galaxy F12,34826.80,73,73,"Exynos 850, Octa Core, 2 GHz Processor","Memory Card Supported, upto 512 GB",Android v11,True,False,...,False,4 GB,64 GB,48 MP Quad Rear,8 MP Front Camera,6.5 inches,720 x 1600 px,90 Hz Display with Water Drop Notch,6000 mAh,15 Watt


In [405]:
import pandas as pd


# Create a mask for entries containing "Android" or "iOS"
android_ios_mask = df_copy['os'].str.contains(r'Android|iOS', case=False, na=False)

# Filter the DataFrame for entries that do not contain "Android" or "iOS"
other_values_df = df_copy[~android_ios_mask]

# Display the filtered DataFrame
print("Entries containing other values (not 'Android' or 'iOS'):")
other_values_df.reset_index()


Entries containing other values (not 'Android' or 'iOS'):


Unnamed: 0,index,model,price,specs_score,ratings,processor,card,os,Dual Sim,5G,...,NFC,RAM,ROM,Rear Camera,Front Camera,Screen Size,Resolution,Refresh Rate,Battery,Charging
0,19,Lava Agni 3 5G,69716.68,83,83,"Dimensity 7300X, Octa Core, 2.5 GHz Processor",50 MP + 8 MP + 8 MP Triple Rear & 16 MP Front ...,Memory Card Not Supported,True,True,...,False,8 GB,128 GB,,,6.78 inches,1200 x 2652 px,120 Hz Display with Punch Hole,5000 mAh,66 Watt
1,123,Lava Agni 3 5G (8GB RAM + 256GB),82996.68,84,84,"Dimensity 7300X, Octa Core, 2.5 GHz Processor",50 MP + 8 MP + 8 MP Triple Rear & 16 MP Front ...,Memory Card Not Supported,True,True,...,False,8 GB,256 GB,,,6.78 inches,1200 x 2652 px,120 Hz Display with Punch Hole,5000 mAh,66 Watt
2,257,Vivo X Fold 3 Pro,531196.68,98,98,"Snapdragon 8 Gen3, Octa Core, 3.3 GHz Processor",64 MP + 50 MP + 50 MP Triple Rear & 32 MP + 32...,Memory Card Not Supported,True,True,...,True,16 GB,512 GB,,,8.03 inches,2200 x 2480 px,120 Hz Display with Punch Hole,5700 mAh,100 Watt
3,334,Jio Bharat J1 4G,5972.68,23,23,"512 MB RAM, 4 GB inbuilt",Bluetooth,,False,False,...,False,0 GB,0 GB,,,,,,0 mAh,0 Watt
4,338,Nokia 235 4G 2024,14936.68,20,20,No Wifi,2 MP Rear Camera,Memory Card Supported,True,False,...,False,0 GB,0 GB,,,,,,0 mAh,0 Watt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,915,Itel Super Guru 400,4442.16,10,10,"SC6531C, 312 MHz Processor",Bluetooth,,True,False,...,False,0 GB,0 GB,,,2.4 inches Display,,,1200 mAh,0 Watt
91,916,Nokia 110 (2023),4976.68,11,11,No Wifi,,,True,False,...,False,0 GB,0 GB,,,1.8 inches,120 x 160 px Display,,1000 mAh,0 Watt
92,922,Motorola Moto A10e,4644.68,10,10,"4 MB RAM, 32 MB inbuilt",,,True,False,...,False,0 GB,0 GB,,,,,,0 mAh,0 Watt
93,945,Nokia 5710 XpressAudio,14936.68,19,19,No Wifi,0.3 MP Rear Camera,"Memory Card Supported, upto 32 GB",True,False,...,False,0 GB,0 GB,,,,,,0 mAh,0 Watt


In [410]:
df_copy.to_csv('mobiles_data.csv')