# Import necessary libraries

In [None]:
# Import pandas library for data manipulation and analysis
import pandas as pd
# Import numpy for numerical operations
import numpy as np
# Import regular expression module for text pattern matching
import re
# Import BeautifulSoup for parsing HTML and XML documents
from bs4 import BeautifulSoup
# Import warnings module to manage warning messages
import warnings
# Import requests library for making HTTP requests
import requests
# Suppress warning messages to keep output clean
warnings.filterwarnings('ignore')

### Send an HTTP GET request to Jumia Nigeria's Apple mobile phones page
### and retrieve the HTML content as text for further processing

In [None]:
# Download the Apple mobile-phones listing page from Jumia Nigeria.
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() turns HTTP errors (4xx/5xx) into an exception instead of
# silently handing an error page to the parser in the following cells.
response = requests.get('https://www.jumia.com.ng/mobile-phones/apple/#catalog-listing', timeout=30)
response.raise_for_status()
# Keep the decoded HTML body as text for further processing
source = response.text

### Creating a BeautifulSoup object from the HTML source using the lxml parser

In [None]:
# Parse the downloaded HTML text into a searchable tree.
# The 'lxml' parser is selected explicitly as the parsing backend.
soup = BeautifulSoup(markup=source, features='lxml')

### Finding the first 'article' HTML element with class attributes "prd _fb col c-prd"

In [None]:
# Find the first <article> element carrying the class string "prd _fb col c-prd".
# This targets a single product container on the listing page.
Article = soup.find('article', class_="prd _fb col c-prd")

# find() returns None when nothing matches (e.g. the page layout changed or the
# request was answered with a block/consent page). Fail here with a clear
# message rather than with an AttributeError in a later cell.
if Article is None:
    raise RuntimeError('No <article class="prd _fb col c-prd"> element found in the downloaded page')

### printing the HTML structure of the 'Article' object in a more readable format

In [None]:
# This line is commented out, but would print the HTML structure of the 'Article' element
# in a more readable format using BeautifulSoup's prettify() method
#print(Article.prettify())

### Extract the phone info from the article by finding the h3 element with class "name"

In [None]:
# The product title lives in the <h3 class="name"> element of the article
name_tag = Article.find('h3', class_="name")
# Keep only its text, with surrounding whitespace stripped
phone_info = name_tag.get_text(strip=True)
phone_info  # Display the extracted phone information

### Extract the price text from the HTML element with class "prc"

In [None]:
# The price is rendered inside the <div class="prc"> element of the article
price_tag = Article.find('div', class_="prc")
raw_price = price_tag.get_text()
# Drop every space character, then trim any remaining leading/trailing whitespace
phone_price = raw_price.replace(' ', '').strip()
phone_price  # Return the cleaned price text

### Finding the first anchor tag with an href attribute that starts with '/apple-' and Combine it with base url

In [None]:
# Site root that relative product paths are appended to
base_link = "https://www.jumia.com.ng"


def _is_apple_product_path(href):
    """Return a truthy value when *href* is set and starts with '/apple-'."""
    return href and href.startswith('/apple-')


# First anchor inside the article whose href passes the predicate above
link = Article.find('a', href=_is_apple_product_path)

# Site root + relative path = absolute product URL
full_link = ''.join([base_link, link['href']])

# Show the absolute URL
print(full_link)

### Looping through pages 1 to 16 of search results to get all the phone information

In [None]:
# Collect one {'Phone_info', 'Phone_Price'} dictionary per scraped product
Phone = []

# Walk through pages 1 to 16 of the "iphone" search results
for page in range(1, 17):
    # Request the results page. The timeout prevents a stalled connection from
    # hanging the whole scrape, and raise_for_status() stops on HTTP errors
    # instead of parsing an error page and reporting "found 0 items".
    response = requests.get(f"https://www.jumia.com.ng/catalog/?q=iphone&page={page}", timeout=30)
    response.raise_for_status()
    source = response.text
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(source, 'lxml')

    # Find all product articles on the page
    Articles = soup.find_all('article', class_="prd _fb col c-prd")
    # Print progress information
    print(f"Scraping page {page}... found {len(Articles)} items")

    # Loop through each product article
    for Article in Articles:
        name_tag = Article.find('h3', class_="name")
        price_tag = Article.find('div', class_="prc")
        # find() returns None for a missing element; skip incomplete product
        # cards so that one malformed listing does not abort the entire run.
        if name_tag is None or price_tag is None:
            continue

        # Extract the phone name, removing extra whitespace
        name = name_tag.get_text(strip=True)
        # Extract the price, removing spaces and extra whitespace
        price = price_tag.get_text().replace(' ', '').strip()

        # Add the extracted data as a dictionary to the Phone list
        Phone.append({
            'Phone_info': name,
            'Phone_Price': price
        })

### Printing the number of phones scraped

In [None]:
# Print the length of the Phone variable 
#print(len(Phone))

### Creating a DataFrame from the 'Phone' data structure

In [None]:
# This converts the data into a tabular format for easier manipulation and analysis
#df = pd.DataFrame(Phone)

### Saving the DataFrame 'df' to a CSV file

In [None]:
# Save the DataFrame 'df' to a CSV file named 'jumia_iphones.csv' without including the index column
#df.to_csv("jumia_iphones.csv", index=False)

### Reading the saved csv file into a DataFrame

In [None]:
# Load the previously saved scrape results from disk into a pandas DataFrame
csv_path = "jumia_iphones.csv"
DataFrame = pd.read_csv(csv_path)

### Displaying the first 5 rows

In [None]:
# Show the first five rows for a quick look at the data structure and content
DataFrame.head(n=5)

In [None]:
# Collect the column labels in a plain Python list
column_names = DataFrame.columns.tolist()
# The !r conversion prints the repr(), so quotes and escape characters
# (e.g. stray whitespace in a header) are visible exactly as stored
print(f"{column_names!r}")

### defining iPhone models in chronological order from iPhone 3G to recent models

In [None]:
# iPhone model names used to classify the scraped listings.
# Entries are grouped by generation, oldest generation first, and cover the
# standard, Plus, Pro, Pro Max, mini, SE, "e" and Air variants from the
# iPhone 3G through the 17 series.
models = [
    # 3G to 5 series
    'iPhone 3G', 'iPhone 3GS',
    'iPhone 4', 'iPhone 4s',
    'iPhone 5', 'iPhone 5c', 'iPhone 5s',
    # 6 series and SE
    'iPhone 6', 'iPhone 6 Plus', 'iPhone 6s', 'iPhone 6s Plus', 'iPhone SE',
    # 7 and 8 series
    'iPhone 7', 'iPhone 7 Plus',
    'iPhone 8', 'iPhone 8 Plus',
    # X family
    'iPhone X', 'iPhone XS', 'iPhone XS Max', 'iPhone XR',
    # 11 to 15 series
    'iPhone 11', 'iPhone 11 Pro', 'iPhone 11 Pro Max',
    'iPhone 12', 'iPhone 12 mini', 'iPhone 12 Pro', 'iPhone 12 Pro Max',
    'iPhone 13', 'iPhone 13 mini', 'iPhone 13 Pro', 'iPhone 13 Pro Max',
    'iPhone 14', 'iPhone 14 Plus', 'iPhone 14 Pro', 'iPhone 14 Pro Max',
    'iPhone 15', 'iPhone 15 Plus', 'iPhone 15 Pro', 'iPhone 15 Pro Max',
    # 16 and 17 series
    'iPhone 16', 'iPhone 16 Plus', 'iPhone 16 Pro', 'iPhone 16 Pro Max', 'iPhone 16e',
    'iPhone 17', 'iPhone Air', 'iPhone 17 Pro', 'iPhone 17 Pro Max',
]

### Sorting the models

In [None]:
# Order a copy of 'models' from the longest name to the shortest.
# The sort is stable, so names of equal length keep their original relative order.
sorted_models = list(models)
sorted_models.sort(key=len, reverse=True)

In [None]:
def _identify_series(raw_name):
    """Return the first entry of sorted_models contained in *raw_name*.

    The comparison is a case-insensitive substring test. Missing values (NaN)
    and names that contain no known model are labelled 'Unknown'.
    """
    # Missing phone information cannot be classified
    if pd.isna(raw_name):
        return 'Unknown'
    lowered = str(raw_name).lower()
    # sorted_models is tried in order; the first hit wins, otherwise 'Unknown'
    return next(
        (candidate for candidate in sorted_models if candidate.lower() in lowered),
        'Unknown',
    )


# One identified series per row, in the same order as the DataFrame rows
series_list = [_identify_series(entry) for entry in DataFrame['Phone_info']]

# Store the identified models in a new 'Series' column
DataFrame['Series'] = series_list

### Finding duplicate rows in DataFrame based on 'Phone_info', 'Series', and 'Phone_Price' 

In [None]:
# Columns that together define "the same listing" for this inspection
duplicate_keys = ['Phone_info', 'Series', 'Phone_Price']
# keep=False flags every member of a duplicate group, not only the repeats
is_repeated = DataFrame.duplicated(subset=duplicate_keys, keep=False)
duplicates = DataFrame[is_repeated]
print(duplicates)  # Display all identified duplicate rows

### Removing duplicates

In [None]:
# Keep only the first row for every (Series, Phone_Price) combination.
# NOTE(review): this keys on two columns only, so listings with different
# 'Phone_info' text but the same series and price are collapsed into one row,
# whereas the duplicate inspection keys on three columns. Confirm this is intended.
series_price_keys = ['Series', 'Phone_Price']
df_clean = DataFrame.drop_duplicates(subset=series_price_keys, keep='first')

### Checking the rows that don't match any defined iPhone model

In [None]:
# Boolean mask: True for rows whose 'Series' value is 'Unknown'
is_unknown = df_clean['Series'].eq('Unknown')
# Keep just those rows in a separate dataframe called 'Unknown'
Unknown = df_clean[is_unknown]
# Display the contents of the 'Unknown' dataframe
print(Unknown)

### Display all unknown entries for further inspection before dropping them

In [None]:
# Show every row of the Unknown DataFrame without truncation.
# option_context lifts the row limit only inside the 'with' block and restores
# the previous setting afterwards, even if printing raises, so later cells
# never inherit the unlimited display and a user-chosen limit is not lost.
with pd.option_context('display.max_rows', None):
    print(Unknown[['Phone_info', 'Phone_Price', 'Series']])

### Checking the number of rows in the cleaned DataFrame

In [None]:
# Row count of the cleaned DataFrame (first entry of its shape)
df_clean.shape[0]

In [None]:
# Randomly display up to 10 rows from the cleaned DataFrame.
# sample() raises ValueError when asked for more rows than exist, so the
# request is capped at the number of rows actually available.
df_clean.sample(n=min(10, len(df_clean)))

### Inspecting iPhone 16 Pro Max

In [None]:
# Select the rows whose Series value is exactly "iPhone 16 Pro Max"
is_16_pro_max = df_clean['Series'] == 'iPhone 16 Pro Max'
iphone_16_pro_max = df_clean.loc[is_16_pro_max]

# Show the matching records
print(iphone_16_pro_max)

### Inspecting iPhone 16

In [None]:
# Select the rows whose Series value is exactly "iPhone 16"
iphone_16 = df_clean.loc[df_clean['Series'].eq('iPhone 16')]

# Show the matching records
print(iphone_16)

### Create a new DataFrame by removing rows where the Series column has the value "Unknown"

In [None]:
# Boolean mask: True for rows that were matched to a known model
is_known = df_clean['Series'].ne('Unknown')
# Keep only those rows; reset_index(drop=True) renumbers them from 0
# and discards the old index instead of keeping it as a column
df1 = df_clean.loc[is_known].reset_index(drop=True)

# Display the cleaned DataFrame
df1

In [None]:
# Distinct values of the 'Series' column in df1
series_names = list(df1['Series'].unique())
# Alphabetical order makes the list easier to scan
series_names.sort()
print(series_names)

### Inspecting iPhone 3GS

In [None]:
# Rows of df1 classified as "iPhone 3GS"
is_3gs = df1['Series'] == 'iPhone 3GS'
iPhone_3GS = df1.loc[is_3gs]

# Print them so the matches can be checked by eye
print(iPhone_3GS)


### Inspecting iPhone 6

In [None]:
# Rows of df1 classified as "iPhone 6"
iPhone_6 = df1[df1['Series'].eq('iPhone 6')]

# Print them so the matches can be checked by eye
print(iPhone_6)

### Inspecting iPhone 6 Plus

In [None]:
# Rows of df1 classified as "iPhone 6 Plus"
iPhone_6_plus = df1.loc[df1['Series'].eq('iPhone 6 Plus')]

# Print them so the matches can be checked by eye
print(iPhone_6_plus)

### Inspecting iPhone 6s Plus

In [None]:
# Rows of df1 classified as "iPhone 6s Plus"
is_6s_plus = df1['Series'] == 'iPhone 6s Plus'
iPhone_6s_plus = df1[is_6s_plus]

# Print them so the matches can be checked by eye
print(iPhone_6s_plus)

### Inspecting iPhone SE

In [None]:
# Rows of df1 classified as "iPhone SE"
is_se = df1['Series'].isin(['iPhone SE'])
iPhone_SE = df1.loc[is_se]

# Print them so the matches can be checked by eye
print(iPhone_SE)

### Removing currency symbol, commas and any leading or trailing whitespace

In [None]:
# Delete every naira sign (₦) and comma from the price text
price_text = df1['Phone_Price'].str.replace(pat='[₦,]', repl='', regex=True)
# Trim leading/trailing whitespace and write the result back to the column
df1['Phone_Price'] = price_text.str.strip()

# Display the updated DataFrame
df1

### Converting the 'Phone_Price' column to numeric values

In [None]:
def _price_to_float(value):
    """Convert one 'Phone_Price' cell to a float.

    A range such as "1000-1500" becomes the average of its bounds and a single
    price string is converted directly. Values that are not strings are
    accepted as well: missing values become NaN and numbers are passed through
    float(), so the cell can be re-run on an already converted column.
    """
    if isinstance(value, str):
        # A string without '-' splits into one element, whose mean is itself
        bounds = [float(part) for part in value.split('-')]
        return float(np.mean(bounds))
    if pd.isna(value):
        # The substring test used for ranges would raise TypeError on NaN
        return float('nan')
    return float(value)


# Convert the 'Phone_Price' column to numeric values
df1['Phone_Price'] = df1['Phone_Price'].apply(_price_to_float)

In [None]:
# Re-select the "iPhone 6" rows now that Phone_Price has been cleaned
is_iphone_6 = df1['Series'] == 'iPhone 6'
iPhone_6 = df1.loc[is_iphone_6]

# Print them to check the iPhone 6 records with numeric prices
print(iPhone_6)

### After removing the rows labeled "Unknown", I noticed that some phone accessories were still classified under defined iPhone models. Upon inspection, I found that none of these accessories were priced anywhere near the lowest-priced iPhone in the dataset. To address this, I identified the price of the cheapest iPhone model and removed every item priced below that threshold, so that the dataset contains only actual phone entries.

In [None]:
# Lowest iPhone price in the dataset; anything cheaper is dropped
min_iphone_price = 57949.0
# Keep rows priced at or above the threshold and renumber the index from 0
is_phone_priced = df1['Phone_Price'].ge(min_iphone_price)
df1 = df1.loc[is_phone_priced].reset_index(drop=True)

# Display the updated DataFrame
df1

In [None]:
# Row count of df1 after the low-priced entries were filtered out
len(df1.index)

In [None]:
# Re-select the "iPhone 6" rows after the low-priced entries were removed
iPhone_6 = df1.loc[df1['Series'].eq('iPhone 6')]

# Print them to verify which iPhone 6 records remain
print(iPhone_6)

In [None]:
# Select the "iPhone 16" rows from the cleaned, price-filtered data
is_iphone_16 = df1['Series'] == 'iPhone 16'
iphone_16 = df1.loc[is_iphone_16]

# Print them to see all iPhone 16 records
print(iphone_16)

In [None]:
# Quick overview: the first five rows of the updated DataFrame
df1.head(n=5)

### Saving the cleaned DataFrame to a CSV file

In [None]:
# Save the cleaned and filtered DataFrame to a CSV file named 'cleaned_jumia_iphone.csv'
# index=False ensures that the row indices are not written to the file
#df1.to_csv('cleaned_jumia_iphone.csv', index=False)