# Import necessary libraries

In [None]:
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations
import re            # For regular expressions (text processing)
import warnings      # To handle or suppress warnings
import requests      # For making HTTP requests (e.g., API calls)
import json          # For working with JSON data

# Suppress any warnings to keep the output clean.
# NOTE(review): this silences every warning category for the rest of the
# session, including pandas diagnostics such as SettingWithCopyWarning, so
# problems in later cells will not be reported.
warnings.filterwarnings('ignore')

### Scraping Apple iPhone Products from Konga API

In [None]:
# Konga GraphQL API endpoint that the product requests are POSTed to
url = "https://api.konga.com/v1/graphql"

# Request headers: the body is sent as JSON and a JSON reply is asked for,
# so both header fields carry the same media type
headers = dict.fromkeys(("Content-Type", "Accept"), "application/json")

# Function to fetch products from a specific page
def get_products(page, timeout=30):
    """Fetch one page of Apple products from the Konga GraphQL API.

    Args:
        page: Page number to request. It is coerced with ``int()`` before
            being interpolated into the query text.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Defaults to 30.

    Returns:
        The ``searchByStore`` object of the decoded response. Presumably a
        dict with ``pagination`` and ``products`` keys, mirroring the fields
        requested in the query.

    Raises:
        ValueError, TypeError: If ``page`` cannot be converted to an int.
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        RuntimeError: If the response carries no ``searchByStore`` data.
    """
    # Only a plain integer may end up inside the query string built below.
    page = int(page)

    # GraphQL query payload with dynamic page number
    payload = {
        "query": f"""
        {{
            searchByStore(
                search_term: [["attributes.brand:Apple"], ["category.category_id:7539"]],
                paginate: {{page: {page}, limit: 40}},
                store_id: 1
            ) {{
                pagination {{
                    total
                    page
                    limit
                }}
                products {{
                    name
                    special_price
                    price
                }}
            }}
        }}
        """
    }

    # Send POST request to the Konga API. Without an explicit timeout,
    # requests would wait indefinitely on a stalled connection.
    response = requests.post(
        url,
        headers=headers,
        data=json.dumps(payload),
        timeout=timeout,
    )
    response.raise_for_status()  # Raise error if request fails
    data = response.json()

    # Fail with a readable message instead of a bare KeyError/TypeError when
    # the API answers 200 but includes no usable result.
    result = (data.get("data") or {}).get("searchByStore")
    if result is None:
        raise RuntimeError(
            f"Konga API returned no searchByStore data: {data.get('errors')}"
        )

    # Return the 'searchByStore' part of the response containing products and pagination
    return result

# Function to fetch all products across all pages
def fetch_all_products():
    """Collect the products of every result page into one list.

    Pages are requested one after another through ``get_products``, starting
    at page 0, until a page comes back without products.

    Returns:
        list: The products of all fetched pages, in the order received.
    """
    collected = []
    page_number = 0

    print(f"Fetching page {page_number}...")
    batch = get_products(page_number)["products"]

    # An empty (or None) product list is treated as the end of the results.
    while batch:
        collected.extend(batch)
        page_number += 1
        print(f"Fetching page {page_number}...")
        batch = get_products(page_number)["products"]

    return collected

# Fetch all iPhone products from Konga
# NOTE(review): this issues live API requests every time the cell runs.
phones = fetch_all_products()

# Loop through products to access their name and price
# NOTE(review): with the print below commented out this loop produces no
# output; it only leaves `product`, `name` and `price` bound to the last
# item (and fails with KeyError if a product dict has no "name" key).
for product in phones:
    name = product["name"]
    price = product.get("price")
    # print(f"{name} — ₦{price}")

### Converting the scraped iPhones to DataFrame

In [None]:
# Convert the list of iPhone products (fetched from Konga API) into a pandas DataFrame
#df = pd.DataFrame(phones)

### Saving the scraped iPhones to a CSV file

In [None]:
# Save the DataFrame containing Konga iPhone products to a CSV file
# index=False ensures that the row indices are not included in the file
#df.to_csv("konga_iphones.csv", index=False)

### Reading the saved csv file

In [None]:
# Read the CSV file "konga_iphones.csv" back into a pandas DataFrame for analysis.
# The path is relative, so the file is looked up in the current working directory.
# NOTE(review): the cells above that would create this file are commented out,
# so it must already exist (presumably from an earlier run) — confirm.
DataFrame = pd.read_csv("konga_iphones.csv")

### Checking the length of the DataFrame

In [None]:
# Get the number of rows in the DataFrame to see how many iPhone products were loaded.
# This is a bare expression: a notebook shows its value when it is the last
# statement of a cell, while a plain script run discards it.
len(DataFrame)

### Checking the complete list of column names in the DataFrame

In [None]:
# Show every column name of the loaded DataFrame.
# The !r conversion prints the list's repr, so spaces or special characters
# inside a column name stay visible.
print(f"{DataFrame.columns.tolist()!r}")

### Renaming columns

In [None]:
# Give two columns more descriptive names (changed in place):
#   name  -> Phone_info   (the phone details)
#   price -> Phone_Price  (the price of the phone)
DataFrame.rename(
    columns=dict(name='Phone_info', price='Phone_Price'),
    inplace=True,
)

### Printing the updated list of column names after renaming

In [None]:
# Show the column names again to confirm the renaming.
# The !r conversion prints the list's repr, so every character of each name is visible.
print(f"{DataFrame.columns.tolist()!r}")

### Dropping the column we no longer need

In [None]:
# The 'special_price' column is no longer needed, so drop it in place
DataFrame.drop(labels='special_price', axis='columns', inplace=True)

In [None]:
# Show the column names that remain after dropping 'special_price'.
# The !r conversion prints the list's repr, so every character of each name is visible.
print(f"{DataFrame.columns.tolist()!r}")

### Defining the iPhone models

In [None]:
# Known iPhone model names, from the iPhone 3G up to recent models.
# The order of the original list is kept; each line holds one product family.
models = [
    'iPhone 3G', 'iPhone 3GS',
    'iPhone 4', 'iPhone 4s',
    'iPhone 5', 'iPhone 5c', 'iPhone 5s',
    'iPhone 6', 'iPhone 6 Plus', 'iPhone 6s', 'iPhone 6s Plus',
    'iPhone SE',
    'iPhone 7', 'iPhone 7 Plus',
    'iPhone 8', 'iPhone 8 Plus',
    'iPhone X', 'iPhone XS', 'iPhone XS Max', 'iPhone XR',
    'iPhone 11', 'iPhone 11 Pro', 'iPhone 11 Pro Max',
    'iPhone 12', 'iPhone 12 mini', 'iPhone 12 Pro', 'iPhone 12 Pro Max',
    'iPhone 13', 'iPhone 13 mini', 'iPhone 13 Pro', 'iPhone 13 Pro Max',
    'iPhone 14', 'iPhone 14 Plus', 'iPhone 14 Pro', 'iPhone 14 Pro Max',
    'iPhone 15', 'iPhone 15 Plus', 'iPhone 15 Pro', 'iPhone 15 Pro Max',
    'iPhone 16', 'iPhone 16 Plus', 'iPhone 16 Pro', 'iPhone 16 Pro Max', 'iPhone 16e',
    'iPhone 17', 'iPhone Air', 'iPhone 17 Pro', 'iPhone 17 Pro Max',
]

### Sorting the models

In [None]:
# Order the model names from longest to shortest so that, when matching,
# a longer name such as "iPhone 16 Pro Max" is tried before "iPhone 16".
# A copy is sorted, so the `models` list itself keeps its order.
sorted_models = list(models)
sorted_models.sort(key=len, reverse=True)

### Creating a new column 'Series' by mapping each phone's name to its corresponding iPhone model

In [None]:
# Create a new column 'Series' that labels each row with the iPhone model
# found in its 'Phone_info' text.


def _match_series(phone_name):
    """Return the first entry of ``sorted_models`` contained in *phone_name*.

    The check is a case-insensitive substring test. A missing name, or a name
    containing none of the models, yields 'Unknown'.
    """
    if pd.isna(phone_name):
        return 'Unknown'
    lowered = str(phone_name).lower()
    # sorted_models is ordered longest-first, so the first hit is the
    # longest model name present in the text.
    return next(
        (model for model in sorted_models if model.lower() in lowered),
        'Unknown',
    )


series_list = [_match_series(name) for name in DataFrame['Phone_info']]

# Add the resulting labels to the DataFrame
DataFrame['Series'] = series_list

### Identifying duplicates

In [None]:
# Collect every row that shares its 'Phone_info', 'Series' and 'Phone_Price'
# values with at least one other row. keep=False flags all members of a
# duplicate group, not only the second and later occurrences.
duplicates = DataFrame.loc[
    DataFrame.duplicated(subset=['Phone_info', 'Series', 'Phone_Price'], keep=False)
]

# Show the duplicate rows that were found
print(duplicates)

### Identifying unknown

In [None]:
# Select the rows whose Series could not be identified (marked as 'Unknown').
# .copy() makes the selection an independent DataFrame: the following cells
# assign to its columns, and assigning into a filtered selection of another
# frame is what pandas flags with SettingWithCopyWarning.
Unknown = DataFrame[DataFrame['Series'] == 'Unknown'].copy()

# Display all unknown series entries
print(Unknown)

### Fixing typos and formatting errors

In [None]:
# Repair known misspellings and spacing problems in the names of the Unknown rows.
# Each key is applied as a regular-expression pattern, in the order listed.
typo_corrections = {
    r'iPphone': 'iPhone',          # doubled "P"
    r'iPhone16': 'iPhone 16',      # missing space before the model number
    r'iPhone one': 'iPhone 13',    # "iPhone one" is mapped to iPhone 13
    r'iPhone  X': 'iPhone X',      # two spaces reduced to one
}

for pattern, replacement in typo_corrections.items():
    Unknown['Phone_info'] = Unknown['Phone_info'].str.replace(
        pattern, replacement, regex=True
    )

# Look the corrected names up again in sorted_models (longest names first);
# a missing name or a name without a known model stays 'Unknown'.
series_list = []
for raw_name in Unknown['Phone_info']:
    haystack = None if pd.isna(raw_name) else str(raw_name).lower()
    series_list.append(
        'Unknown' if haystack is None
        else next((m for m in sorted_models if m.lower() in haystack), 'Unknown')
    )

Unknown['Series'] = series_list

# Check the results
print(Unknown[['Phone_info', 'Phone_Price', 'Series']])

### Assigning a new model label (Ipod Touch)

In [None]:
# Label the rows whose name mentions "Ipod Touch" (in any letter case) as
# 'Ipod Touch'. Rows with a missing name are never selected (na=False).
ipod_rows = Unknown['Phone_info'].str.contains('Ipod Touch', case=False, na=False)
Unknown.loc[ipod_rows, 'Series'] = 'Ipod Touch'

# Check the result
print(Unknown[['Phone_info', 'Phone_Price', 'Series']])

### Updating Main DataFrame with Corrected Unknown Entries

In [None]:
# Replace the old Unknown rows with their corrected versions: keep only the
# rows that already had a Series, then append the corrected rows after them.
# ignore_index=True renumbers the combined rows from 0.
DataFrame = pd.concat(
    [DataFrame.loc[DataFrame['Series'].ne('Unknown')], Unknown],
    ignore_index=True,
)

#### Checking if there is any unknown left

In [None]:
# Look for rows that are still labelled 'Unknown' after the corrections
new_unknown = DataFrame.loc[DataFrame['Series'].eq('Unknown')]

# Report how many such rows remain (only the count is printed)
print(f"Number of remaining Unknown entries: {len(new_unknown)}")

### Removing duplicates

In [None]:
# Remove duplicate rows from DataFrame based on the 'Series' and 'Phone_Price'
# columns, keeping only the first occurrence of each combination.
# .copy() detaches the result so that later column assignments (the float
# conversion of 'Phone_Price') work on an independent frame rather than on
# one pandas may still treat as a selection of DataFrame
# (SettingWithCopyWarning).
# NOTE(review): rows are compared on 'Series' and 'Phone_Price' only, while
# the duplicate check earlier in this file also includes 'Phone_info';
# different listings with the same model and price are therefore collapsed
# into one row — confirm this is intended.
df_clean = DataFrame.drop_duplicates(subset=['Series', 'Phone_Price']).copy()

### Checking all iPhone and other device models present

In [None]:
# Continue the analysis under the name df1 (the same object as df_clean, not a copy)
df1 = df_clean

# List each distinct Series label once, in sorted order, to see which
# iPhone and other device models are present
print(sorted(df1['Series'].drop_duplicates()))

### Inspect Data Types of Columns

In [None]:
# Display the data type of each column in df1.
# This shows which columns are numeric and which are object (string) —
# in particular how 'Phone_Price' was read before it is converted to float.
print(df1.dtypes)

### Converting Phone_Price to Numeric

In [None]:
# Cast 'Phone_Price' to 64-bit floats so that numeric operations can be applied
df1['Phone_Price'] = df1['Phone_Price'].astype('float64')

# Confirm the cast by printing the column's data type
print(df1.dtypes['Phone_Price'])

### Saving Cleaned DataFrame to CSV file

In [None]:
# Save the cleaned and processed DataFrame to a CSV file named 'cleaned_konga_iphone.csv'.
# The path is relative, so the file is written to the current working directory.
# index=False ensures that the row indices are not included in the saved file.
df1.to_csv('cleaned_konga_iphone.csv', index=False)