In [27]:
# Importing libraries
import pandas as pd, numpy as np, seaborn as sns, matplotlib.pyplot as plt
import time
import warnings 
import statsmodels.api as sm
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, pairwise_distances
from sklearn.decomposition import PCA
from scipy.stats import zscore
from pandas.plotting import parallel_coordinates
from matplotlib.lines import Line2D
from geopy.geocoders import Nominatim
from IPython.display import display


# Suppress all warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn"), warnings.filterwarnings("ignore", category=DeprecationWarning), 
warnings.filterwarnings('ignore', category=FutureWarning, module='seaborn')

# Step 1: Load the dataset
data = pd.read_csv(r"C:\Users\claud\OneDrive\Desktop\DataMining\homework\Project\shopping_trends.csv")

# Step 2: Clean up column names by removing any leading or trailing spaces
print("\n" + "*" * 70, "Step 2", "*" * 70 + "\n")
data.columns = data.columns.str.strip().str.replace(' ', '_')
data = data.drop(columns=['Customer_ID'], errors='ignore')
print('Lines and Columns:', data.shape)

# Step 3: Transforming and Creating some variables
print("\n" + "*" * 70, "Step 3", "*" * 70 + "\n")
# Creating Dictionary for Frequency_of_Purchases variable
freq_map = {"Weekly": 52, "Fortnightly": 26, "Bi-Weekly": 26,  "Monthly": 12, "Quarterly": 4, "Every 3 Months": 4, "Annually": 1}
data["Frequency_Purch_per_year"] = data["Frequency_of_Purchases"].map(freq_map)
# Defining a Subscrition_Status as a Num (Yes=1, No=0)
data['Subscription_Num'] = data['Subscription_Status'].map({'Yes': 1, 'No': 0})
display(data)


********************************************************************** Step 2 **********************************************************************

Lines and Columns: (3900, 18)

********************************************************************** Step 3 **********************************************************************



Unnamed: 0,Age,Gender,Item_Purchased,Category,Purchase_Amount_(USD),Location,Size,Color,Season,Review_Rating,Subscription_Status,Payment_Method,Shipping_Type,Discount_Applied,Promo_Code_Used,Previous_Purchases,Preferred_Payment_Method,Frequency_of_Purchases,Frequency_Purch_per_year,Subscription_Num
0,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Credit Card,Express,Yes,Yes,14,Venmo,Fortnightly,26,1
1,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Bank Transfer,Express,Yes,Yes,2,Cash,Fortnightly,26,1
2,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Cash,Free Shipping,Yes,Yes,23,Credit Card,Weekly,52,1
3,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,PayPal,Next Day Air,Yes,Yes,49,PayPal,Weekly,52,1
4,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Cash,Free Shipping,Yes,Yes,31,PayPal,Annually,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,40,Female,Hoodie,Clothing,28,Virginia,L,Turquoise,Summer,4.2,No,Cash,2-Day Shipping,No,No,32,Venmo,Weekly,52,0
3896,52,Female,Backpack,Accessories,49,Iowa,L,White,Spring,4.5,No,PayPal,Store Pickup,No,No,41,Bank Transfer,Bi-Weekly,26,0
3897,46,Female,Belt,Accessories,33,New Jersey,L,Green,Spring,2.9,No,Credit Card,Standard,No,No,24,Venmo,Quarterly,4,0
3898,44,Female,Shoes,Footwear,77,Minnesota,S,Brown,Summer,3.8,No,PayPal,Express,No,No,24,Venmo,Weekly,52,0


In [29]:
from geopy.geocoders import Nominatim
import time

# Configure the geolocator
geolocator = Nominatim(user_agent="geoapi_states")

# Create lists to store the results
latitude_list = []
longitude_list = []

# Variables for tracking progress
total = len(data['Location'])  # Total number of rows in the dataset
errored_indices = []           # List to store indices of rows that caused errors

# Loop to process each location in the 'Location' column
for index, location_name in enumerate(data['Location'], start=1):
    # Display the processing status
    print(f"\rStatus: Processing. Item: {index}/{total}.", end="")  # Overwrite with "Processing"
    
    try:
        # Attempt to retrieve the location
        location = geolocator.geocode(location_name)
        if location:
            latitude_list.append(location.latitude)
            longitude_list.append(location.longitude)
        else:
            latitude_list.append(None)
            longitude_list.append(None)
    except Exception as e:
        # Log the index of the row that caused an error
        errored_indices.append(index)
        latitude_list.append(None)
        longitude_list.append(None)

    # Update the status to "Completed" after processing
    print(f"\rStatus: Completed. Item: {index}/{total}.", end="")  # Overwrite with "Completed"
    time.sleep(0.1)  # Avoid exceeding API limits

# After the loop is complete, print the summary
print("\n")  # Move to a new line
if errored_indices:
    print(f"Errors occurred at indices: {', '.join(map(str, errored_indices))}. All other rows processed successfully.")
else:
    print("All rows were processed successfully.")

Status: Completed. Item: 3900/3900..

All rows were processed successfully.


In [31]:
# Add the new columns to the original DataFrame
data['Latitude'] = latitude_list
data['Longitude'] = longitude_list

# Save the result to a new CSV file
data.to_csv("dataset_with_coordinates.csv", index=False)