In [1]:
import pandas as pd
import numpy as np
from faker import Faker

# Faker instance; used further down to fabricate a street and zip code
# for orders whose city has no customer
fake = Faker()

# Read the raw dataset (the file is decoded as latin-1)
df = pd.read_csv('DataCoSupplyChainDataset.csv', encoding='latin-1')

# One row per distinct combination of customer id and address fields
customer_columns = [
    'Customer Id',
    'Customer City',
    'Customer Country',
    'Customer State',
    'Customer Zipcode',
    'Customer Street',
]
customer_locations = df[customer_columns].drop_duplicates()

# Attach coordinates drawn uniformly at random; they are not derived from the address
n_customers = len(customer_locations)
customer_locations['Latitude'] = np.random.uniform(low=-90.0, high=90.0, size=n_customers)
customer_locations['Longitude'] = np.random.uniform(low=-180.0, high=180.0, size=n_customers)

# One row per distinct order destination; no street column is selected for orders
order_columns = ['Order Id', 'Order City', 'Order Country', 'Order State']
order_locations = df[order_columns].drop_duplicates()

# Number of order rows whose city matched no customer; incremented by generate_random_street
unmatched_city_count = 0

def generate_random_street(row):
    """Choose a street and zip code for one order row.

    Customers in the global ``customer_locations`` whose 'Customer City' equals
    the row's 'Order City' are the candidates. If there is at least one, a
    single candidate is sampled and its street and zip code are returned.
    Otherwise the global ``unmatched_city_count`` is incremented and a
    Faker-generated street name and zip code are returned instead.

    Returns a ``(street, zipcode)`` tuple.
    """
    global unmatched_city_count

    same_city = customer_locations['Customer City'] == row['Order City']
    candidates = customer_locations[same_city]

    # No customer in this city: count the miss and fall back to fabricated values
    if len(candidates) == 0:
        unmatched_city_count += 1
        return fake.street_name(), fake.zipcode()

    # Borrow the address of one randomly chosen customer from the same city
    picked = candidates.sample(n=1)
    street = picked['Customer Street'].iloc[0]
    zipcode = picked['Customer Zipcode'].iloc[0]
    return street, zipcode

# Derive a street and zip code for every order row; 'expand' splits the returned
# tuple into two columns labelled 0 and 1
generated_data = order_locations.apply(generate_random_street, axis=1, result_type='expand')
order_locations['Order Street'] = generated_data[0]
order_locations['Order Zipcode'] = generated_data[1]

# Tag ids with their origin: 'O' for orders, 'C' for customers
order_locations['Order Id'] = 'O' + order_locations['Order Id'].astype(str)
customer_locations['Customer Id'] = 'C' + customer_locations['Customer Id'].astype(str)

# Map both frames onto one shared set of column names
customer_locations = customer_locations.rename(columns={
    'Customer Id': 'Location Id',
    'Customer City': 'City',
    'Customer Country': 'Country',
    'Customer State': 'State',
    'Customer Zipcode': 'Zipcode',
    'Customer Street': 'Street'
})
order_locations = order_locations.rename(columns={
    'Order Id': 'Location Id',
    'Order City': 'City',
    'Order Country': 'Country',
    'Order State': 'State',
    'Order Street': 'Street',
    'Order Zipcode': 'Zipcode'
})

# Record which source each row came from
customer_locations['Location Type'] = 'Customer'
order_locations['Location Type'] = 'Order'

# Stack customers on top of orders, drop exact duplicate rows and renumber.
# Order rows have no Latitude/Longitude columns, so those are NaN after the concat.
location_table = (
    pd.concat([customer_locations, order_locations], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [2]:
# Preview the first rows of the combined customer + order location table
location_table.head()

Unnamed: 0,Location Id,City,Country,State,Zipcode,Street,Latitude,Longitude,Location Type
0,C20755,Caguas,Puerto Rico,PR,725,5365 Noble Nectar Island,-3.101663,-120.8446,Customer
1,C19492,Caguas,Puerto Rico,PR,725,2679 Rustic Loop,-12.287946,82.601205,Customer
2,C19491,San Jose,EE. UU.,CA,95125,8510 Round Bear Gate,-42.255504,-176.323659,Customer
3,C19490,Los Angeles,EE. UU.,CA,90027,3200 Amber Bend,-3.656592,59.780074,Customer
4,C19489,Caguas,Puerto Rico,PR,725,8671 Iron Anchor Corners,64.075809,-59.122291,Customer


In [3]:
# Read the supplier information from the Excel file
supplier_df = pd.read_excel('suppliers.xlsx')

# Display the supplier table
supplier_df

Unnamed: 0,Supplier ID,Department Id,Supplier Name,Contact Name,Contact Email,Phone Number
0,S1,D2,Reynolds-Hoffman,Heather Goodman,kylie53@example.com,556-722-7513
1,S2,D7,Walker-Wilkinson,Scott Gardner,johnlynch@example.net,809-964-1981
2,S3,D6,"Sanders, Hampton and Young",Melanie Dillon,tfox@example.net,227-120-5173
3,S4,D3,Steele Inc,Kevin Jackson DDS,andersonrichard@example.net,156-863-2189
4,S5,D4,Rogers LLC,Justin Smith,ashley67@example.org,763-776-9327
5,S6,D5,Jimenez-Meadows,Joseph Williams,pauladillon@example.org,592-964-6448
6,S7,D8,"Cox, Freeman and Mathis",Devin Blackburn DVM,denise15@example.net,523-802-2428
7,S8,D9,"Davies, Acevedo and Wiggins",Michael Gomez,tjones@example.net,384-281-2789
8,S9,D10,Washington-Randall,James Williams,andrea44@example.net,465-689-9121
9,S10,D12,"Mccoy, Phillips and Barrett",Lindsey Patel,dramirez@example.com,920-146-5883


In [4]:
def generate_random_lat_long():
    """Return a ``(latitude, longitude)`` pair drawn uniformly at random.

    Latitude is drawn from [-90, 90) and longitude from [-180, 180), in that
    order, using NumPy's global random state.
    """
    latitude = np.random.uniform(low=-90.0, high=90.0)
    longitude = np.random.uniform(low=-180.0, high=180.0)
    return latitude, longitude

In [5]:
def add_random_location_record(df, location_table, id_column, location_type):
    """Create one location record per unique ID by copying a random existing address.

    For every distinct value in ``df[id_column]`` one row is sampled from
    ``location_table`` and its City, Country, State, Zipcode and Street are
    copied. Latitude and Longitude are drawn fresh with
    ``generate_random_lat_long`` rather than taken from the sampled row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``id_column``.
    location_table : pandas.DataFrame
        Frame to sample addresses from; must contain the City, Country,
        State, Zipcode and Street columns.
    id_column : str
        Name of the column in ``df`` holding the IDs.
    location_type : str
        Value written to 'Location Type' for every new record.

    Returns
    -------
    pandas.DataFrame
        One row per unique ID, in order of first appearance. When ``df`` has
        no IDs the result is an empty frame that still has these columns.
    """
    columns = [
        'Location Id', 'City', 'Country', 'State', 'Zipcode', 'Street',
        'Latitude', 'Longitude', 'Location Type',
    ]

    records = []
    for id_value in df[id_column].unique():
        # Randomly select a record from location_table for this ID
        random_location = location_table.sample(n=1).iloc[0]

        # Generate random latitude and longitude
        latitude, longitude = generate_random_lat_long()

        records.append({
            'Location Id': id_value,
            'City': random_location['City'],
            'Country': random_location['Country'],
            'State': random_location['State'],
            'Zipcode': random_location['Zipcode'],
            'Street': random_location['Street'],
            'Latitude': latitude,
            'Longitude': longitude,
            'Location Type': location_type,
        })

    # Build the frame once from the collected rows: concatenating inside the loop
    # re-copies every earlier row on each iteration and starts from an empty frame.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Generate new records for each supplier: one row per unique 'Supplier ID',
# tagged with Location Type 'Supplier'
supplier_records = add_random_location_record(supplier_df, location_table, 'Supplier ID', 'Supplier')
# Display the generated supplier records
supplier_records

Unnamed: 0,Location Id,City,Country,State,Zipcode,Street,Latitude,Longitude,Location Type
0,S1,Portland,Estados Unidos,Oregón,97223,4166 Golden Willow Circuit,-8.729166,65.573562,Supplier
1,S2,Peoria,EE. UU.,AZ,85345,318 Merry Creek Mountain,57.119049,-63.897042,Supplier
2,S3,Viena,Austria,Viena,24896,Taylor Street,-8.212089,13.781272,Supplier
3,S4,Redditch,Reino Unido,Inglaterra,1064,Randall Falls,-50.734378,-80.866009,Supplier
4,S5,Cleveland,Estados Unidos,Ohio,44105,3482 Fallen Deer Isle,-9.06371,96.89144,Supplier
5,S6,Depok,Indonesia,Yogyakarta,88308,Sandra Ridge,-89.149095,-66.814392,Supplier
6,S7,Cologne,Alemania,Renania del Norte-Westfalia,86043,Lamb Coves,-53.959067,-84.64163,Supplier
7,S8,Mianyang,China,Sichuan,79261,Garcia Fields,52.259813,118.885,Supplier
8,S9,London,Reino Unido,Inglaterra,79485,Stephen Passage,19.788978,-12.469348,Supplier
9,S10,Moreno Valley,EE. UU.,CA,92553,4055 Stony Anchor Forest,20.667961,71.114299,Supplier


In [7]:
def add_single_location_record_for_departments(department_df, location_table, location_type, jitter=0.001):
    """Place every department at one shared random address with slightly varied coordinates.

    A single row is sampled from ``location_table`` and its City, Country,
    State, Zipcode and Street are reused for all departments. One base
    coordinate pair is drawn with ``generate_random_lat_long``; each department
    then gets that base plus an independent uniform offset in
    ``[-jitter, jitter)`` on both axes. Offsets are not clamped, so a base
    latitude within ``jitter`` of +/-90 can end up slightly outside that range.

    Parameters
    ----------
    department_df : pandas.DataFrame
        Frame with a 'Department ID' column.
    location_table : pandas.DataFrame
        Frame to sample the shared address from.
    location_type : str
        Value written to 'Location Type' for every record.
    jitter : float, optional
        Half-width of the per-department coordinate offset, in degrees.
        Defaults to 0.001, the value previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per unique department ID, in order of first appearance. When
        there are no departments the result is an empty frame that still has
        these columns.
    """
    columns = [
        'Location Id', 'City', 'Country', 'State', 'Zipcode', 'Street',
        'Latitude', 'Longitude', 'Location Type',
    ]

    # Randomly select a record from location_table for the department location
    random_location = location_table.sample(n=1).iloc[0]

    # Generate base latitude and longitude
    base_latitude, base_longitude = generate_random_lat_long()

    records = []
    for dept_id in department_df['Department ID'].unique():
        # Offsets come from np.random (imported in the first cell) rather than the
        # stdlib `random` module, which is only imported in a later cell; this
        # also matches the generator used by generate_random_lat_long.
        latitude = base_latitude + np.random.uniform(-jitter, jitter)
        longitude = base_longitude + np.random.uniform(-jitter, jitter)

        records.append({
            'Location Id': dept_id,
            'City': random_location['City'],
            'Country': random_location['Country'],
            'State': random_location['State'],
            'Zipcode': random_location['Zipcode'],
            'Street': random_location['Street'],
            'Latitude': latitude,
            'Longitude': longitude,
            'Location Type': location_type,
        })

    # Fixed column list keeps the schema stable even when there are no departments
    return pd.DataFrame(records, columns=columns)

In [8]:
import random

# Distinct department ids taken from the supplier table, with the column renamed
# to the 'Department ID' spelling that the department helper reads
department_df = (
    supplier_df[['Department Id']]
    .drop_duplicates()
    .rename(columns={'Department Id': 'Department ID'})
)

# One record per department, all sharing a single address with slightly different coordinates
department_records = add_single_location_record_for_departments(department_df, location_table, 'Department')
department_records

Unnamed: 0,Location Id,City,Country,State,Zipcode,Street,Latitude,Longitude,Location Type
0,D2,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.897068,-145.581358,Department
1,D7,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.895951,-145.580499,Department
2,D6,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.895799,-145.581316,Department
3,D3,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.897358,-145.58105,Department
4,D4,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.896965,-145.581417,Department
5,D5,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.895682,-145.581913,Department
6,D8,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.897222,-145.581094,Department
7,D9,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.897324,-145.581723,Department
8,D10,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.89723,-145.580179,Department
9,D12,Little Rock,Estados Unidos,Arkansas,21861,Keith Locks,12.896049,-145.581604,Department


In [9]:
# Append the supplier and department records to the location table,
# drop exact duplicate rows and renumber the index from zero
location_table = (
    pd.concat([location_table, supplier_records, department_records], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [10]:
# Preview the first rows of the location table after adding supplier and department records
location_table.head()

Unnamed: 0,Location Id,City,Country,State,Zipcode,Street,Latitude,Longitude,Location Type
0,C20755,Caguas,Puerto Rico,PR,725,5365 Noble Nectar Island,-3.101663,-120.8446,Customer
1,C19492,Caguas,Puerto Rico,PR,725,2679 Rustic Loop,-12.287946,82.601205,Customer
2,C19491,San Jose,EE. UU.,CA,95125,8510 Round Bear Gate,-42.255504,-176.323659,Customer
3,C19490,Los Angeles,EE. UU.,CA,90027,3200 Amber Bend,-3.656592,59.780074,Customer
4,C19489,Caguas,Puerto Rico,PR,725,8671 Iron Anchor Corners,64.075809,-59.122291,Customer


In [11]:
# Cast the Zipcode column to integer.
# NOTE(review): an integer cast cannot keep leading zeros (e.g. '00725' becomes 725)
# and raises on missing values — confirm this is acceptable for downstream use.
location_table['Zipcode'] = location_table['Zipcode'].astype(int)

In [12]:
# Show the dtype of every column after the Zipcode cast
location_table.dtypes

Location Id       object
City              object
Country           object
State             object
Zipcode            int32
Street            object
Latitude         float64
Longitude        float64
Location Type     object
dtype: object

In [13]:
# Total number of rows in the location table
len(location_table)

86426

In [14]:
# Number of rows per 'Location Type' value
location_table['Location Type'].value_counts()

Location Type
Order         65752
Customer      20652
Supplier         11
Department       11
Name: count, dtype: int64

In [15]:
# Row count of location_table (repeat of the earlier check)
len(location_table)

86426

In [16]:
# Give 'Order Id' in df the 'O' prefix so it matches the 'Location Id' of order rows.
# Only ids that do not already start with 'O' are prefixed: re-running this cell
# would otherwise produce 'OO…' ids, and the left merge below would then attach
# no Market to any row without raising an error.
order_ids = df['Order Id'].astype(str)
df['Order Id'] = order_ids.where(order_ids.str.startswith('O'), 'O' + order_ids)

# Group by 'Order Id' and keep the first non-null value of each column per order,
# which makes 'Order Id' unique on the right-hand side of the merge
grouped_df = df.groupby('Order Id').first().reset_index()

# Left-merge the 'Market' column onto location_table. Rows whose 'Location Id' is
# not an order id (customers, suppliers, departments) get NaN for 'Order Id' and 'Market'.
merged_df = pd.merge(location_table, grouped_df[['Order Id', 'Market']], how='left', left_on='Location Id', right_on='Order Id')

In [18]:
# Row count after the merge; equal to len(location_table) when the left merge added no rows
len(merged_df)

86426

In [26]:
# Rows whose 'Location Id' differs from the merged 'Order Id'. Rows with no matching
# order have NaN in 'Order Id', and NaN compares unequal to everything, so they are included.
merged_df[merged_df['Location Id'] != merged_df['Order Id']].head()

Unnamed: 0,Location Id,City,Country,State,Zipcode,Street,Latitude,Longitude,Location Type,Order Id,Market
0,C20755,Caguas,Puerto Rico,PR,725,5365 Noble Nectar Island,-3.101663,-120.8446,Customer,,
1,C19492,Caguas,Puerto Rico,PR,725,2679 Rustic Loop,-12.287946,82.601205,Customer,,
2,C19491,San Jose,EE. UU.,CA,95125,8510 Round Bear Gate,-42.255504,-176.323659,Customer,,
3,C19490,Los Angeles,EE. UU.,CA,90027,3200 Amber Bend,-3.656592,59.780074,Customer,,
4,C19489,Caguas,Puerto Rico,PR,725,8671 Iron Anchor Corners,64.075809,-59.122291,Customer,,


In [23]:
# (rows, columns) of the merged frame
merged_df.shape

(86426, 11)

In [27]:
# Remove the 'Order Id' column that the merge added; 'Location Id' already holds the id.
# errors='ignore' makes re-running this cell a no-op instead of raising KeyError
# once the column has been dropped.
merged_df = merged_df.drop(columns=['Order Id'], errors='ignore')

In [28]:
# Column names of merged_df after dropping 'Order Id'
merged_df.columns

Index(['Location Id', 'City', 'Country', 'State', 'Zipcode', 'Street',
       'Latitude', 'Longitude', 'Location Type', 'Market'],
      dtype='object')

In [None]:
# Save the final location table, with the Market column added to it, to an Excel file
merged_df.to_excel('LocationTable.xlsx', index=False)