# Sales Dataset Revised V3

* Regroup the Region based on client group requirements
* Rename the columns for PostgreSQL data processing
* Output the dataset

## Regroup the Region based on original data and client requirements

* Read in the cleaned dataset and the original dataset that includes the original groups
* Merge the original region column to the current dataset
* Regroup the region based on client requirements

In [None]:
import pandas as pd

In [None]:
# Read in the current cleaned dataset (the original dataset is read in the next cell)
current_filepath = 'cleaned_data_V2.csv'
current = pd.read_csv(current_filepath)

print(current.head())


        C1AccountNo        CXRecords  Year        1         2       3    4  \
0  A2061733056$=L._  91470FS$KD_> W<  2018     0.00      0.00    0.00  0.0   
1  A2061733057#*N-2  91470GQ%![I! W<  2018  1730.06  10023.32  108.18  0.0   
2  A2061733058)U90^  91470I8%.+ML W<  2018     0.00      0.00    0.00  0.0   
3  A2061733059#5)>2  91470KJ(,V-0 W<  2018     0.00      0.00    0.00  0.0   
4  A2061733060)(W<%  91470M0#>H/T W<  2018     0.00    302.72    0.00  0.0   

        5    6    7  ...       9      10     11       12  q1_calculated  \
0    0.00  0.0  0.0  ...  639.00    0.00   0.00  1438.00           0.00   
1    0.00  0.0  0.0  ...    0.00  282.60   0.00   861.14       11861.56   
2  306.86  0.0  0.0  ...    0.00  211.37  66.01     0.00           0.00   
3    0.00  0.0  0.0  ...  153.46    0.00   0.00     0.00           0.00   
4    0.00  0.0  0.0  ...    0.00    0.00   0.00     0.00         302.72   

   q2_calculated  q3_calculated  q4_calculated  annual_sales_calculated  \
0    

In [None]:
# Read in the original dataset, which still carries the un-regrouped 'Comp Terr' column
og_filepath = 'final_data_with_origin_and_calculate.csv'
og = pd.read_csv(og_filepath)

print(og.head())

   Unnamed: 0       C1AccountNo        CXRecords Comp Terr   Latitude  Year  \
0           0  A2061733054*,CNR  91470CV(.{82 W<      MCOW   0.000000  2018   
1           1  A2061733056$=L._  91470FS$KD_> W<   Western  37.773032  2018   
2           2  A2061733057#*N-2  91470GQ%![I! W<   Central  42.135659  2018   
3           3  A2061733058)U90^  91470I8%.+ML W<   Western  33.436405  2018   
4           4  A2061733059#5)>2  91470KJ(,V-0 W<   Central   0.000000  2018   

         1         2       3    4  ...      q2      q3       q4     sales  \
0     0.00      0.00    0.00  0.0  ...    0.00    0.00     0.00      0.00   
1     0.00      0.00    0.00  0.0  ...    0.00  639.00  1438.00   4154.00   
2  1730.06  10023.32  108.18  0.0  ...    0.00    0.00  1143.74  26010.60   
3     0.00      0.00    0.00  0.0  ...  306.86    0.00   277.38   1168.48   
4     0.00      0.00    0.00  0.0  ...    0.00  153.46     0.00    306.92   

   individual  q1_calculated  q2_calculated  q3_calculated  q4

In [None]:
# Record the row count before the territory lookup; later cells print it again for comparison
print(len(current))

42034


In [None]:
# Helper that looks up a row's original territory in the 'og' dataframe
def find_first_match(row, og_df):
    """Return the first 'Comp Terr' in ``og_df`` that belongs to ``row``.

    A row of ``og_df`` matches when both its 'C1AccountNo' and its 'CXRecords'
    equal the corresponding values of ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide the keys 'C1AccountNo' and 'CXRecords'.
    og_df : pandas.DataFrame
        Must provide the columns 'C1AccountNo', 'CXRecords' and 'Comp Terr'.

    Returns
    -------
    The 'Comp Terr' value of the first matching row, or None when no row matches.
    """
    same_account = og_df['C1AccountNo'] == row['C1AccountNo']
    same_record = og_df['CXRecords'] == row['CXRecords']
    candidates = og_df.loc[same_account & same_record, 'Comp Terr']

    return None if candidates.empty else candidates.iloc[0]

# Look up each row's original territory with a single vectorized left join.
# A row-wise apply would rescan the whole of `og` once for every row of `current`.
key_cols = ['C1AccountNo', 'CXRecords']

# Keep only the first 'Comp Terr' per key pair, i.e. the value that taking the first
# matching row would yield. Rows of `og` with a missing key are discarded because an
# equality comparison never matches NaN, whereas merge would pair NaN keys together.
first_terr = (
    og.dropna(subset=key_cols)
      .drop_duplicates(subset=key_cols, keep='first')
      .loc[:, key_cols + ['Comp Terr']]
)

# validate='many_to_one' makes pandas raise if the lookup keys are not unique,
# which guarantees the join cannot duplicate rows of `current`.
matched = current[key_cols].merge(first_terr, on=key_cols, how='left', validate='many_to_one')

# The left join keeps the row order of `current`, so the result is assigned by position.
# Rows without a match end up as NaN, which isnull() reports as missing.
current['Comp Terr Imputed'] = matched['Comp Terr'].to_numpy()

print(current[['C1AccountNo', 'CXRecords', 'Comp Terr Imputed']].head())
print(len(current))

        C1AccountNo        CXRecords Comp Terr Imputed
0  A2061733056$=L._  91470FS$KD_> W<           Western
1  A2061733057#*N-2  91470GQ%![I! W<           Central
2  A2061733058)U90^  91470I8%.+ML W<           Western
3  A2061733059#5)>2  91470KJ(,V-0 W<           Central
4  A2061733060)(W<%  91470M0#>H/T W<           Midwest
42034


In [None]:
# Verify the lookup: collect every row that was left without a territory
unmatched_mask = current['Comp Terr Imputed'].isnull()
missing_comp_terr = current.loc[unmatched_mask]

print(f"Number of rows with missing 'Comp Terr Imputed': {missing_comp_terr.shape[0]}")
print(missing_comp_terr[['C1AccountNo', 'CXRecords', 'Comp Terr Imputed']])

Number of rows with missing 'Comp Terr Imputed': 0
Empty DataFrame
Columns: [C1AccountNo, CXRecords, Comp Terr Imputed]
Index: []


In [None]:
# Examine the original groupings: list the distinct 'Comp Terr Imputed' labels and count them
unique_territories = current['Comp Terr Imputed'].unique()
print(unique_territories)
print(len(unique_territories))

['Western' 'Central' 'Midwest' 'Northeast' 'Southern' 'Minnesota' 'INTL'
 'Unknown' 'California' 'East' 'West_x000D_\n' 'MN-West' 'Washington'
 'MCOW' 'North']
15


In [None]:
# Examine the number of rows in each original territory category
# (value_counts lists the most frequent category first)
category_counts = current['Comp Terr Imputed'].value_counts()
print(category_counts)

Comp Terr Imputed
Southern         9447
Central          9237
Northeast        8367
Western          7725
Midwest          4125
Unknown          1080
INTL              804
California        733
Minnesota         255
MN-West           156
Washington         62
West_x000D_\n      18
MCOW               13
East               11
North               1
Name: count, dtype: int64


In [None]:
# Regroup the regions based on client response
def map_territory(territory):
    """Translate an original 'Comp Terr' label into its client-requested region group.

    Parameters
    ----------
    territory : object
        An original territory label, typically a string.

    Returns
    -------
    The name of the group that lists ``territory`` as a member. A label that
    belongs to no group (for example 'North') is returned unchanged.
    """
    # (group name, original labels folded into that group), checked top to bottom
    grouping = (
        ('West', ('Western', 'California', 'West_x000D_\n', 'Washington')),
        ('Midwest', ('Midwest', 'MN-West', 'Minnesota')),
        ('Central', ('Central',)),
        ('Northeast', ('Northeast', 'East')),
        ('South', ('Southern',)),
        ('Special', ('MCOW',)),
        ('International', ('INTL',)),
        ('Unknown', ('Unknown',)),
    )

    for group_name, members in grouping:
        if territory in members:
            return group_name

    return territory

# Regroup every row, then show a sample, the size of each new group and the row count
current['Comp Terr Grouped'] = current['Comp Terr Imputed'].map(map_territory)

sample_columns = ['C1AccountNo', 'CXRecords', 'Comp Terr Imputed', 'Comp Terr Grouped']
print(current[sample_columns].head())

territory_counts = current['Comp Terr Grouped'].value_counts()
print(territory_counts)
print(len(current))

        C1AccountNo        CXRecords Comp Terr Imputed Comp Terr Grouped
0  A2061733056$=L._  91470FS$KD_> W<           Western              West
1  A2061733057#*N-2  91470GQ%![I! W<           Central           Central
2  A2061733058)U90^  91470I8%.+ML W<           Western              West
3  A2061733059#5)>2  91470KJ(,V-0 W<           Central           Central
4  A2061733060)(W<%  91470M0#>H/T W<           Midwest           Midwest
South            9447
Central          9237
West             8538
Northeast        8378
Midwest          4536
Unknown          1080
International     804
Special            13
North               1
Name: Comp Terr Grouped, dtype: int64
42034


In [None]:
# The regrouped counts still list a 'North' label; fold it into 'Northeast'
current['Comp Terr Grouped'] = current['Comp Terr Grouped'].replace({'North': 'Northeast'})

# Recount the groups to confirm 'North' is gone
territory_counts_new = current['Comp Terr Grouped'].value_counts()
print(territory_counts_new)



South            9447
Central          9237
West             8538
Northeast        8379
Midwest          4536
Unknown          1080
International     804
Special            13
Name: Comp Terr Grouped, dtype: int64


In [None]:
# Preview the dataframe with the added territory columns before the next cell drops columns
print(current.head())

        C1AccountNo        CXRecords  Year        1         2       3    4  \
0  A2061733056$=L._  91470FS$KD_> W<  2018     0.00      0.00    0.00  0.0   
1  A2061733057#*N-2  91470GQ%![I! W<  2018  1730.06  10023.32  108.18  0.0   
2  A2061733058)U90^  91470I8%.+ML W<  2018     0.00      0.00    0.00  0.0   
3  A2061733059#5)>2  91470KJ(,V-0 W<  2018     0.00      0.00    0.00  0.0   
4  A2061733060)(W<%  91470M0#>H/T W<  2018     0.00    302.72    0.00  0.0   

        5    6    7  ...     11       12  q1_calculated  q2_calculated  \
0    0.00  0.0  0.0  ...   0.00  1438.00           0.00           0.00   
1    0.00  0.0  0.0  ...   0.00   861.14       11861.56           0.00   
2  306.86  0.0  0.0  ...  66.01     0.00           0.00         306.86   
3    0.00  0.0  0.0  ...   0.00     0.00           0.00           0.00   
4    0.00  0.0  0.0  ...   0.00     0.00         302.72           0.00   

   q3_calculated  q4_calculated  annual_sales_calculated  Territory  \
0         639.0

In [None]:
# Remove the intermediate lookup column and the existing 'Territory' column;
# a later cell renames 'Comp Terr Grouped' to 'Territory'
columns_to_drop = ['Comp Terr Imputed', 'Territory']
current_dropped = current.drop(columns=columns_to_drop)

print(current_dropped.head())

        C1AccountNo        CXRecords  Year        1         2       3    4  \
0  A2061733056$=L._  91470FS$KD_> W<  2018     0.00      0.00    0.00  0.0   
1  A2061733057#*N-2  91470GQ%![I! W<  2018  1730.06  10023.32  108.18  0.0   
2  A2061733058)U90^  91470I8%.+ML W<  2018     0.00      0.00    0.00  0.0   
3  A2061733059#5)>2  91470KJ(,V-0 W<  2018     0.00      0.00    0.00  0.0   
4  A2061733060)(W<%  91470M0#>H/T W<  2018     0.00    302.72    0.00  0.0   

        5    6    7  ...       9      10     11       12  q1_calculated  \
0    0.00  0.0  0.0  ...  639.00    0.00   0.00  1438.00           0.00   
1    0.00  0.0  0.0  ...    0.00  282.60   0.00   861.14       11861.56   
2  306.86  0.0  0.0  ...    0.00  211.37  66.01     0.00           0.00   
3    0.00  0.0  0.0  ...  153.46    0.00   0.00     0.00           0.00   
4    0.00  0.0  0.0  ...    0.00    0.00   0.00     0.00         302.72   

   q2_calculated  q3_calculated  q4_calculated  annual_sales_calculated  \
0    

In [None]:
# Build the rename map: month numbers '1'..'12' become month abbreviations,
# 'qN_calculated' becomes 'qN_sales', and 'annual_sales_calculated' becomes 'annual_sales'
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
column_rename_map = {str(number): name for number, name in enumerate(month_names, start=1)}
column_rename_map.update(
    {f'q{quarter}_calculated': f'q{quarter}_sales' for quarter in range(1, 5)}
)
column_rename_map['annual_sales_calculated'] = 'annual_sales'

# Apply the map; columns that are not listed in it keep their names
current_renamed = current_dropped.rename(columns=column_rename_map)

# Check the renamed dataframe
print(current_renamed.head())

        C1AccountNo        CXRecords  Year      Jan       Feb     Mar  Apr  \
0  A2061733056$=L._  91470FS$KD_> W<  2018     0.00      0.00    0.00  0.0   
1  A2061733057#*N-2  91470GQ%![I! W<  2018  1730.06  10023.32  108.18  0.0   
2  A2061733058)U90^  91470I8%.+ML W<  2018     0.00      0.00    0.00  0.0   
3  A2061733059#5)>2  91470KJ(,V-0 W<  2018     0.00      0.00    0.00  0.0   
4  A2061733060)(W<%  91470M0#>H/T W<  2018     0.00    302.72    0.00  0.0   

      May  Jun  Jul  ...     Sep     Oct    Nov      Dec  q1_sales  q2_sales  \
0    0.00  0.0  0.0  ...  639.00    0.00   0.00  1438.00      0.00      0.00   
1    0.00  0.0  0.0  ...    0.00  282.60   0.00   861.14  11861.56      0.00   
2  306.86  0.0  0.0  ...    0.00  211.37  66.01     0.00      0.00    306.86   
3    0.00  0.0  0.0  ...  153.46    0.00   0.00     0.00      0.00      0.00   
4    0.00  0.0  0.0  ...    0.00    0.00   0.00     0.00    302.72      0.00   

   q3_sales  q4_sales  annual_sales  Comp Terr Gro

In [None]:
# Rename the grouped territory column and move it to sit directly after 'Year'
current_renamed = current_renamed.rename(columns={'Comp Terr Grouped': 'Territory'})

cols = current_renamed.columns.tolist()

# Take 'Territory' out of the list *before* locating 'Year'. Looking up the position
# of 'Year' first and removing 'Territory' afterwards would place it one slot too far
# to the right whenever 'Territory' sits to the left of 'Year'.
cols.remove('Territory')
cols.insert(cols.index('Year') + 1, 'Territory')

current_reordered = current_renamed[cols]

print(current_reordered.head())

        C1AccountNo        CXRecords  Year Territory      Jan       Feb  \
0  A2061733056$=L._  91470FS$KD_> W<  2018      West     0.00      0.00   
1  A2061733057#*N-2  91470GQ%![I! W<  2018   Central  1730.06  10023.32   
2  A2061733058)U90^  91470I8%.+ML W<  2018      West     0.00      0.00   
3  A2061733059#5)>2  91470KJ(,V-0 W<  2018   Central     0.00      0.00   
4  A2061733060)(W<%  91470M0#>H/T W<  2018   Midwest     0.00    302.72   

      Mar  Apr     May  Jun  ...  Aug     Sep     Oct    Nov      Dec  \
0    0.00  0.0    0.00  0.0  ...  0.0  639.00    0.00   0.00  1438.00   
1  108.18  0.0    0.00  0.0  ...  0.0    0.00  282.60   0.00   861.14   
2    0.00  0.0  306.86  0.0  ...  0.0    0.00  211.37  66.01     0.00   
3    0.00  0.0    0.00  0.0  ...  0.0  153.46    0.00   0.00     0.00   
4    0.00  0.0    0.00  0.0  ...  0.0    0.00    0.00   0.00     0.00   

   q1_sales  q2_sales  q3_sales  q4_sales  annual_sales  
0      0.00      0.00    639.00   1438.00       2077

In [None]:
# Sanity check: reordering only changes column order, so both counts should be equal
print(f"Number of columns in current_reordered: {len(current_reordered.columns)}")
print(f"Number of columns in current_renamed: {len(current_renamed.columns)}")

Number of columns in current_reordered: 21
Number of columns in current_renamed: 21


In [None]:
# Export the cleaned dataframe into a csv file (index=False leaves the row index out of the file)
current_reordered.to_csv('cleaned_data_V3.csv', index=False)