In [325]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt

In [326]:
def readFile(name):
    """Load a JSON file into a DataFrame and print a preview of it.

    Parameters
    ----------
    name : path of the JSON file; it is decoded as ISO-8859-1.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_json(name, encoding='ISO-8859-1')
    # Echo the first rows so the caller can eyeball what was loaded.
    preview = frame.head()
    print(preview)
    return frame

In [327]:
# Load the raw industry extract; readFile also prints a preview of the first rows.
industry = readFile('../Data/Maestria_Indsty_Dummy_S.json')

# Report how many rows were loaded.
num_records = len(industry)
print("Number of records:", num_records)

       PR        BRND   INDST    PMA    PMA_R    MS
0  202401      NISSAN  108293  19970  1 DE 44  0.18
1  202401   CHEVROLET  108293  14426  2 DE 44  0.13
2  202401  VOLKSWAGEN  108293  10264  3 DE 44  0.09
3  202401      TOYOTA  108293   9264  4 DE 44  0.09
4  202401         KIA  108293   8204  5 DE 44  0.08
Number of records: 126


In [328]:
# Expand the abbreviated source headers into readable names.
# 'PMA', 'PMA_R' and 'MS' keep their names, so they need no entry in the mapping.
industry.rename(
    columns={'PR': 'Period', 'BRND': 'Brand', 'INDST': 'Industry'},
    inplace=True,
)
industry.head()

Unnamed: 0,Period,Brand,Industry,PMA,PMA_R,MS
0,202401,NISSAN,108293,19970,1 DE 44,0.18
1,202401,CHEVROLET,108293,14426,2 DE 44,0.13
2,202401,VOLKSWAGEN,108293,10264,3 DE 44,0.09
3,202401,TOYOTA,108293,9264,4 DE 44,0.09
4,202401,KIA,108293,8204,5 DE 44,0.08


In [329]:
# Split the numeric "Period" column (YYYYMM, e.g. 202401) into a numeric "Year"
# and an English month name in "Month", then remove "Period".

# Map numerical month values to month names
month_map = {
    1: 'January',
    2: 'February',
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
    11: 'November',
    12: 'December'
}

# Guarded so the cell can be re-run safely: once "Period" has been consumed,
# executing the cell again would otherwise raise KeyError on industry['Period'].
if 'Period' in industry.columns:
    # pop() removes the column from the frame and hands back its values.
    period_yyyymm = industry.pop('Period')

    # Insert the derived columns at positions 2 and 3 (zero-based).
    industry.insert(2, 'Year', period_yyyymm // 100)                    # 202401 -> 2024
    industry.insert(3, 'Month', (period_yyyymm % 100).map(month_map))   # 202401 -> 1 -> 'January'

# Display the DataFrame with the new "Year" and "Month" columns
industry.head()

Unnamed: 0,Brand,Industry,Year,Month,PMA,PMA_R,MS
0,NISSAN,108293,2024,January,19970,1 DE 44,0.18
1,CHEVROLET,108293,2024,January,14426,2 DE 44,0.13
2,VOLKSWAGEN,108293,2024,January,10264,3 DE 44,0.09
3,TOYOTA,108293,2024,January,9264,4 DE 44,0.09
4,KIA,108293,2024,January,8204,5 DE 44,0.08


In [330]:
# Roll the monthly rows up to one row per brand, year and quarter.

# Lookup from month name to quarter label, built quarter by quarter.
quarter_map = {
    month_name: quarter_label
    for quarter_label, month_names in (
        ('Q1', ('January', 'February', 'March')),
        ('Q2', ('April', 'May', 'June')),
        ('Q3', ('July', 'August', 'September')),
        ('Q4', ('October', 'November', 'December')),
    )
    for month_name in month_names
}

# assign() returns a new frame, so `industry` keeps its monthly rows. Every row
# is tagged with Time='QTR' and "Month" is overwritten with its quarter label
# before PMA and MS are summed per (Brand, Year, Month, Time).
# NOTE(review): "MS" is summed across the months of a quarter; if MS is a
# per-month share (the preview shows values such as 0.18), the sum is not itself
# a share. A later cell rescales it per quarter -- confirm this is intended.
grouped_indsutry_quarterly = (
    industry
    .assign(
        Time='QTR',
        Month=industry['Month'].map(quarter_map),
    )
    .groupby(['Brand', 'Year', 'Month', 'Time'])[['PMA', 'MS']]
    .sum()
    .reset_index()
)

grouped_indsutry_quarterly

Unnamed: 0,Brand,Year,Month,Time,PMA,MS
0,ACURA,2024,Q1,QTR,0,0.0
1,ALFA ROMEO,2024,Q1,QTR,0,0.0
2,AUDI,2024,Q1,QTR,0,0.0
3,BAIC,2024,Q1,QTR,2712,0.03
4,BMW,2024,Q1,QTR,0,0.0
5,BUICK,2024,Q1,QTR,769,0.0
6,CADILLAC,2024,Q1,QTR,0,0.0
7,CHEVROLET,2024,Q1,QTR,46032,0.41
8,CHIREY,2024,Q1,QTR,5457,0.05
9,CUPRA,2024,Q1,QTR,1616,0.01


In [331]:
# Sum of the "MS" column per quarter label, as a Series named 'Total Sales'.
# NOTE(review): the grouping key is "Month" (the quarter label) only, so rows
# sharing a label are pooled regardless of their "Year".
total_sales_by_quarter = (
    grouped_indsutry_quarterly
    .groupby('Month')['MS']
    .sum()
    .rename('Total Sales')
)

In [332]:
# Attach each row's quarter total and express the row's MS as a percentage of it.
# The total is computed per (Year, Month) pair -- "Month" holds the quarter label
# in this frame -- so the same quarter label in different years is not pooled
# into a single denominator, which is what joining the quarter-only
# `total_sales_by_quarter` series on 'Month' would do.
df = grouped_indsutry_quarterly.copy()
df['Total Sales'] = df.groupby(['Year', 'Month'])['MS'].transform('sum')
df['%_SALES_PER_Q'] = (df['MS'] / df['Total Sales']) * 100

In [333]:
# Columns that only serve to derive %_SALES_PER_Q and are left out of the export.
not_in_final = ['Total Sales', 'MS']
# errors='ignore' mirrors a plain "keep everything not listed" filter: a listed
# column that is absent is simply skipped.
final_df = df.drop(columns=not_in_final, errors='ignore')

In [334]:
# Spot check: show only the quarterly rows whose Brand is exactly 'NISSAN'.
filter_industry = grouped_indsutry_quarterly.loc[
    grouped_indsutry_quarterly['Brand'].eq('NISSAN')
]
filter_industry

Unnamed: 0,Brand,Year,Month,Time,PMA,MS
28,NISSAN,2024,Q1,QTR,61829,0.55


In [335]:
# Build the export payload: a "documentation" section describing the columns,
# followed by the quarterly rows of final_df, and write it to disk as JSON.
#
# NOTE(review): the "context" string talks about Ford of Mexico sales at vehicle
# line level, while the rows exported here are per brand -- confirm the wording.
# NOTE(review): "Month" is described as a month, but the rows written here carry
# quarter labels (Q1-Q4) in that column -- confirm whether the definition should
# say so.
documentation_template = {
    "documentation": {
        "context": "This file contains the sales information of Ford of Mexico vehicle sales for 2024 at vehicle line level.",
        "terms": [
            {"name": "Brand", "definition": "Brand of the vehicle"},
            {"name": "Year", "definition": "Year of the sales data"},
            {"name": "Month", "definition": "Month of the sales data"},
            # The value stored in the "Time" column is 'QTR', so the definition
            # must name that code (it previously said 'QTY').
            {"name": "Time", "definition": "Time period of the sales data, MTD is Month to date and QTR means Quarter"},
            {"name": "PMA", "definition": "Total of vehicles available for sale"},
            {"name": "%_SALES_PER_Q", "definition": "Percentage of sales for each quarter"}
        ]
    }
}

# Convert DataFrame to list of dictionaries (one dict per row)
industry_sales_data = final_df.to_dict(orient='records')

# Combine documentation and industry data under a single top-level object
json_data = {
    **documentation_template,
    "industry": industry_sales_data
}

# Write JSON data to file
file_location = '../json/llm_train_industry_data.json'
with open(file_location, 'w') as json_file:
    json.dump(json_data, json_file, indent=4)