In [0]:
#import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from category_encoders import TargetEncoder
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


Analysis of Raw Data

In [0]:
# Parquet file to load, given relative to the notebook's working directory
dataset_path = '../Datasets/btp_core_raw.parquet'

# Load the file into a DataFrame using the pyarrow engine
raw_btp_core = pd.read_parquet(path=dataset_path, engine='pyarrow')

# Print the (pandas-truncated) text representation of the frame
print(raw_btp_core)

In [0]:
# Number of distinct CUSTOMER_ID values in the raw data
customer_ids = raw_btp_core['CUSTOMER_ID']
customer_ids.nunique()

In [0]:
# Work on a copy so the raw frame is left untouched
df_copy = raw_btp_core.copy()

# Cast MONTH to text and parse it as YYYYMM into datetimes
month_as_text = df_copy['MONTH'].astype(str)
df_copy['MONTH'] = pd.to_datetime(month_as_text, format='%Y%m')

# Distinct customers observed in each monthly period
month_period = df_copy['MONTH'].dt.to_period('M')
unique_customers_per_month = df_copy.groupby(month_period)['CUSTOMER_ID'].nunique()

# Bar chart of the monthly distinct-customer counts
ax = unique_customers_per_month.plot(kind='bar', figsize=(12, 6))
ax.set_title('Unique Customers Per Month')
ax.set_xlabel('Month')
ax.set_ylabel('Count of Unique Customers')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [0]:
# For every customer, how many distinct MONTH values appear
months_per_customer = df_copy.groupby('CUSTOMER_ID')['MONTH'].nunique()

# How many customers share each month count, ordered by month count
months_distribution = months_per_customer.value_counts().sort_index()

# Bar chart of that distribution
plt.figure(figsize=(12, 6))
ax = months_distribution.plot(kind='bar', color='lightblue')
ax.set_title('Distribution of Number of Months Per Customer')
ax.set_xlabel('Number of Months')
ax.set_ylabel('Number of Customers')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [0]:
# Latest MONTH value present in the data
most_recent_month = df_copy['MONTH'].max()

# Customers that have at least one row in that latest month
in_latest_month = df_copy['MONTH'] == most_recent_month
customers_with_most_recent_month = df_copy.loc[in_latest_month, 'CUSTOMER_ID'].unique()

# Every customer in the data, and the ones absent from the latest month
all_customers = df_copy['CUSTOMER_ID'].unique()
customers_without_most_recent_month = set(all_customers).difference(customers_with_most_recent_month)

# How many customers have no row in the latest month (shown as the cell output)
num_customers_without_most_recent_month = len(customers_without_most_recent_month)

num_customers_without_most_recent_month

In [0]:
# Per-column count of missing entries
null_counts = df_copy.isna().sum()

# Per-column count of entries equal to zero
zero_counts = df_copy.eq(0).sum()

# Report both counts, each under its own heading
print("Null Values in Each Column:", null_counts, sep="\n")
print("\nZero Values in Each Column:", zero_counts, sep="\n")

# Side-by-side table of the two counts
combined_counts = pd.concat([null_counts, zero_counts], axis=1, keys=['Nulls', 'Zeros'])
print("\nCombined Nulls and Zeros:", combined_counts, sep="\n")

In [0]:
# Mean, median and standard deviation of TOTAL_CONSUMPTION
consumption = df_copy['TOTAL_CONSUMPTION']
mean_consumption = consumption.mean()
median_consumption = consumption.median()
std_consumption = consumption.std()

# Print one labelled line per statistic
print("\nBasic Statistics for TOTAL_CONSUMPTION:")
for label, value in (
    ("Mean", mean_consumption),
    ("Median", median_consumption),
    ("Standard Deviation", std_consumption),
):
    print(f"{label}: {value}")

In [0]:
# Histogram of TOTAL_CONSUMPTION using 30 bins
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(df_copy['TOTAL_CONSUMPTION'], bins=30, color='lightblue', edgecolor='black')
ax.set_title('Distribution of TOTAL_CONSUMPTION', fontsize=16)
ax.set_xlabel('TOTAL_CONSUMPTION', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
# Dashed horizontal grid lines only
ax.grid(axis='y', linestyle='--', linewidth=0.7)
plt.tight_layout()
plt.show()


In [0]:
# Horizontal box plot of TOTAL_CONSUMPTION with a filled box and red median line
box_style = dict(facecolor='lightblue', color='black')
median_style = dict(color='red')

fig, ax = plt.subplots(figsize=(12, 6))
ax.boxplot(
    df_copy['TOTAL_CONSUMPTION'],
    vert=False,
    patch_artist=True,  # needed so the box can take a face colour
    boxprops=box_style,
    medianprops=median_style,
)
ax.set_title('Box Plot of TOTAL_CONSUMPTION', fontsize=16)
ax.set_xlabel('TOTAL_CONSUMPTION', fontsize=14)
ax.grid(axis='x', linestyle='--', linewidth=0.7)
plt.tight_layout()
plt.show()

In [0]:
# Violin plot of TOTAL_CONSUMPTION with quartile lines drawn inside the violin.
# seaborn (`sns`) comes from the notebook's import cell at the top, so it is
# not re-imported here.
plt.figure(figsize=(12, 6))
sns.violinplot(x=df_copy['TOTAL_CONSUMPTION'], inner="quartile", color='lightblue')
plt.title('Violin Plot of TOTAL_CONSUMPTION', fontsize=16)
plt.xlabel('TOTAL_CONSUMPTION', fontsize=14)
plt.grid(axis='x', linestyle='--', linewidth=0.7)
plt.tight_layout()
plt.show()


In [0]:
# Filled kernel-density estimate of TOTAL_CONSUMPTION
fig, ax = plt.subplots(figsize=(12, 6))
sns.kdeplot(df_copy['TOTAL_CONSUMPTION'], color='lightblue', fill=True, ax=ax)
ax.set_title('Density Plot of TOTAL_CONSUMPTION', fontsize=16)
ax.set_xlabel('TOTAL_CONSUMPTION', fontsize=14)
ax.grid(axis='x', linestyle='--', linewidth=0.7)
plt.tight_layout()
plt.show()


In [0]:
# Rows that repeat an earlier row exactly (first occurrences are not flagged)
is_duplicate = df_copy.duplicated()
duplicate_rows = df_copy.loc[is_duplicate]

# Report how many such rows exist
print(f"Number of duplicate rows: {len(duplicate_rows)}")

# Show them only when there is something to show
if len(duplicate_rows) > 0:
    print("\nDuplicate Rows:", duplicate_rows, sep="\n")


Analysis of aggregated data with new features

In [0]:
# Parquet file to load, given relative to the notebook's working directory
dataset_path = '../Datasets/df_aggregated_btp_core_with_mada.parquet'

# Load it with the pyarrow engine and print the (truncated) frame
df_aggregated_btp_core_with_mada = pd.read_parquet(path=dataset_path, engine='pyarrow')
print(df_aggregated_btp_core_with_mada)

'0018894795', '0050124806'

In [0]:
# Number of highest-consumption rows to display. The original cell was labelled
# "top 20" but actually took 40 rows; the 40-row behaviour is kept and named here.
TOP_N = 40

# Sort by 'TOTAL_CONSUMPTION_SUM', largest first
sorted_df = df_aggregated_btp_core_with_mada.sort_values(by='TOTAL_CONSUMPTION_SUM', ascending=False)

# Show customer id and consumption for the first TOP_N rows of the sorted frame
top_customers = sorted_df[['CUSTOMER_ID', 'TOTAL_CONSUMPTION_SUM']].head(TOP_N)
print(top_customers)


In [0]:
# Count missing entries column by column
null_per_column = df_aggregated_btp_core_with_mada.isna().sum()

# Show the per-column totals
print(null_per_column)


In [0]:
# Count entries equal to zero column by column
zeros_per_column = df_aggregated_btp_core_with_mada.eq(0).sum()

# Show the per-column totals
print(zeros_per_column)


In [0]:
# Descriptive statistics of the frame; the last expression is rendered as the cell output
summary_statistics = df_aggregated_btp_core_with_mada.describe()
summary_statistics


In [0]:
# Column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df_aggregated_btp_core_with_mada.info()


In [0]:
# Number of distinct values in every column
unique_per_column = df_aggregated_btp_core_with_mada.nunique()
print(unique_per_column)


In [0]:
# One 20-bin histogram per column that DataFrame.hist can plot, on a 15x10 figure
histogram_axes = df_aggregated_btp_core_with_mada.hist(figsize=(15, 10), bins=20)
plt.tight_layout()
plt.show()


In [0]:
# Parquet file to load, given relative to the notebook's working directory
dataset_path = '../Datasets/df_aggregated_month_populated_btp_core.parquet'

# Load it with the pyarrow engine and print the (truncated) frame
df_aggregated_month_populated_btp_core = pd.read_parquet(path=dataset_path, engine='pyarrow')
print(df_aggregated_month_populated_btp_core)

In [0]:
# Correlation matrix of the remaining features.
# Drop the contract flag and the calendar/trend helper columns first.
df_clean = df_aggregated_month_populated_btp_core.drop(columns=['ACTIVE_CONTRACT','YEAR','MONTH','MONTH_SIN','MONTH_COS','TREND'])

# Correlation is only defined for numeric data. Restrict to numeric/bool columns
# explicitly: pandas >= 2.0 no longer drops non-numeric columns silently in
# DataFrame.corr() and raises instead.
df_clean_numeric = df_clean.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(12, 8))
sns.heatmap(df_clean_numeric.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


In [0]:
# Columns removed before encoding and correlating
columns_to_drop = [
    'ACTIVE_CONTRACT',
    'LICENSE_COUNT_SUM',
    'MONTHLY_CONTRACT_NET_VALUE_SUM',
    'LATEST_CONTRACT_MIN',
    'CONTRACT_DURATION_SUM',
    'CONTRACT_DURATION_MEAN',
    'OVERCONSUMPTION_COUNT',
    'BUNDLE_INDICATOR',
    'ORDER_COUNT',
    'INTEGRATION_SUITE',
    'CLOUD_INTEGRATION',
    'TOTAL_CONSUMPTION_LAG_1',
    'TOTAL_CONSUMPTION_LAG_2',
    'TOTAL_CONSUMPTION_LAG_3',
    'TOTAL_CONSUMPTION_ROLLING_3',
    'TOTAL_CONSUMPTION_ROLLING_6',
    'TOTAL_CONSUMPTION_DIFF_1',
    'TOTAL_CONSUMPTION_CUMSUM',
    'TOTAL_CONSUMPTION_EMA_3'
]

# drop() returns a new frame, so the loaded frame itself is not modified
df_mod = df_aggregated_month_populated_btp_core.drop(columns=columns_to_drop)

# One-hot encode 'ISS_TEXT' and 'GLOBAL_REGION' (first level of each dropped)
df_mod = pd.get_dummies(df_mod, columns=['ISS_TEXT', 'GLOBAL_REGION'], drop_first=True)

# Target encode 'COUNTRY' and 'SAP_MASTER_CODE' against TOTAL_CONSUMPTION_SUM.
# The encoder is re-fitted for each column by fit_transform.
# NOTE(review): the encoding is fitted on the same target it is later correlated
# with, which presumably inflates those two correlations - confirm this is acceptable.
target_enc = TargetEncoder()
df_mod['COUNTRY'] = target_enc.fit_transform(df_mod['COUNTRY'], df_mod['TOTAL_CONSUMPTION_SUM'])
df_mod['SAP_MASTER_CODE'] = target_enc.fit_transform(df_mod['SAP_MASTER_CODE'], df_mod['TOTAL_CONSUMPTION_SUM'])

# Correlation is only defined for numeric data. Keep numeric and bool columns
# (the dummies may be bool depending on the pandas version) and leave out any
# remaining non-numeric ones; pandas >= 2.0 raises in corr() instead of
# dropping them silently.
df_mod_numeric = df_mod.select_dtypes(include=['number', 'bool'])

# Correlation matrix with the modified DataFrame
plt.figure(figsize=(14, 10))
sns.heatmap(df_mod_numeric.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Matrix with Encoded Features')
plt.show()


In [0]:
# Categorical columns to profile: frequency table plus distinct-category count
categorical_columns = ['ISS_TEXT', 'GLOBAL_REGION', 'COUNTRY', 'SAP_MASTER_CODE']

for col in categorical_columns:
    column_values = df_aggregated_btp_core_with_mada[col]
    print(f"Value counts for {col}:")
    print(column_values.value_counts())
    print(f"Number of unique categories in {col}: {column_values.nunique()}")
    # Blank lines between columns
    print("\n")


In [0]:
# Customer id together with its consumption value
consumption_by_row = df_aggregated_btp_core_with_mada[['CUSTOMER_ID', 'TOTAL_CONSUMPTION_SUM']]

# The 10 rows with the highest TOTAL_CONSUMPTION_SUM
highest_first = consumption_by_row.sort_values(by='TOTAL_CONSUMPTION_SUM', ascending=False)
print(highest_first.head(10))

# The 10 rows with the lowest TOTAL_CONSUMPTION_SUM
lowest_first = consumption_by_row.sort_values(by='TOTAL_CONSUMPTION_SUM', ascending=True)
print(lowest_first.head(10))


In [0]:
# Boxplot for 'TOTAL_CONSUMPTION_SUM'.
# The series is passed explicitly as `x=`: the meaning of seaborn's first
# positional argument differs between releases (`x` vs `data`), so a bare
# positional series can change the plot's orientation depending on the
# installed version. This also matches the violin plot cell, which uses `x=`.
plt.figure(figsize=(8, 6))
sns.boxplot(x=df_aggregated_btp_core_with_mada['TOTAL_CONSUMPTION_SUM'])
plt.title('Boxplot of TOTAL_CONSUMPTION_SUM')
plt.show()


In [0]:
# Sum TOTAL_CONSUMPTION_SUM over all rows sharing the same DATE
monthly_consumption = df_aggregated_btp_core_with_mada.groupby('DATE')['TOTAL_CONSUMPTION_SUM'].sum()

# Line chart of the summed consumption, one marker per DATE
fig, ax = plt.subplots(figsize=(10, 6))
monthly_consumption.plot(kind='line', marker='o', ax=ax)
ax.set_title('Total Consumption Over Time')
ax.set_xlabel('Month')
ax.set_ylabel('Total Consumption (EUR)')
ax.grid(True)
plt.show()


In [0]:
# Step 1: Select only the necessary columns (copy so the source frame is untouched)
df_real_data = df_aggregated_btp_core_with_mada[['DATE', 'TOTAL_CONSUMPTION_SUM']].copy()

# Step 2: Parse 'DATE' with the YYYYMM format; entries that fail to parse become NaT
df_real_data['DATE'] = pd.to_datetime(df_real_data['DATE'], format='%Y%m', errors='coerce')

# Step 3: Remove rows with invalid 'DATE' values (no in-place mutation)
df_real_data = df_real_data.dropna(subset=['DATE'])

# Step 4: Aggregate to one total per DATE.
# Plotting the un-aggregated rows draws one bar per row at the same x position;
# the bars overlap, so the visible height is the largest single row rather than
# the monthly total that the title promises.
monthly_totals = df_real_data.groupby('DATE')['TOTAL_CONSUMPTION_SUM'].sum()

# Step 5: Average of the monthly totals, so the reference line is on the same
# scale as the bars
average_consumption_real = monthly_totals.mean()

# Step 6: Plot
plt.figure(figsize=(12, 6))

# One bar per month; with a datetime x-axis the width is expressed in days
plt.bar(monthly_totals.index, monthly_totals.values, color='lightblue', label='Total Consumption', width=10)

# Average line plot
plt.axhline(y=average_consumption_real, color='orange', linestyle='--', label=f'Average ({average_consumption_real:.2f})')

# Labels and title
plt.xlabel('Date')
plt.ylabel('Total Consumption (EUR)')
plt.title('Bar Plot of Total Consumption per Month with Average Line')

# Format x-axis labels to show only year and month (YYYY-MM)
plt.gca().xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y-%m'))

# Only place a major tick every 6 months to avoid crowding on the x-axis
plt.gca().xaxis.set_major_locator(plt.matplotlib.dates.MonthLocator(interval=6))

plt.xticks(rotation=45)

# Show legend
plt.legend()

# Show plot
plt.tight_layout()
plt.show()


Decompose

In [0]:
# Ensure that 'DATE' is in datetime format (safe to re-run: converting an
# already-converted column is a no-op)
df_aggregated_btp_core_with_mada['DATE'] = pd.to_datetime(df_aggregated_btp_core_with_mada['DATE'])

# Build a DATE-indexed consumption series WITHOUT mutating the shared frame.
# An in-place set_index would remove the 'DATE' column, so re-running this cell
# (or any earlier cell that selects the 'DATE' column) would fail with a KeyError.
consumption_by_date = df_aggregated_btp_core_with_mada.set_index('DATE')['TOTAL_CONSUMPTION_SUM']

# Aggregate by calendar month, summing across all rows in the month.
# NOTE(review): pandas >= 2.2 deprecates the 'M' alias in favour of 'ME';
# switch once older pandas versions no longer need to be supported.
monthly_consumption = consumption_by_date.resample('M').sum()

# Perform seasonal decomposition with an additive model
decomposition = seasonal_decompose(monthly_consumption, model='additive')

# Plot the decomposed components
decomposition.plot()
plt.show()


In [0]:
# 12-observation rolling window over the monthly series
rolling_window = monthly_consumption.rolling(window=12)
rolling_mean = rolling_window.mean()
rolling_std = rolling_window.std()

# Overlay the original series with its rolling mean and rolling standard deviation
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(monthly_consumption, label='Original')
ax.plot(rolling_mean, color='red', label='Rolling Mean (Trend)')
ax.plot(rolling_std, color='black', label='Rolling Std (Noise)')
ax.legend(loc='best')
ax.set_title('Rolling Mean & Standard Deviation')
plt.show()


In [0]:
# Autocorrelation plot of the series (missing values removed) for lags up to 30
acf_input = monthly_consumption.dropna()
plot_acf(acf_input, lags=30)
plt.show()


In [0]:

# Partial autocorrelation plot of the series (missing values removed) for lags up to 14
pacf_input = monthly_consumption.dropna()
plot_pacf(pacf_input, lags=14)
plt.show()

In [0]:
# For each DATE, count the rows whose TOTAL_CONSUMPTION_SUM equals zero
zero_counts = (
    df_aggregated_btp_core_with_mada
    .groupby('DATE')['TOTAL_CONSUMPTION_SUM']
    .apply(lambda values: values.eq(0).sum())
)

# Two-column frame (DATE, ZERO_COUNT) for plotting
zero_counts_df = zero_counts.rename('ZERO_COUNT').reset_index()
zero_counts_df.columns = ['DATE', 'ZERO_COUNT']

# Bar chart of the zero counts per DATE
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(zero_counts_df['DATE'], zero_counts_df['ZERO_COUNT'], color='lightblue', width=10)  # bar width is set here
ax.set_title('Count of Zeros in TOTAL_CONSUMPTION_SUM per Month')
ax.set_xlabel('Month')
ax.set_ylabel('Count of Zeros')
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [0]:
# For each customer, count the rows whose TOTAL_CONSUMPTION_SUM equals zero
zero_months_count = (
    df_aggregated_btp_core_with_mada
    .groupby('CUSTOMER_ID')['TOTAL_CONSUMPTION_SUM']
    .apply(lambda values: values.eq(0).sum())
)

# Two-column frame (CUSTOMER_ID, ZERO_MONTHS)
zero_months_count_df = zero_months_count.reset_index()
zero_months_count_df.columns = ['CUSTOMER_ID', 'ZERO_MONTHS']

# Number of customers for every distinct zero-month count
customer_zero_months_distribution = (
    zero_months_count_df['ZERO_MONTHS']
    .value_counts()
    .rename_axis('ZERO_MONTHS')
    .reset_index(name='CUSTOMER_COUNT')
)

# Bar chart of that distribution
zero_months = customer_zero_months_distribution['ZERO_MONTHS']
customer_counts = customer_zero_months_distribution['CUSTOMER_COUNT']

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(zero_months, customer_counts, color='lightcoral', width=0.4)
ax.set_title('Distribution of Customers by Number of Months with Zero TOTAL_CONSUMPTION_SUM')
ax.set_xlabel('Number of Months with Zero Consumption')
ax.set_ylabel('Number of Customers')
plt.xticks(zero_months)  # one tick per distinct zero-month count
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [0]:
# Parquet file to load, given relative to the notebook's working directory
dataset_path = '../Datasets/df_aggregated_month_populated_btp_core.parquet'

# Load it with the pyarrow engine and print the (truncated) frame
df_aggregated_month_populated_btp_core = pd.read_parquet(path=dataset_path, engine='pyarrow')
print(df_aggregated_month_populated_btp_core)

In [0]:
# For each DATE, count the rows whose TOTAL_CONSUMPTION_SUM equals zero
zero_counts = (
    df_aggregated_month_populated_btp_core
    .groupby('DATE')['TOTAL_CONSUMPTION_SUM']
    .apply(lambda values: values.eq(0).sum())
)

# Two-column frame (DATE, ZERO_COUNT) for plotting
zero_counts_df = zero_counts.rename('ZERO_COUNT').reset_index()
zero_counts_df.columns = ['DATE', 'ZERO_COUNT']

# Bar chart of the zero counts per DATE
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(zero_counts_df['DATE'], zero_counts_df['ZERO_COUNT'], color='lightblue', width=10)  # bar width is set here
ax.set_title('Count of Zeros in TOTAL_CONSUMPTION_SUM per Month')
ax.set_xlabel('Month')
ax.set_ylabel('Count of Zeros')
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


In [0]:
# For each customer, count the rows whose TOTAL_CONSUMPTION_SUM equals zero
zero_months_count = (
    df_aggregated_month_populated_btp_core
    .groupby('CUSTOMER_ID')['TOTAL_CONSUMPTION_SUM']
    .apply(lambda values: values.eq(0).sum())
)

# Two-column frame (CUSTOMER_ID, ZERO_MONTHS)
zero_months_count_df = zero_months_count.reset_index()
zero_months_count_df.columns = ['CUSTOMER_ID', 'ZERO_MONTHS']

# Number of customers for every distinct zero-month count
customer_zero_months_distribution = (
    zero_months_count_df['ZERO_MONTHS']
    .value_counts()
    .rename_axis('ZERO_MONTHS')
    .reset_index(name='CUSTOMER_COUNT')
)

# Bar chart of that distribution
zero_months = customer_zero_months_distribution['ZERO_MONTHS']
customer_counts = customer_zero_months_distribution['CUSTOMER_COUNT']

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(zero_months, customer_counts, color='lightcoral', width=0.4)
ax.set_title('Distribution of Customers by Number of Months with Zero TOTAL_CONSUMPTION_SUM')
ax.set_xlabel('Number of Months with Zero Consumption')
ax.set_ylabel('Number of Customers')
plt.xticks(zero_months)  # one tick per distinct zero-month count
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
