In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the dataset
file_path = '../data/case_study_sample_dataset.gzip'
dataset = pd.read_parquet(file_path)

In [5]:
# Convert the dataset to CSV format
csv_file_path = '../data/case_study_sample_dataset.csv'
dataset.to_csv(csv_file_path, index=False)
print(f"Dataset successfully converted to CSV and saved at {csv_file_path}")

Dataset successfully converted to CSV and saved at ../data/case_study_sample_dataset.csv


In [4]:
# Reload the dataset from CSV for analysis
df = pd.read_csv(csv_file_path)

In [6]:
# Print the size of the DataFrame
print(f"The DataFrame has {df.shape[0]} rows and {df.shape[1]} columns.")


The DataFrame has 3246507 rows and 9 columns.


In [7]:
# Display the first few rows to understand the structure
print("First few rows of the dataset:")
df.head()

First few rows of the dataset:


Unnamed: 0,index,test_time,cycle_index,cell_index,voltage,discharge_capacity,current,internal_resistance,temperature
0,834355,1801.8327,0.0,2017-05-12_5_4C-50per_3C_CH13,3.29104,1.6e-05,-0.450999,0.021151,29.973852
1,834356,1811.8329,0.0,2017-05-12_5_4C-50per_3C_CH13,3.281339,0.00157,-0.559847,0.021151,30.012213
2,834357,1821.8363,0.0,2017-05-12_5_4C-50per_3C_CH13,3.277671,0.003125,-0.559859,0.021151,29.995052
3,834358,1831.8373,0.0,2017-05-12_5_4C-50per_3C_CH13,3.275081,0.00468,-0.559828,0.021151,29.981087
4,834359,1841.8468,0.0,2017-05-12_5_4C-50per_3C_CH13,3.273016,0.006237,-0.559821,0.021151,30.000351


In [8]:
# Display basic info about the dataset (e.g., column types, missing values)
print("\nDataset Info:")
df.info()


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3246507 entries, 0 to 3246506
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   index                int64  
 1   test_time            float64
 2   cycle_index          float64
 3   cell_index           object 
 4   voltage              float64
 5   discharge_capacity   float64
 6   current              float64
 7   internal_resistance  float64
 8   temperature          float64
dtypes: float64(7), int64(1), object(1)
memory usage: 222.9+ MB


In [9]:
# Check for missing values in each column
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
index                  0
test_time              0
cycle_index            0
cell_index             0
voltage                0
discharge_capacity     0
current                0
internal_resistance    0
temperature            0
dtype: int64


In [10]:
# Display statistical summary of numerical columns
print("\nStatistical summary of numerical columns:")
df.describe()


Statistical summary of numerical columns:


Unnamed: 0,index,test_time,cycle_index,voltage,discharge_capacity,current,internal_resistance,temperature
count,3246507.0,3246507.0,3246507.0,3246507.0,3246507.0,3246507.0,3246507.0,3246507.0
mean,6489080.0,169632.0,352.8092,2.801509,0.646374,-2.569222,0.01739771,32.90376
std,5301643.0,170452.5,328.1495,0.4690325,0.3380258,2.081012,0.001457385,2.732549
min,834355.0,0.0,0.0,1.98839,0.0,-4.408104,0.0,24.61202
25%,1645982.0,10267.58,0.0,2.413094,0.4980011,-4.400026,0.01659088,30.14927
50%,6891365.0,137040.1,297.0,3.022973,0.5984044,-4.399726,0.01697572,32.69007
75%,12310990.0,277204.0,610.0,3.142839,0.9771983,-0.1100047,0.01731287,35.10753
max,16778970.0,910322.2,1226.0,4.623832,2.884083,-2.384186e-07,0.02156725,41.47976


Numerical Variables


In [11]:
# list of numerical variables
numerical_features = [feature for feature in df.columns if df[feature].dtypes != 'O']

print('Number of numerical variables: ', len(numerical_features))

# visualise the numerical variables
df[numerical_features].head()

Number of numerical variables:  8


Unnamed: 0,index,test_time,cycle_index,voltage,discharge_capacity,current,internal_resistance,temperature
0,834355,1801.8327,0.0,3.29104,1.6e-05,-0.450999,0.021151,29.973852
1,834356,1811.8329,0.0,3.281339,0.00157,-0.559847,0.021151,30.012213
2,834357,1821.8363,0.0,3.277671,0.003125,-0.559859,0.021151,29.995052
3,834358,1831.8373,0.0,3.275081,0.00468,-0.559828,0.021151,29.981087
4,834359,1841.8468,0.0,3.273016,0.006237,-0.559821,0.021151,30.000351


In [12]:
# List of categorical variables
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

# Print the number of categorical variables
print('Number of categorical variables: ', len(categorical_features))
# Visualize the categorical variables
df[categorical_features].head()






Number of categorical variables:  1


Unnamed: 0,cell_index
0,2017-05-12_5_4C-50per_3C_CH13
1,2017-05-12_5_4C-50per_3C_CH13
2,2017-05-12_5_4C-50per_3C_CH13
3,2017-05-12_5_4C-50per_3C_CH13
4,2017-05-12_5_4C-50per_3C_CH13


In [13]:
# Find unique categories in each categorical variable
unique_categories = {feature: df[feature].unique() for feature in categorical_features}

# Print the unique categories for each categorical variable
for feature, categories in unique_categories.items():
    print(f'Unique categories in {feature}: {categories}')

Unique categories in cell_index: ['2017-05-12_5_4C-50per_3C_CH13' '2017-05-12_5_4C-50per_3C_CH14'
 '2017-05-12_5_4C-70per_3C_CH17' '2017-05-12_4C-80per_4C_CH6'
 '2017-05-12_5_4C-40per_3_6C_CH20' '2017-05-12_6C-40per_3_6C_CH33']


In [14]:
print("\nNumerical Features:", numerical_features)
print("Categorical Features:", categorical_features)


Numerical Features: ['index', 'test_time', 'cycle_index', 'voltage', 'discharge_capacity', 'current', 'internal_resistance', 'temperature']
Categorical Features: ['cell_index']
