In [None]:
!pip install lazypredict
!pip install jenkspy
!pip install pandas matplotlib  numpy scikit-learn tensorflow keras
!pip install ipywidgets
!pip install gradio
!nvcc --version
!nvidia-smi

In [None]:
import os

import gradio as gr
import jenkspy
import joblib
import lazypredict.Supervised
import pandas as pd
import tqdm
from lazypredict.Supervised import LazyClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils import class_weight


In [None]:
# Directory holding the input CSV files; file names are joined onto it when loading.
base_path = "/kaggle/input/levin-vehicle-telematics"

In [None]:
# Show what is available in the input directory. os.listdir returns entries in
# arbitrary order, so sort them to keep the cell output stable between runs.
sorted(os.listdir(base_path))

In [None]:
# Resolve the location of both CSV files under the dataset directory.
file_path = os.path.join(base_path, 'allcars.csv')
file_v2 = os.path.join(base_path, 'v2.csv')

# Read them in the same order as before (allcars first, then v2).
# low_memory=False makes pandas read each file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
df_allcars, df_v2 = (
    pd.read_csv(csv_file, low_memory=False) for csv_file in (file_path, file_v2)
)

def convert_numeric_columns(df, exclude=()):
    """Coerce every column of ``df`` to a numeric dtype, except those in ``exclude``.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).
    The frame is modified in place and is also returned, so both
    ``convert_numeric_columns(df)`` and ``df = convert_numeric_columns(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted.
    exclude : iterable of column labels, str or None, optional
        Columns to leave untouched. A single string is treated as one column
        name (not as a sequence of characters); ``None`` means "exclude nothing".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Normalise `exclude` to a set: an immutable default avoids the shared
    # mutable-default pitfall, and exact membership avoids substring matching
    # when a bare string is passed.
    if exclude is None:
        skipped = set()
    elif isinstance(exclude, str):
        skipped = {exclude}
    else:
        skipped = set(exclude)

    for col in df.columns:
        if col in skipped:
            continue
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df

# Apply the numeric coercion to both frames, leaving the identifier, accData
# and timeStamp columns as loaded.
df_allcars, df_v2 = (
    convert_numeric_columns(frame, exclude=['tripID', 'deviceID', 'accData', 'timeStamp'])
    for frame in (df_allcars, df_v2)
)

In [None]:
# Column name -> Python type. The first three columns map to str, every other
# listed column to float. Not referenced by the other visible cells;
# presumably meant for `dtype=` when reading the CSVs - TODO confirm.
dtype_dict = {
    **dict.fromkeys(('tripID', 'deviceID', 'accData'), str),
    **dict.fromkeys(
        (
            'gps_speed', 'battery', 'cTemp', 'dtc', 'eLoad', 'iat', 'imap',
            'kpl', 'maf', 'rpm', 'speed', 'tAdv', 'tPos',
        ),
        float,
    ),
}

In [None]:
# Coerce the columns of df_allcars to numeric (unparseable values -> NaN).
# tripID, deviceID, accData and timeStamp are skipped: sending them through
# pd.to_numeric(errors='coerce') would replace their text with NaN, leaving
# nothing for the timestamp parsing and the accData inspection further down.
for col in df_allcars.columns:
    if col not in ('tripID', 'deviceID', 'accData', 'timeStamp'):
        df_allcars[col] = pd.to_numeric(df_allcars[col], errors='coerce')

In [None]:
# Peek at the raw timestamp values and their dtype before parsing them.
print(df_allcars['timeStamp'].head(5), df_allcars['timeStamp'].dtype, sep='\n')

In [None]:
# Parse the timestamp column into datetimes. Values that do not match the
# "YYYY-MM-DD HH:MM:SS.ffffff" layout become NaT (errors='coerce') instead of raising.
df_allcars['timeStamp'] = pd.to_datetime(
    df_allcars['timeStamp'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce'
)
# Report the period covered by the data (the check-mark was mojibake before).
print("✅ Time range in df_allcars:", df_allcars['timeStamp'].min(), "to", df_allcars['timeStamp'].max())

In [None]:
# Re-apply the numeric coercion to df_allcars (unparseable values -> NaN).
# tripID, deviceID, accData and timeStamp are skipped: timeStamp has just been
# parsed to datetime and must not be pushed back through pd.to_numeric, and
# the text columns would otherwise be wiped to NaN.
for col in df_allcars.columns:
    if col not in ('tripID', 'deviceID', 'accData', 'timeStamp'):
        df_allcars[col] = pd.to_numeric(df_allcars[col], errors='coerce')

In [None]:
def inspect_dataframe(df, name):
    """Print a short summary of ``df`` and render its first rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    name : str
        Label shown in the heading; its length also sets the width of the
        ``=`` underline printed below the heading.

    Returns
    -------
    None
        All output goes through ``print`` and ``display``.
    """
    print(f"\n📄 Inspecting `{name}`")
    print("=" * (13 + len(name)))
    print(f"🔹 Shape: {df.shape}")
    # Column labels are not guaranteed to be strings (e.g. integer labels);
    # str.join raises TypeError on those, so stringify each label first.
    print("🔹 Columns:", ", ".join(map(str, df.columns)))
    print("🔹 Preview:")
    # `display` is supplied by the IPython/Jupyter kernel and renders the rich table.
    display(df.head())

# Summarise both loaded frames, labelled with their source file names.
for frame, label in ((df_allcars, "allcars.csv"), (df_v2, "v2.csv")):
    inspect_dataframe(frame, label)

In [None]:
# First five rows of df_v2, rendered as the cell's output.
df_v2.head(n=5)

In [None]:
# Structural overview of both frames (.info() prints directly).
df_allcars.info()
df_v2.info()
# Per-column missing-value counts. Only the last expression of a cell is echoed
# automatically, so these go through display() to actually be shown.
display(df_allcars.isnull().sum())
display(df_v2.isnull().sum())


# Columns expected to hold numbers (same set as in df_allcars)
numeric_cols = [
    'gps_speed', 'battery', 'cTemp', 'dtc', 'eLoad', 'iat', 'imap',
    'kpl', 'maf', 'rpm', 'speed', 'tAdv', 'tPos',
]

# Coerce all of them in one pass; entries that are not valid numbers become NaN
df_v2[numeric_cols] = df_v2[numeric_cols].apply(pd.to_numeric, errors='coerce')

# The two identifier columns are coerced the same way
for id_col in ('tripID', 'deviceID'):
    df_v2[id_col] = pd.to_numeric(df_v2[id_col], errors='coerce')

In [None]:
# Per-column count of missing values, one report per frame.
print("Missing values in df_allcars:", df_allcars.isna().sum(), sep="\n")

print("\nMissing values in df_v2:", df_v2.isna().sum(), sep="\n")

In [None]:
# Count fully duplicated rows (every occurrence after the first one).
num_duplicated_records = df_allcars.duplicated(keep='first').sum()
print(f"Number of duplicated records is: {num_duplicated_records}")

# Remove them in place, keeping the first occurrence of each row.
df_allcars.drop_duplicates(keep='first', inplace=True)

# Recount to confirm that the frame no longer contains duplicates.
num_duplicated_records_after = df_allcars.duplicated(keep='first').sum()
print(f"Number of duplicated records after deletion is: {num_duplicated_records_after}")

In [None]:
# Summary statistics and dtype of the accData column.
print(df_allcars['accData'].describe(), df_allcars['accData'].dtype, sep='\n')