In [None]:
!pip uninstall -y tensorflow tensorflow-gpu
!pip install tensorflow==2.15.0
!pip install tensorflow-lite-support

Found existing installation: tensorflow 2.15.0
Uninstalling tensorflow-2.15.0:
  Successfully uninstalled tensorflow-2.15.0
[0mCollecting tensorflow==2.15.0
  Using cached tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Using cached tensorflow-2.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (475.2 MB)
Installing collected packages: tensorflow
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tf-keras 2.17.0 requires tensorflow<2.18,>=2.17, but you have tensorflow 2.15.0 which is incompatible.[0m[31m
[0mSuccessfully installed tensorflow-2.15.0
[31mERROR: Could not find a version that satisfies the requirement tensorflow-lite-support (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow-lite-support[0m[31m
[0m

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import tensorflow as tf
from typing import Dict, Union

def create_tf_model(input_dim):
    """Build and compile the dense regression network used for price prediction.

    Architecture (in order): Dense(128, relu) -> Dropout(0.3) -> Dense(64, relu)
    -> Dropout(0.2) -> Dense(32, relu) -> Dense(1) with no activation.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A ``tf.keras.Sequential`` model compiled with Adam (learning rate 0.001),
        mean-squared-error loss and mean-absolute-error as a tracked metric.
    """
    layers = tf.keras.layers

    network = tf.keras.Sequential()
    # The first layer carries the input width so the model is built immediately.
    network.add(layers.Dense(128, activation='relu', input_dim=input_dim))
    network.add(layers.Dropout(0.3))
    network.add(layers.Dense(64, activation='relu'))
    network.add(layers.Dropout(0.2))
    network.add(layers.Dense(32, activation='relu'))
    # Single linear unit: the regression output.
    network.add(layers.Dense(1))

    adam = tf.keras.optimizers.Adam(learning_rate=0.001)
    network.compile(optimizer=adam, loss='mse', metrics=['mae'])

    return network

def load_and_clean_data(df):
    """Clean the raw listings frame and derive ``engine_size`` and ``province``.

    The input is copied, never mutated. Processing steps:

    1. Drop rows in which every column is NaN.
    2. Parse ``price``: text such as ``'Rp 15.000.000'`` has the ``Rp`` prefix and
       the ``.``/``,`` separators removed and is converted to a number; rows whose
       price cannot be parsed are dropped.
    3. Parse ``mileage``: text such as ``'10.000-15.000 km'`` becomes the midpoint
       of the range, a single value becomes that value; unparseable entries are
       replaced by the column median.
    4. Derive ``engine_size`` from the first of ``160``/``150``/``125`` found in the
       ``model`` text, defaulting to 110.
    5. Derive ``province`` by case-insensitive substring match of ``location``
       against a fixed city list, defaulting to ``'Others'``.

    Values that are already numeric are kept as they are. Previously they were
    stringified and had their decimal point stripped as if it were a thousands
    separator, which turned e.g. ``12000.0`` into ``120000``.

    Args:
        df: DataFrame with at least ``price``, ``mileage``, ``model`` and
            ``location`` columns.

    Returns:
        A new, cleaned DataFrame with ``engine_size`` and ``province`` added.
    """
    df = df.copy()

    # Drop rows with all NaN values
    df = df.dropna(how='all')
    print(f"Shape after dropping all-NaN rows: {df.shape}")

    def is_number(value):
        # bool is a subclass of int but is never a meaningful price/mileage.
        return (isinstance(value, (int, float, np.integer, np.floating))
                and not isinstance(value, bool))

    def strip_separators(text, unit):
        # '.' and ',' are treated as thousands separators in the textual form.
        return text.replace(unit, '').replace(',', '').replace('.', '').strip()

    def clean_price(price):
        if is_number(price):
            return price
        return strip_separators(str(price), 'Rp')

    # Clean price and drop rows with NaN prices
    df['price'] = df['price'].apply(clean_price)
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df = df.dropna(subset=['price'])
    print(f"Shape after cleaning prices: {df.shape}")

    # Clean mileage
    def clean_mileage(mileage):
        if is_number(mileage):
            # NaN and negative readings are treated as missing.
            return float(mileage) if mileage >= 0 else np.nan
        try:
            text = str(mileage)
            if '-' in text:
                # A range such as "10.000-15.000 km" is represented by its midpoint.
                start, end = (float(strip_separators(part, 'km'))
                              for part in text.split('-'))
                return (start + end) / 2
            return float(strip_separators(text, 'km'))
        except (ValueError, TypeError):
            # Unparseable text, or a "range" that does not have exactly two parts.
            return np.nan

    df['mileage'] = df['mileage'].apply(clean_mileage)
    df['mileage'] = df['mileage'].fillna(df['mileage'].median())
    print(f"Shape after cleaning mileage: {df.shape}")

    # Extract engine size
    def extract_engine_size(model):
        model = str(model)
        if '160' in model:
            return 160
        elif '150' in model:
            return 150
        elif '125' in model:
            return 125
        else:
            return 110

    df['engine_size'] = df['model'].apply(extract_engine_size)
    print(f"Shape after extracting engine size: {df.shape}")

    # Map locations to provinces
    province_mapping = {
        'Jakarta': ['Jakarta', 'Jakarta Timur', 'Jakarta Barat', 'Jakarta Selatan', 'Jakarta Utara', 'Jakarta Pusat'],
        'Jawa Barat': ['Bandung', 'Depok', 'Bekasi', 'Bogor', 'Cimahi', 'Cianjur', 'Ciamis', 'Garut', 'Sukabumi', 'Karawang'],
        'Banten': ['Tangerang', 'Tangerang Selatan', 'Serang', 'Cilegon'],
        'Jawa Tengah': ['Semarang', 'Magelang', 'Klaten', 'Pemalang'],
        'Yogyakarta': ['Yogyakarta', 'Sleman', 'Bantul'],
        'Jawa Timur': ['Surabaya', 'Malang', 'Sidoarjo', 'Gresik', 'Kediri'],
        'Bali': ['Denpasar', 'Badung', 'Buleleng']
    }

    def map_location_to_province(location):
        # Substring match; the first province (in dict order) with a matching city wins.
        location = str(location).lower()
        for province, cities in province_mapping.items():
            if any(city.lower() in location for city in cities):
                return province
        return 'Others'

    df['province'] = df['location'].apply(map_location_to_province)
    print(f"Final shape after cleaning: {df.shape}")

    print("\nChecking for remaining NaN values:")
    print(df.isna().sum())

    return df

def engineer_advanced_features(df, current_year=2024):
    """Add derived numeric, categorical and flag columns to a cleaned listings frame.

    The input is copied, never mutated. Columns added:

    * ``age``, ``age_squared``, ``mileage_squared``, ``mileage_per_age``,
      ``normalized_mileage`` (computed with the same formula as
      ``mileage_per_age``), ``engine_age_interaction``, ``depreciation_factor``
    * ``price_per_cc``, ``price_segment``, ``location_price_ratio``
    * ``age_category``
    * ``is_abs``, ``is_cbs``, ``is_premium``

    NOTE: ``price_per_cc``, ``price_segment`` and ``location_price_ratio`` are
    computed from ``price`` itself. They are fine for exploration but leak the
    target if they are used as inputs to a model that predicts ``price``.

    Args:
        df: DataFrame containing ``year``, ``mileage``, ``price``, ``engine_size``,
            ``model`` and ``province``.
        current_year: Reference year used to turn ``year`` into ``age``. Defaults
            to 2024, the value that used to be hard-coded.

    Returns:
        A new DataFrame with the columns listed above appended.
    """
    df = df.copy()

    def quantile_bins(values, labels):
        # With duplicates='drop', pandas may produce fewer bins than requested
        # (typical for integer ages with many ties) and then rejects a label list
        # of the original length. Look at the surviving edges first and keep only
        # as many labels, in order, as there are bins.
        _, edges = pd.qcut(values, q=len(labels), retbins=True, duplicates='drop')
        usable_labels = labels[:len(edges) - 1]
        return pd.qcut(values, q=len(labels), labels=usable_labels, duplicates='drop')

    # Age relative to the reference year
    df['age'] = current_year - df['year']

    # Advanced numerical features
    df['age_squared'] = df['age'] ** 2
    df['mileage_squared'] = df['mileage'] ** 2
    df['price_per_cc'] = df['price'] / df['engine_size']
    # +1 keeps the denominator non-zero for vehicles from the reference year.
    df['mileage_per_age'] = df['mileage'] / (df['age'] + 1)
    df['engine_age_interaction'] = df['engine_size'] * np.exp(-df['age']/3)
    df['normalized_mileage'] = df['mileage'] / (df['age'] + 1)
    df['depreciation_factor'] = np.exp(-df['age']/5)

    # Categorical features
    df['price_segment'] = quantile_bins(
        df['price'], ['very_low', 'low', 'medium', 'high', 'very_high'])

    df['age_category'] = quantile_bins(
        df['age'], ['new', 'medium_new', 'medium_old', 'old'])

    # Market segment features
    df['is_abs'] = df['model'].str.contains('ABS', case=False, na=False).astype(int)
    df['is_cbs'] = df['model'].str.contains('CBS|ISS', case=False, na=False).astype(int)
    df['is_premium'] = ((df['engine_size'] >= 150) |
                       (df['model'].str.contains('ABS|CBS', case=False, na=False))).astype(int)

    # Location based features
    location_mean_price = df.groupby('province')['price'].transform('mean')
    df['location_price_ratio'] = df['price'] / location_mean_price

    print("\nFeature Engineering Stats:")
    print(f"Shape after feature engineering: {df.shape}")

    return df

def train_optimized_model(df):
    """Train a RandomForest / XGBoost / GradientBoosting / Keras ensemble on ``price``.

    The frame is passed through ``engineer_advanced_features``, a feature matrix
    is assembled, and the data is split 80/20 with a fixed seed. Three tree
    models are fitted on the raw features and a Keras network on standardised
    features. Their test predictions are blended with fixed weights.

    Two sources of leakage are deliberately avoided:

    * ``price_segment``, ``price_per_cc`` and ``location_price_ratio`` are
      derived from ``price`` (the target) and are therefore not used as inputs;
      they would be unavailable when predicting the price of a new listing.
    * The ``StandardScaler`` is fitted on the training split only, so the test
      rows do not influence the scaling statistics.

    Args:
        df: Cleaned DataFrame as produced by ``load_and_clean_data``.

    Returns:
        ``(model_artifacts, final_score)`` where ``model_artifacts`` is a dict
        with keys ``models``, ``tf_model``, ``scaler``, ``weights``,
        ``feature_columns``, ``categorical_columns`` and ``numerical_columns``,
        and ``final_score`` is the R² of the weighted ensemble on the test split.

    Raises:
        Exception: Any error raised during training is printed and re-raised.
    """
    print("Starting feature engineering...")
    try:
        # Clean and engineer features
        df = engineer_advanced_features(df)

        # Prepare features. Only columns that can be computed without knowing
        # the price are used; the price-derived columns would leak the target.
        categorical_columns = ['age_category', 'province']
        numerical_columns = ['engine_size', 'year', 'mileage', 'age', 'mileage_per_age',
                           'engine_age_interaction',
                           'is_abs', 'is_cbs', 'is_premium', 'age_squared', 'mileage_squared',
                           'normalized_mileage', 'depreciation_factor']

        # Encode categorical variables
        encoded_df = pd.get_dummies(df[categorical_columns])

        # Combine features
        X = pd.concat([df[numerical_columns], encoded_df], axis=1)
        y = df['price']

        # Split once; both the tree models and the network share the same rows.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Scale features for TensorFlow, learning mean/variance from the
        # training rows only and re-using them for the test rows.
        scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),
                                      columns=X.columns, index=X_train.index)
        X_test_scaled = pd.DataFrame(scaler.transform(X_test),
                                     columns=X.columns, index=X_test.index)

        # Create traditional models
        models = {
            'rf': RandomForestRegressor(n_estimators=300, max_depth=20, n_jobs=-1, random_state=42),
            'xgb': XGBRegressor(n_estimators=300, max_depth=8, learning_rate=0.01, random_state=42),
            'gbm': GradientBoostingRegressor(n_estimators=300, max_depth=7, learning_rate=0.01, random_state=42)
        }

        # Create and train TensorFlow model
        tf_model = create_tf_model(X_train_scaled.shape[1])

        # Stop once validation loss has not improved for 10 epochs and roll
        # back to the best weights seen.
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        )

        print("\nTraining TensorFlow model...")
        tf_model.fit(
            X_train_scaled,
            y_train,
            epochs=100,
            batch_size=32,
            validation_split=0.2,
            callbacks=[early_stopping],
            verbose=1
        )

        predictions = {}

        # Train and evaluate traditional models
        for name, model in models.items():
            print(f"\nTraining {name}...")
            model.fit(X_train, y_train)
            pred = model.predict(X_test)
            test_score = r2_score(y_test, pred)
            # cross_val_score clones the estimator, so the fitted `model` is untouched.
            cv_score = cross_val_score(model, X, y, cv=5, scoring='r2').mean()

            predictions[name] = pred
            print(f"{name} Test R² Score: {test_score:.4f}")
            print(f"{name} CV R² Score: {cv_score:.4f}")

        # Evaluate TensorFlow model
        tf_pred = tf_model.predict(X_test_scaled).flatten()
        tf_test_score = r2_score(y_test, tf_pred)
        print(f"\nTensorFlow Test R² Score: {tf_test_score:.4f}")

        predictions['tf'] = tf_pred

        # Fixed ensemble weights including TensorFlow (they sum to 1.0)
        weights = {
            'rf': 0.25,
            'xgb': 0.35,
            'gbm': 0.15,
            'tf': 0.25
        }

        final_pred = sum(weights[name] * predictions[name] for name in weights.keys())
        final_score = r2_score(y_test, final_pred)

        print(f"\nFinal Ensemble R² Score: {final_score:.4f}")

        model_artifacts = {
            'models': models,
            'tf_model': tf_model,
            'scaler': scaler,
            'weights': weights,
            'feature_columns': list(X.columns),
            'categorical_columns': categorical_columns,
            'numerical_columns': numerical_columns
        }

        return model_artifacts, final_score

    except Exception as e:
        print(f"Error in model training: {str(e)}")
        raise

# Script entry point: load the CSV, clean it, train the ensemble, then persist
# both the full artifact bundle (joblib) and the Keras network alone (TFLite).
# In a notebook these names (df, model_artifacts, score, converter, tflite_model)
# end up in the global namespace.
if __name__ == "__main__":
    print("Loading data...")
    # Path is relative to the current working directory.
    df = pd.read_csv('datasetcapstone.csv')

    print("Cleaning data...")
    df = load_and_clean_data(df)
    # Despite the label, this is the shape *after* cleaning.
    print(f"Initial shape: {df.shape}")

    print("\nTraining optimized model...")
    model_artifacts, score = train_optimized_model(df)

    # `score` is the ensemble's R² on the held-out split, shown here as a percentage.
    print(f"\nFinal Model Accuracy: {score*100:.2f}%")

    print("\nSaving model...")
    # The bundle holds the tree models, the Keras model, the scaler, the
    # ensemble weights and the feature/column lists.
    joblib.dump(model_artifacts, 'optimized_price_model_with_tf.joblib')
    print("Model saved successfully!")

    # Convert only the Keras network to TFLite. The tree models, ensemble
    # weights and scaler are not part of the .tflite file, and the network was
    # trained on standardised features, so callers must scale inputs themselves.
    print("\nConverting to TFLite...")
    converter = tf.lite.TFLiteConverter.from_keras_model(model_artifacts['tf_model'])
    # Enable the converter's default optimizations.
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_model = converter.convert()

    # Save TFLite model (convert() output is written in binary mode)
    with open('vario_price_predictor.tflite', 'wb') as f:
        f.write(tflite_model)
    print("TFLite model saved successfully!")

Loading data...
Cleaning data...
Shape after dropping all-NaN rows: (1587, 7)
Shape after cleaning prices: (1587, 7)
Shape after cleaning mileage: (1587, 7)
Shape after extracting engine size: (1587, 8)
Final shape after cleaning: (1587, 9)

Checking for remaining NaN values:
model          0
price          0
year           0
mileage        0
location       0
tax            0
seller_type    0
engine_size    0
province       0
dtype: int64
Initial shape: (1587, 9)

Training optimized model...
Starting feature engineering...

Feature Engineering Stats:
Shape after feature engineering: (1587, 23)

Training TensorFlow model...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
E