In [11]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
class DataProcessor:
    def __init__(self, data_file: str):
        self.data_file = data_file

    def transform_data(self, df: pd.DataFrame):
        # Split data into train and test sets
        df = pd.read_csv(self.data_file)
        train_df, val_df = train_test_split(df, test_size=0.2, random_state=24)

        return train_df, val_df
    
    def save_data(self, df, file_name: str):
        df.to_csv(file_name, index=False)

In [3]:
data_processor = DataProcessor('data.csv')

In [8]:
train_data, val_df = data_processor.transform_data(data_processor)

In [10]:
data_processor.save_data(train_data, 'train_data.csv')
data_processor.save_data(val_df, 'validation_data.csv')

In [12]:
class MyModel:
    """
    A class representing a machine learning model trained using RandomForestClassifier.
    """
    def __init__(self):
        """
        Constructor method to initialize the class instance with default values.
        """
        self.model = None
    
    def train_model(self, train_data_file: str):
    
        # Load the train data file into a Pandas DataFrame.
        train_df = pd.read_csv(train_data_file)

        # Separate the target and features
        feature = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi']
        target = ['price_range']

        # X_train & y_train
        X_train = train_df[feature]
        y_train = train_df[target]

        # Transformer:數值型特徵處理
        num = train_df.columns.drop('price_range').to_list()
        ct = ColumnTransformer([('num_col',MinMaxScaler(), num)],remainder='passthrough')

        # 透過 Pipeline 串接 Transformer & 模型
        estimator = Pipeline([('preprocessing', ct), ('rf', RandomForestClassifier())])

        # 訓練模型
        self.model = estimator.fit(X_train, y_train)

        # Save the trained model as a joblib file.
        joblib.dump(self.model, 'trained_model.joblib')

    def evaluate_model(self, validation_data_file: str):
        # Load the test data file into a Pandas DataFrame.
        val_df = pd.read_csv(validation_data_file)

        # Separate the target and features
        feature = ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi']
        target = ['price_range']

        # X_val & y_val
        X_val = val_df[feature]
        y_val = val_df[target]

        # Use the trained model to make predictions on the validation data.
        pred = self.model.predict(X_val)

        # Calculate the evaluation metrics for the predictions.
        eval_accuracy = accuracy_score(y_val, pred)

        return eval_accuracy


In [15]:
model = MyModel()
model.train_model('train_data.csv')
model.evaluate_model('validation_data.csv')

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


0.88