In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# from hyperopt import fmin, tpe, hp
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# 1. Reading data from csv
def read_csv(file_path):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_path
        Location of the CSV; forwarded unchanged as the first argument
        of ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using pandas' default parsing options.
    """
    frame = pd.read_csv(file_path)
    return frame

# 2. Creating features
def create_features(data):
    """Feature-engineering hook; currently a no-op.

    Parameters
    ----------
    data
        The input dataset.

    Returns
    -------
    The very same object that was passed in, unmodified.
    """
    # Placeholder: no derived features are defined yet.
    features = data
    return features

# 3. Training a classifier model
def train_classifier(data, target="species", test_size=0.2, random_state=42):
    """Train a random-forest classifier on ``data`` and score it on a hold-out split.

    The feature matrix and labels are derived from the ``data`` argument
    itself, so the function does not depend on notebook-global ``X``/``y``
    being defined beforehand.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing the feature columns and the ``target`` column.
    target : str, default "species"
        Name of the label column; every other column is used as a feature.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed used for both the train/test split and the forest, so repeated
        calls on the same data give the same result.

    Returns
    -------
    tuple
        ``(model, accuracy)`` — the fitted ``RandomForestClassifier`` and its
        accuracy on the held-out rows.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    # Fail early with a clear message instead of an error deep inside sklearn.
    if target not in data.columns:
        raise KeyError(f"target column {target!r} not found in data")

    # Local X / y deliberately built from the argument, not read from globals.
    X = data.drop(columns=[target])
    y = data[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, model.predict(X_test))

    return model, accuracy

# 4. Hyperparameter tuning using hyperopt
def objective(params):
    """Loss function for a hyperparameter search over random-forest settings.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.

    Returns
    -------
    The negated mean 5-fold cross-validation score, so that a minimizer
    ends up maximizing the score.

    Notes
    -----
    ``X`` and ``y`` are not parameters: they are looked up in the enclosing
    (notebook-global) scope at call time and must be defined before use.
    """
    candidate = RandomForestClassifier(**params)
    fold_scores = cross_val_score(candidate, X, y, cv=5)
    mean_score = fold_scores.mean()

    # Sign flip: the search minimizes, we want the highest score.
    return -mean_score

# 5. Evaluating model on test set
def evaluate_model(model, X_test, y_test):
    """Return the accuracy of ``model`` on the given evaluation data.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test
        Feature rows to predict on.
    y_test
        True labels corresponding to ``X_test``.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)




In [3]:
# Load the raw iris CSV (path is relative to the notebook's working directory)
file_path = "../data/raw/iris.csv"
data = read_csv(file_path=file_path)

In [4]:
# Display the loaded DataFrame as a quick sanity check of its columns and rows
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [5]:
# Create features
data = create_features(data)

# Split data into features and target
X = data.drop('species', axis = 1)
y = data['species']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline: standardize every feature column, then fit a random forest
pipeline = Pipeline([
    ('preprocessor', ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), X.columns)
        ],
        remainder='passthrough'
    )),
    # Seed the forest too, so a fresh "Restart & Run All" reproduces the same model
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train a model
pipeline.fit(X_train, y_train)


In [6]:
# Score the fitted pipeline on the held-out split
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model accuracy on test set: {accuracy}")

Model accuracy on test set: 1.0


In [8]:
# Inspect the pipeline's steps by name (these names are the prefixes used
# when addressing step hyperparameters)
pipeline.named_steps

{'preprocessor': ColumnTransformer(remainder='passthrough',
                   transformers=[('num', StandardScaler(),
                                  Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object'))]),
 'classifier': RandomForestClassifier()}

In [9]:
# Grid search
# Pipeline hyperparameters are addressed as "<step name>__<parameter>";
# the random forest is registered under the step name 'classifier'.
params = {
    'classifier__max_depth':[1,2,3,4,5,None]
}

In [12]:
# from sklearn.model_selection import GridSearchCV
# grid = GridSearchCV(pipeline, params, cv = 5, scoring = 'accuracy')
# grid.fit(X_train, y_train)