# Machine Learning

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Load cleaned data
cleaned_data = pd.read_csv("../data/cleaned_dogs_data.csv")

# Function to clean numeric columns
def clean_numeric_column(column):
    return column.replace('[\$,]', '', regex=True).astype(float)

# Clean the relevant columns
cleaned_data['PURCHASE PRICE'] = clean_numeric_column(cleaned_data['PURCHASE PRICE'])
cleaned_data['FOOD COSTS PER YEAR'] = clean_numeric_column(cleaned_data['FOOD COSTS PER YEAR'])

# Encode 'GROOMING FREQUNCY' column
label_encoder = LabelEncoder()
cleaned_data['GROOMING FREQUNCY'] = label_encoder.fit_transform(cleaned_data['GROOMING FREQUNCY'])

# Feature Selection
features = ['LONGEVITY(YEARS)', 'NUMBER OF GENETIC AILMENTS', 'PURCHASE PRICE', 'FOOD COSTS PER YEAR', 'GROOMING FREQUNCY']
X = cleaned_data[features]
y = cleaned_data['SUITABILITY FOR CHILDREN']

# Ensure the target variable is numeric if needed
if y.dtype == 'object':
    y = y.astype('category').cat.codes

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=1))


  return column.replace('[\$,]', '', regex=True).astype(float)


Accuracy: 0.5666666666666667
Confusion Matrix:
 [[5 2 2]
 [6 5 1]
 [1 1 7]]
Classification Report:
               precision    recall  f1-score   support

           1       0.42      0.56      0.48         9
           2       0.62      0.42      0.50        12
           3       0.70      0.78      0.74         9

    accuracy                           0.57        30
   macro avg       0.58      0.58      0.57        30
weighted avg       0.59      0.57      0.56        30

