In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load data
def load_and_explore_data():
    df = pd.read_csv('dataset.csv')
    return df

df = load_and_explore_data()

In [2]:
def create_visualizations(df):
    # Set up the plotting style
    plt.style.use('seaborn')
    
    # Create a figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # Distribution of costs
    sns.histplot(data=df, x='custo', bins=30, ax=axes[0,0])
    axes[0,0].set_title('Distribuição dos Custos')
    
    # Costs by smoking status
    sns.boxplot(data=df, x='fumador', y='custo', ax=axes[0,1])
    axes[0,1].set_title('Custos por Status de Fumante')
    
    # Relationship between BMI and cost
    sns.scatterplot(data=df, x='imc', y='custo', hue='fumador', ax=axes[1,0])
    axes[1,0].set_title('Relação entre IMC e Custo')
    
    # Average cost by age class
    sns.barplot(data=df, x='class_etaria', y='custo', ax=axes[1,1])
    axes[1,1].set_title('Custo Médio por Classe Etária')
    axes[1,1].tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.show()

# Create visualizations
create_visualizations(df)

OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

In [None]:
def preprocess_data(df):
    # Create dummy variables for categorical columns
    categorical_columns = ['genero', 'estado_civil', 'zona_residencia', 'fumador', 'class_etaria']
    df_encoded = pd.get_dummies(df, columns=categorical_columns)
    
    # Separate features and target
    X = df_encoded.drop('custo', axis=1)
    y = df_encoded['custo']
    
    # Scale the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns)
    
    return X_scaled, y, scaler, X.columns

# Preprocess data
X, y, scaler, feature_names = preprocess_data(df)

In [None]:
def train_model(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define parameter grid for GridSearchCV
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto'],
        'kernel': ['rbf']  # Simplified parameter grid for faster execution
    }
    
    # Create and train model using GridSearchCV
    svm = SVR()
    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get best model
    best_model = grid_search.best_estimator_
    
    # Evaluate model
    train_score = r2_score(y_train, best_model.predict(X_train))
    test_score = r2_score(y_test, best_model.predict(X_test))
    
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Train R² Score: {train_score:.4f}")
    print(f"Test R² Score: {test_score:.4f}")
    
    return best_model

# Train model
model = train_model(X, y)