## CLASSIFICATION LIBRARIES

In [None]:
# Data manipulation
import numpy as np
import pandas as pd



# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,precision_score,recall_score
from sklearn.impute import SimpleImputer


# Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


# Encoding Libraries
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import OrdinalEncoder, TargetEncoder



#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import optuna

## REGRESSION LIBRARIES

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Algorithms
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Encoding Libraries
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from category_encoders import OrdinalEncoder, TargetEncoder


#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import optuna

In [None]:
from datasist.structdata import detect_outliers
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, LabelEncoder
from category_encoders import BinaryEncoder
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE



In [None]:
!pip install autoglucon

ERROR: Could not find a version that satisfies the requirement autoglucon (from versions: none)
ERROR: No matching distribution found for autoglucon


In [None]:
# Print the (rows, columns) tuple of `df`, then leave `df.head()` as the last
# expression of the cell so its result (the first rows) is the cell's value.
# NOTE(review): `df` is not defined in the cells visible here — confirm it is
# loaded before this cell runs.
print(df.shape)
df.head()

In [None]:
# Fill missing values in a categorical column with the sentinel label 'unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `data[...]`: the in-place form mutates an
# intermediate Series, which pandas warns about (chained assignment) and which
# does not update `data` at all once Copy-on-Write is active.
data['categorical_column'] = data['categorical_column'].fillna(value='unknown')

# Fill missing values in a numerical column with that column's median.
data['numerical_column'] = data['numerical_column'].fillna(
    value=data['numerical_column'].median()
)


In [None]:

# Build both imputers up front: a constant 'unknown' label for the categorical
# column and the column median for the numerical column.
imputer_cat = SimpleImputer(strategy='constant', fill_value='unknown')
imputer_num = SimpleImputer(strategy='median')

# The double brackets keep the selection two-dimensional (a one-column frame)
# for fit_transform; the transformed values are then written back in place of
# the original column.
categorical_filled = imputer_cat.fit_transform(data[['categorical_column']])
data['categorical_column'] = categorical_filled

numerical_filled = imputer_num.fit_transform(data[['numerical_column']])
data['numerical_column'] = numerical_filled


In [None]:

# Column(s) to rescale, kept as a list so every selection below is a frame.
scaled_columns = ['numerical_column']

# Step 1: MinMaxScaler, fitted on and applied to the column.
scaler_minmax = MinMaxScaler()
minmax_values = scaler_minmax.fit_transform(data[scaled_columns])
data[scaled_columns] = minmax_values

# Step 2: StandardScaler, fitted on and applied to the same column, i.e. to the
# values already produced by step 1.
scaler_standard = StandardScaler()
standard_values = scaler_standard.fit_transform(data[scaled_columns])
data[scaled_columns] = standard_values



In [None]:
# Numerical part of `data`: collect the labels of all 'number' dtype columns,
# then take those columns as a separate frame.
numerical_columns = data.select_dtypes(include='number').columns
numerical_data = data.loc[:, numerical_columns]

# Categorical part of `data`: same idea for 'object' and 'category' dtypes.
categorical_columns = data.select_dtypes(include=['object', 'category']).columns
categorical_data = data.loc[:, categorical_columns]

## PIPELINE

In [None]:

# --- Column groups handled by the preprocessor --------------------------------
numeric_features = ['numeric_feature_1', 'numeric_feature_2']
categorical_features = ['categorical_feature_1', 'categorical_feature_2']

# --- Per-type preprocessing ----------------------------------------------------
# Numeric columns: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: most-frequent imputation, then one-hot encoding created
# with handle_unknown='ignore'.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# --- Stand-alone scaler + SVC pipelines ------------------------------------
# Two otherwise identical pipelines that differ only in the scaler; SVC is an
# example classifier and can be replaced with any other model.
standard_scaler_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC()),
])

minmax_scaler_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('svm', SVC()),
])

# --- Full pipeline: preprocessing followed by the model ------------------------
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): `X` and `y` are not defined in the cells visible here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Fit preprocessing and classifier together on the training split only.
pipeline.fit(X_train, y_train)


In [None]:
# Folder that holds train.csv and test.csv (machine-specific absolute path).
# The doubled trailing backslash is needed because a raw string literal cannot
# end in a single backslash.
directory = r'C:\Users\ehina\OneDrive\Desktop\Py-DS-ML-Bootcamp-master\playground-series-s4e5\\'

# Read both files, train first, by appending the file name to the folder path.
train, test = (
    pd.read_csv(directory + file_name) for file_name in ('train.csv', 'test.csv')
)

# Report (rows, columns) of each frame, one per line.
print(train.shape, test.shape, sep='\n')

(1117957, 22)
(745305, 21)


In [None]:
# Per-column count of missing values in `train`.
# (The closing parenthesis of print(...) was missing, which made the whole
# cell a SyntaxError.)
print(train.isnull().sum())

# Number of rows in `train` that repeat an earlier row.
print(train.duplicated().sum())

In [None]:
# Histogram with a KDE overlay for every numeric column of `train`,
# one figure per column.
for column in train.select_dtypes(include='number').columns:
    sns.histplot(train[column], kde=True)
    plt.title(f'Distribution of {column}')
    plt.show()

# Countplot for every object/category column of `train`, one figure per column.
# The column names come from `train`, so the plot must read from `train` too;
# the previous `data=df` pointed the plot at a different frame.
for column in train.select_dtypes(include=['object', 'category']).columns:
    sns.countplot(data=train, x=column)
    plt.title(f'Countplot of {column}')
    plt.xticks(rotation=45)  # tilt the tick labels
    plt.show()

In [None]:
# Feed-forward network: two hidden Dense layers (64 and 32 units, ReLU), each
# followed by Dropout(0.2), then a single-unit output layer.
# NOTE(review): Sequential, Dense, Dropout and Adam are not imported in any
# cell visible here — presumably Keras (e.g. tensorflow.keras); confirm and
# add the import before running this cell.
model = Sequential([
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1)  # Output layer with one neuron for regression
])
# Compile with the Adam optimizer (learning rate 0.001) and mean squared error
# as the loss.
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

## EDA

In [None]:
# 2x3 grid of exploratory plots on `df`, 13 wide by 8 high.
fig, ax = plt.subplots(2, 3, figsize=(13, 8))

# Secondary y-axis on the "children" panel so the charges line is drawn over
# the counts it belongs to. Previously the line was drawn on a twin of the
# "region" panel (whose x-axis holds regions, not child counts) while this
# twin axis stayed unused.
ax2 = ax[0, 0].twinx()

# Top row: children counts with the charges line on the secondary axis,
# sex counts split by smoker, region counts.
sns.countplot(data=df, x="children", ax=ax[0, 0])
# ci=None disables the confidence band; seaborn >= 0.12 spells this
# errorbar=None and emits a deprecation warning for `ci`.
sns.lineplot(data=df, x="children", y="charges", ax=ax2, color='r', marker='o', ci=None)
sns.countplot(data=df, x="sex", ax=ax[0, 1], hue='smoker')
sns.countplot(data=df, x="region", ax=ax[0, 2])

# Bottom row: distribution of charges, then charges by smoker and by sex.
sns.histplot(data=df, x="charges", ax=ax[1, 0], kde=True)
sns.boxplot(data=df, y="charges", x='smoker', ax=ax[1, 1])
sns.boxplot(data=df, y="charges", ax=ax[1, 2], x='sex')

In [None]:
# f1_score is called below but is not among the sklearn.metrics names imported
# at the top of the notebook, so import it here to avoid a NameError.
from sklearn.metrics import f1_score

# Predict on the held-out split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Evaluation metrics; precision, recall and F1 use average='weighted'.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)