In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_root = '/kaggle/input'
for folder, _, files in os.walk(input_root):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the gender-classification CSV.
# NOTE(review): the same file is read again into `df1` in a later cell, and `df`
# is not referenced by any other visible cell, so this load looks redundant.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/gender-classification/Transformed Data Set - Sheet1.csv'
)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf

In [None]:
class AutoEDA:
    """Quick exploratory-data-analysis helper around a pandas DataFrame.

    On construction every categorical column (object, string or category
    dtype) is label-encoded so the whole frame can be summarised, correlated
    and plotted numerically.

    Note:
        The DataFrame passed in is stored by reference and encoded IN PLACE;
        no copy is made. Pass ``df.copy()`` if the raw values must survive.

    Attributes:
        df: The (encoded) DataFrame being analysed.
        label_encoders: Mapping of column name -> fitted ``LabelEncoder``,
            usable to map integer codes back to the original labels.
    """

    def __init__(self, df):
        """Store ``df`` and label-encode its categorical columns in place."""
        self.df = df
        self._encode_categorical_columns()

    def _categorical_columns(self):
        """Return the labels of columns with object, string or category dtype."""
        def is_categorical(dtype):
            return (
                pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype)
                or isinstance(dtype, pd.CategoricalDtype)
            )

        return [col for col, dtype in self.df.dtypes.items() if is_categorical(dtype)]

    def _numeric_frame(self):
        """Return the numeric columns of ``self.df`` regardless of bit width."""
        # 'number' also matches int32/float32 etc., which an explicit
        # ['float64', 'int64'] filter would silently drop.
        return self.df.select_dtypes(include='number')

    def _encode_categorical_columns(self):
        """Label-encode every categorical column of ``self.df`` in place.

        The fitted encoder of each column is stored in ``self.label_encoders``.
        """
        self.label_encoders = {}
        for col in self._categorical_columns():
            le = LabelEncoder()
            self.df[col] = le.fit_transform(self.df[col])
            self.label_encoders[col] = le  # kept so codes can be mapped back to labels

    def missing_values(self):
        """Return count and percentage of missing values per column.

        Returns:
            DataFrame with columns 'Missing Values' and 'Percentage', limited
            to columns that have at least one missing value, sorted by count
            in descending order.
        """
        is_missing = self.df.isnull()  # computed once, reused for count and share
        missing_df = pd.DataFrame({
            'Missing Values': is_missing.sum(),
            'Percentage': (is_missing.mean() * 100).round(2),
        })
        has_missing = missing_df['Missing Values'] > 0
        return missing_df[has_missing].sort_values(by='Missing Values', ascending=False)

    def summary_stats(self):
        """Return ``DataFrame.describe()`` for the frame."""
        return self.df.describe()

    def correlation_matrix(self):
        """Plot an annotated heatmap of the pairwise correlations."""
        # numeric_only=True keeps any leftover non-numeric column from raising.
        corr = self.df.corr(numeric_only=True)
        if corr.empty:
            print("No numeric columns available for a correlation matrix.")
            return
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
        ax.set_title('Correlation Matrix')
        plt.show()

    def target_correlation(self, target):
        """Return each numeric column's correlation with ``target``.

        Args:
            target: Label of the target column.

        Returns:
            Series of correlations sorted in descending order.

        Raises:
            KeyError: If ``target`` is not a numeric column of the frame.
        """
        corr = self.df.corr(numeric_only=True)
        if target not in corr.columns:
            raise KeyError(f"Target column {target!r} is missing or not numeric")
        return corr[target].sort_values(ascending=False)

    def plot_distributions(self):
        """Plot a histogram for every numeric column."""
        numeric_df = self._numeric_frame()
        if numeric_df.shape[1] == 0:
            print("No numeric columns available for distribution plots.")
            return
        numeric_df.hist(figsize=(15, 15), bins=20, color='blue', edgecolor='black')
        plt.show()

    def outlier_detection(self):
        """Draw one box plot per numeric column, three plots per row."""
        numeric_df = self._numeric_frame()
        n_plots = numeric_df.shape[1]
        if n_plots == 0:
            print("No numeric columns available for outlier detection.")
            return

        plots_per_row = 3
        # Ceiling division: exactly as many rows as needed (no spare empty row).
        n_rows = (n_plots + plots_per_row - 1) // plots_per_row
        fig, axes = plt.subplots(
            n_rows, plots_per_row, figsize=(12, 3 * n_rows), squeeze=False
        )
        flat_axes = axes.ravel()
        for ax, (col, values) in zip(flat_axes, numeric_df.items()):
            sns.boxplot(x=values, ax=ax)
            ax.set_title(f'Outlier Detection: {col}')
        # Hide the unused cells of the last row.
        for ax in flat_axes[n_plots:]:
            ax.set_visible(False)
        fig.tight_layout()
        plt.show()

    def run_eda(self, target=None):
        """Run every EDA step in sequence.

        Args:
            target: Optional label of the target column; when given, the
                correlation of every column with it is printed as well.
        """
        print("Missing Values:")
        print(self.missing_values())
        print("\nSummary Statistics:")
        print(self.summary_stats())
        self.plot_distributions()
        self.correlation_matrix()
        # `is not None` so falsy-but-valid labels (e.g. a column named 0) still work.
        if target is not None:
            print(f"\nCorrelation with Target ({target}):")
            print(self.target_correlation(target))
        self.outlier_detection()

# Load a fresh copy of the data and run the full EDA report on it.
# AutoEDA label-encodes df1's categorical columns in place, so from here on
# df1 holds integer codes rather than the original labels.
df1 = pd.read_csv(
    filepath_or_buffer='/kaggle/input/gender-classification/Transformed Data Set - Sheet1.csv'
)
eda = AutoEDA(df=df1)
eda.run_eda('Gender')

In [None]:
# Preview the first rows of df1 (already label-encoded in place by AutoEDA above).
df1.head()

# Split features (X) and target (y)

In [None]:
# The target is the 'Gender' column; the features are every other column.
y = df1['Gender']
X = df1.drop('Gender', axis=1)
for part in (X, y):
    print(part.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 113}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Build Model

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(113)

# One input per feature column. An explicit Input layer replaces the
# `input_dim=` argument on Dense, which is deprecated in Keras 3.
n_features = X_train.shape[1]
model = Sequential([
    tf.keras.Input(shape=(n_features,)),
    Dense(64, activation='relu'),    # First hidden layer
    Dense(64, activation='relu'),    # Second hidden layer
    Dense(1, activation='sigmoid'),  # Output layer: a single probability
])

In [None]:
# Single sigmoid output -> binary cross-entropy loss; accuracy is tracked per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 20 epochs.
# NOTE(review): the held-out test split doubles as validation data here, so the
# validation curves are not independent of the final test score.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
)

In [None]:
# evaluate() yields the loss followed by the single compiled metric (accuracy).
test_metrics = model.evaluate(X_test, y_test, verbose=2)
test_loss, test_acc = test_metrics
print(f'Test accuracy: {test_acc:.3f}')

# Model predict

In [None]:
# Predict on the first five held-out rows; the model's single sigmoid unit
# yields one probability per row, i.e. an array of shape (5, 1).
predictions = model.predict(X_test[:5])

# argmax over axis 1 of a one-column output is always 0, so it cannot recover
# the class. Threshold the probability at 0.5 instead (the cut-off Keras'
# binary accuracy uses by default).
predicted_classes = tf.cast(predictions[:, 0] > 0.5, tf.int64)
print("Predicted classes:", predicted_classes.numpy())

In [None]:
import matplotlib.pyplot as plt

# Plot accuracy and loss graphs
def plot_training_history(history):
    """Plot per-epoch accuracy and loss curves side by side.

    Training curves are drawn for 'accuracy' and 'loss'; the matching
    'val_accuracy' / 'val_loss' curves are added only when present, so a
    history recorded without validation data no longer raises ``KeyError``.

    Args:
        history: Object exposing a ``history`` dict that maps metric names to
            per-epoch value lists (e.g. the return value of ``model.fit``).
    """
    metrics = history.history
    # (history key, panel title, y-axis label) for each subplot.
    panels = (
        ('accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'Model Loss', 'Loss'),
    )

    fig, axes = plt.subplots(1, len(panels), figsize=(12, 4))
    for ax, (key, title, ylabel) in zip(axes, panels):
        if key in metrics:
            ax.plot(metrics[key], label='Train')
        val_key = f'val_{key}'
        if val_key in metrics:
            ax.plot(metrics[val_key], label='Validation')
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')
        # Only draw a legend when at least one curve was plotted.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend(loc='upper left')

    plt.show()

# Plot the curves recorded in `history`, the object returned by model.fit() in an earlier cell.
plot_training_history(history)
