# Generate Dummy Data

In [None]:
import pandas as pd
import numpy as np

# Generate a reproducible dummy dataset: three numeric columns, one
# categorical column, one free-text column and an imbalanced 4-class target.
np.random.seed(42)
n_samples = 10000

# Numeric features: standard-normal draws scaled by 10, with NaN written at
# 500 randomly drawn (row, column) positions. Positions are drawn with
# replacement, so the number of distinct missing cells can be below 500.
numerical_features = np.random.randn(n_samples, 3) * 10
numerical_features[np.random.randint(0, n_samples, 500), np.random.randint(0, 3, 500)] = np.nan  # Missing values

# Categorical feature. np.random.choice returns a fixed-width string array
# here; writing np.nan into such an array stores the text 'n' instead of a
# missing value. Casting to object first keeps the entries as real NaN.
categories = ['A', 'B', 'C', 'D', 'E']
categorical_features = np.random.choice(categories, n_samples).astype(object)
categorical_features[np.random.randint(0, n_samples, 300)] = np.nan  # Missing values

# Free-text feature drawn from three fixed sentences
text_data = np.random.choice(["This is a sample text", "Another text data", "More random text data"], n_samples)

# Imbalanced target variable (class probabilities sum to 1)
target_classes = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
class_distribution = [0.7, 0.2, 0.08, 0.02]
target = np.random.choice(target_classes, n_samples, p=class_distribution)

# Create DataFrame
df = pd.DataFrame(numerical_features, columns=["num_1", "num_2", "num_3"])
df['category'] = categorical_features
df['text'] = text_data
df['target'] = target

# Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictors
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Sanity check on the resulting split sizes
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

# Data Preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing-value handling: mode for the categorical column, median for numerics
cat_imputer = SimpleImputer(strategy='most_frequent')
num_imputer = SimpleImputer(strategy='median')

# Standardise numeric features
scaler = StandardScaler()

# One-hot encode the categories. Per the scikit-learn docs, categories seen in
# fewer than 1% of rows are pooled into a single "infrequent" column, and
# unknown categories at transform time are mapped to that column when it exists.
ohe = OneHotEncoder(min_frequency=0.01, handle_unknown='infrequent_if_exist')

# TF-IDF features for the text column, vocabulary capped at 100 terms
tfidf = TfidfVectorizer(max_features=100)

print(df.columns)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Build one sub-pipeline per column type, then combine them.
# ColumnTransformer entries have the form ('name', transformer, columns).
numeric_columns = ['num_1', 'num_2', 'num_3']
numeric_steps = Pipeline([('imputer', num_imputer), ('scaler', scaler)])
categorical_steps = Pipeline([('imputer', cat_imputer), ('ohe', ohe)])

preprocessor = ColumnTransformer([
    ('numerical_transformation', numeric_steps, numeric_columns),
    ('categorical_transformation', categorical_steps, ['category']),
    # 'text' is given as a bare string (not a list) so the vectorizer
    # receives a 1-D column of documents
    ('text_transformation', tfidf, 'text'),
])

# Fit on the training split only; the test split reuses the fitted state
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

- ColumnTransformer applies different transformations to different types of columns in the dataset.
- Each transformation is defined as a tuple: ('name', transformer, columns), where:
  
  - name: A label for the transformation step.
  - transformer: The actual transformation pipeline (e.g., Pipeline, TfidfVectorizer).
  - columns: The specific columns this transformation applies to.

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training data with SMOTE to counter the class imbalance.
# Only the training split is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(
    X_train_transformed,
    y_train,
)

# Models

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

# Fit a random forest on the resampled training data, then predict the test split
model = RandomForestClassifier(random_state=42).fit(X_train_smote, y_train_smote)
pred = model.predict(X_test_transformed)

# Macro-averaged F1 weights every class equally, regardless of its support
f1_macro = f1_score(y_test, pred, average='macro')
report = classification_report(y_test, pred)

# Show the per-class report followed by the macro F1
print("Classification Report:\n", report)
print("F1 Macro Score:", f1_macro)
print("--"*10)

# Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Input, Model
from tensorflow.keras.callbacks import EarlyStopping

# Network dimensions: one input per transformed feature column,
# one output unit per distinct class label in the training targets
input_shape = X_train_smote.shape[1]
output_shape = np.unique(y_train).size
print(input_shape, output_shape, sep="\n")

In [None]:
# Display the distinct class labels present in the SMOTE-resampled targets
np.unique(y_train_smote)

In [None]:
# Class label -> integer id lookup used to encode the targets for the network
dict_map = {
    'Class_1': 0,
    'Class_2': 1,
    'Class_3': 2,
    'Class_4': 3,
}
# Encode every resampled training label as its integer id
y_train_smote_ids = np.array([dict_map[label] for label in y_train_smote])


In [None]:
# NN Model: two hidden blocks followed by a softmax over the classes
inputs = Input(shape=(input_shape,))

x = layers.Dense(64)(inputs)  # No activation yet
x = layers.BatchNormalization()(x)  # Normalize before activation
x = layers.ReLU()(x)  # Apply activation
x = layers.Dropout(0.2)(x)  # Dropout AFTER activation

# Alternative ordering for comparison: ReLU is fused into the Dense layer,
# so BatchNormalization here runs AFTER the activation instead of before it
x = layers.Dense(32, activation='relu')(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.2)(x)

outputs = layers.Dense(output_shape, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
# Targets are integer class ids, hence the sparse categorical cross-entropy loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# model.compile(optimizer='adam', loss='mse' , metrics = ['mae']) # For regression

# Keras carves `validation_split` off the LAST rows of the data, before any
# shuffling. The SMOTE output is passed on unshuffled (NOTE(review): SMOTE
# appears to append its synthetic rows after the original ones - confirm for
# the installed imblearn version), so without this step the validation set
# would not be a representative sample. Shuffle features and labels with one
# shared, seeded permutation first.
shuffle_idx = np.random.default_rng(42).permutation(X_train_smote.shape[0])
X_train_shuffled = X_train_smote[shuffle_idx]
y_train_shuffled = y_train_smote_ids[shuffle_idx]

# Train model
# NOTE: with epochs=3 a patience of 10 can never be exhausted, so early
# stopping cannot trigger here; raise `epochs` for it to take effect.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train_shuffled,
    y_train_shuffled,
    epochs=3,
    batch_size=16,
    verbose=1,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Predictions: class probabilities -> index of the most probable class
y_pred = model.predict(X_test_transformed)
y_pred = np.argmax(y_pred, axis=1)

In [None]:
# Reverse lookup: integer id -> class label
dict_map_inverse = dict(zip(dict_map.values(), dict_map.keys()))
dict_map_inverse

In [None]:
# Translate the predicted integer ids back into their string class labels
y_pred = np.array(list(map(dict_map_inverse.__getitem__, y_pred)))
y_pred

In [None]:
# Evaluate the neural network on the test split. Both metrics must be computed
# from the network's predictions (`y_pred`); `pred` holds the random-forest
# predictions from the earlier cell and would report the wrong model's score.
report = classification_report(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average='macro')

# Print classification reports
print("Classification Report:\n", report)
print("F1 Macro Score:", f1_macro)
print("--"*10)