# Generate Dummy Data

In [1]:
import pandas as pd
import numpy as np

# Generate dummy dataset
np.random.seed(42)
n_samples = 10000

# Create features
numerical_features = np.random.randn(n_samples, 3) * 10
numerical_features[np.random.randint(0, n_samples, 500), np.random.randint(0, 3, 500)] = np.nan  # Missing values

categories = ['A', 'B', 'C', 'D', 'E']
categorical_features = np.random.choice(categories, n_samples)
categorical_features[np.random.randint(0, n_samples, 300)] = np.nan  # Missing values

text_data = np.random.choice(["This is a sample text", "Another text data", "More random text data"], n_samples)

# Imbalanced target variable
target_classes = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
class_distribution = [0.7, 0.2, 0.08, 0.02]
target = np.random.choice(target_classes, n_samples, p=class_distribution)

# Create DataFrame
df = pd.DataFrame(numerical_features, columns=["num_1", "num_2", "num_3"])
df['category'] = categorical_features
df['text'] = text_data
df['target'] = target

In [2]:
df

Unnamed: 0,num_1,num_2,num_3,category,text,target
0,4.967142,-1.382643,6.476885,B,More random text data,Class_2
1,15.230299,-2.341534,-2.341370,A,This is a sample text,Class_1
2,15.792128,7.674347,-4.694744,D,More random text data,Class_1
3,5.425600,-4.634177,-4.657298,B,This is a sample text,Class_1
4,2.419623,,-17.249178,D,More random text data,Class_2
...,...,...,...,...,...,...
9995,-3.435382,-3.940473,2.579958,A,More random text data,Class_1
9996,10.442918,16.029643,-2.185383,E,Another text data,Class_1
9997,18.248892,8.767068,-12.320085,A,More random text data,Class_1
9998,-12.612083,-5.772879,14.446040,A,More random text data,Class_1


In [3]:
df['target'].value_counts()

target
Class_1    6874
Class_2    2109
Class_3     820
Class_4     197
Name: count, dtype: int64

# Train Test Split

In [4]:
from sklearn.model_selection import train_test_split

# Train-test split
X, y = df.drop(columns=['target']), df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(8000, 5) (2000, 5)
(8000,) (2000,)


# Data pre processing

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Impute missing values
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# One-Hot Encoding with 'Other' category handling
ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', min_frequency=0.01)  # Less frequent categories go to 'other'

# Standard Scaling
scaler = StandardScaler()

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=100)

In [6]:
df.columns

Index(['num_1', 'num_2', 'num_3', 'category', 'text', 'target'], dtype='object')

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Column Transformer
preprocessor = ColumnTransformer([
    ('numerical_transformation', Pipeline([('imputer', num_imputer), ('scaler', scaler)]), ['num_1', 'num_2', 'num_3']),
    ('categorical_transformation', Pipeline([('imputer', cat_imputer), ('ohe', ohe)]), ['category']),
    ('text_transformation', tfidf, 'text')
])

# Apply transformations
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

- ColumnTransformer applies different transformations to different types of columns in the dataset.
- Each transformation is defined as a tuple: ('name', transformer, columns), where:
  
  - name: A label for the transformation step.
  - transformer: The actual transformation pipeline (e.g., Pipeline, TfidfVectorizer).
  - columns: The specific columns this transformation applies to.

In [8]:
from imblearn.over_sampling import SMOTE
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_transformed, y_train)

# Models

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

model = RandomForestClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)
pred = model.predict(X_test_transformed)

report = classification_report(y_test, pred)
f1_macro = f1_score(y_test, pred, average='macro')

# Print classification reports
print("Classification Report:\n", report)
print("F1 Macro Score:", f1_macro)
print("--"*10)

Classification Report:
               precision    recall  f1-score   support

     Class_1       0.69      0.59      0.64      1375
     Class_2       0.20      0.22      0.21       422
     Class_3       0.06      0.10      0.08       164
     Class_4       0.01      0.03      0.02        39

    accuracy                           0.46      2000
   macro avg       0.24      0.23      0.23      2000
weighted avg       0.52      0.46      0.49      2000

F1 Macro Score: 0.2345918172837115
--------------------


# Neural Network

In [10]:
import tensorflow as tf
from tensorflow.keras import layers, Input, Model
from tensorflow.keras.callbacks import EarlyStopping

input_shape = X_train_smote.shape[1]
output_shape = len(np.unique(y_train))
print(input_shape)
print(output_shape)

17
4


In [11]:
np.unique(y_train_smote)

array(['Class_1', 'Class_2', 'Class_3', 'Class_4'], dtype=object)

In [12]:
dict_map = {'Class_1':0, 'Class_2':1, 'Class_3':2, 'Class_4':3}
y_train_smote_ids = np.array([dict_map[y] for y in y_train_smote])


In [30]:
# NN Model
inputs = Input(shape=(input_shape,))

x = layers.Dense(64)(inputs)  # No activation yet
x = layers.BatchNormalization()(x)  # Normalize before activation
x = layers.ReLU()(x)  # Apply activation
x = layers.Dropout(0.2)(x)  # Dropout AFTER activation

x = layers.Dense(32, activation='relu')(x) # just to show that relu can also be applied here, but it is less efficient
x = layers.BatchNormalization()(x) 
x = layers.Dropout(0.2)(x)

outputs = layers.Dense(output_shape, activation='softmax')(x)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy' , metrics = ['accuracy'])
# model.compile(optimizer='adam', loss='mse' , metrics = ['mae']) # Fore regression

# Train model
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)              
history = model.fit(X_train_smote, y_train_smote_ids, epochs=3, batch_size=16, verbose=1, validation_split=0.2, callbacks=[early_stopping])

# Predictions
y_pred = model.predict(X_test_transformed)
y_pred = np.argmax(y_pred, axis=1)

Epoch 1/3
[1m1100/1100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 16ms/step - accuracy: 0.3024 - loss: 1.4774 - val_accuracy: 0.0000e+00 - val_loss: 2.6658
Epoch 2/3
[1m1100/1100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 17ms/step - accuracy: 0.3333 - loss: 1.2743 - val_accuracy: 0.0000e+00 - val_loss: 2.6882
Epoch 3/3
[1m1100/1100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 16ms/step - accuracy: 0.3397 - loss: 1.2647 - val_accuracy: 0.0000e+00 - val_loss: 2.7290
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [31]:
dict_map_inverse = {v:k for k, v in dict_map.items()}
dict_map_inverse

{0: 'Class_1', 1: 'Class_2', 2: 'Class_3', 3: 'Class_4'}

In [35]:
y_pred = np.array([dict_map_inverse[y] for y in y_pred])
y_pred

array(['Class_2', 'Class_3', 'Class_1', ..., 'Class_1', 'Class_1',
       'Class_2'], dtype='<U7')

In [36]:
report = classification_report(y_test, y_pred)
f1_macro = f1_score(y_test, pred, average='macro')

# Print classification reports
print("Classification Report:\n", report)
print("F1 Macro Score:", f1_macro)
print("--"*10)

Classification Report:
               precision    recall  f1-score   support

     Class_1       0.69      0.51      0.59      1375
     Class_2       0.23      0.28      0.25       422
     Class_3       0.07      0.20      0.11       164
     Class_4       0.00      0.00      0.00        39

    accuracy                           0.43      2000
   macro avg       0.25      0.25      0.24      2000
weighted avg       0.53      0.43      0.47      2000

F1 Macro Score: 0.2345918172837115
--------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
