In [None]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
import opendatasets as od


Importing all the datasets

In [None]:

od.download("https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio")


Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
Downloading ravdess-emotional-speech-audio.zip to ./ravdess-emotional-speech-audio


100%|██████████| 429M/429M [00:04<00:00, 101MB/s] 





converting ravdess audio dataset into a csv file

In [None]:
import os
import pandas as pd
import kagglehub


def create_ravdess_dataset(directory_path):
    emotion_map = {
        '01': 'neutral',
        '02': 'calm',
        '03': 'happy',
        '04': 'sad',
        '05': 'angry',
        '06': 'fearful',
        '07': 'disgust',
        '08': 'surprised'
    }
    dataset = []
    for subdir, _, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.wav'):
                parts = file.split('-')
                if len(parts) > 2:
                    emo_code = parts[2]
                    if emo_code in emotion_map:
                        emotion = emotion_map[emo_code]
                        full_path = os.path.join(subdir, file)
                        dataset.append({'path': full_path, 'emotion': emotion})
    return dataset

# Use the correct path for extraction
ravdess_path = './ravdess-emotional-speech-audio' # Define ravdess_path
ravdess_data = create_ravdess_dataset(ravdess_path)
df_ravdess = pd.DataFrame(ravdess_data)

# Save and display all data
df_ravdess.to_csv('ravdess.csv', index=False)

In [None]:
df_ravdess

Unnamed: 0,path,emotion
0,./ravdess-emotional-speech-audio/Actor_17/03-0...,angry
1,./ravdess-emotional-speech-audio/Actor_17/03-0...,surprised
2,./ravdess-emotional-speech-audio/Actor_17/03-0...,surprised
3,./ravdess-emotional-speech-audio/Actor_17/03-0...,happy
4,./ravdess-emotional-speech-audio/Actor_17/03-0...,sad
...,...,...
2875,./ravdess-emotional-speech-audio/Actor_08/03-0...,happy
2876,./ravdess-emotional-speech-audio/Actor_08/03-0...,sad
2877,./ravdess-emotional-speech-audio/Actor_08/03-0...,disgust
2878,./ravdess-emotional-speech-audio/Actor_08/03-0...,surprised


In [None]:
df_ravdess["emotion"].value_counts()

Unnamed: 0_level_0,count
emotion,Unnamed: 1_level_1
angry,384
surprised,384
happy,384
sad,384
fearful,384
disgust,384
calm,384
neutral,192


Performing MFCC for feature extraction

Features extracted by MFCC are :  vocals Timbre, Pitch and intonation , Loudness and Energy Dynamics and phonetic content


Main : NMFCC - we use this for selecting the specific features from the audio spectrogram we have n is the number of features we will be selecting


Max_PAD_LEN - Is used to set maximum length to the audio files

In [None]:
import librosa
import numpy as np
import pandas as pd
import os

def extract_features(df_ravdess,mfcc=13):
  features=[]

  for idx, row in df_ravdess.iterrows():
    try:
      if not os.path.exists(row['path']):
        print(f'File not found: {row["path"]}')
        continue


      y,sr=librosa.load(row['path'],sr=16000)

      if len(y)==0:
        print(f'Empty audio file: {row["path"]}')
        continue
      #mfccs

      mfccs=librosa.feature.mfcc(y=y,sr=sr,n_mfcc=mfcc)
      mfccs_mean=np.mean(mfccs,axis=1)

      #chromagram
      chroma=librosa.feature.chroma_stft(y=y,sr=sr)
      chroma_mean=np.mean(chroma,axis=1) # Corrected 'meam' to 'mean'

      #can use mel-spectrogram

      features.append({
          'mfccs': mfccs_mean,
          'chroma':chroma_mean,
          'emotion':row['emotion']
      })

      print(f"Successfully processsed file {idx+1}: {os.path.basename(row['path'])}")

    except Exception as e:
      print(f"Error Processing file{row['path']}:{str(e)}")
      print(f"Error Type: {type(e).__name__}")
  return features

# Use the correct path for extraction
ravdess_path = './ravdess-emotional-speech-audio' # Define ravdess_path
ravdess_data = create_ravdess_dataset(ravdess_path) # Keep create_ravdess_dataset call as it's needed
df_ravdess = pd.DataFrame(ravdess_data) # Keep DataFrame creation

In [None]:
extracted_features = extract_features(df_ravdess)
display(extracted_features)

Successfully processsed file 1: 03-01-05-01-01-01-17.wav
Successfully processsed file 2: 03-01-08-02-02-02-17.wav
Successfully processsed file 3: 03-01-08-02-01-01-17.wav
Successfully processsed file 4: 03-01-03-02-02-02-17.wav
Successfully processsed file 5: 03-01-04-01-02-02-17.wav
Successfully processsed file 6: 03-01-04-01-01-01-17.wav
Successfully processsed file 7: 03-01-06-02-02-02-17.wav
Successfully processsed file 8: 03-01-07-01-01-01-17.wav
Successfully processsed file 9: 03-01-02-02-02-01-17.wav
Successfully processsed file 10: 03-01-06-01-02-02-17.wav
Successfully processsed file 11: 03-01-03-01-02-02-17.wav
Successfully processsed file 12: 03-01-06-01-01-02-17.wav
Successfully processsed file 13: 03-01-04-02-02-01-17.wav
Successfully processsed file 14: 03-01-07-02-01-01-17.wav
Successfully processsed file 15: 03-01-03-02-01-02-17.wav
Successfully processsed file 16: 03-01-01-01-02-01-17.wav
Successfully processsed file 17: 03-01-02-01-01-02-17.wav
Successfully processsed

[{'mfccs': array([-636.48724  ,   53.280937 ,    5.460449 ,   21.589674 ,
           10.845659 ,    8.578912 ,    3.4164295,    1.2602779,
            4.551249 ,    2.2888138,    3.098254 ,    7.0163946,
            1.0258842], dtype=float32),
  'chroma': array([0.6068325 , 0.5561984 , 0.5198484 , 0.5507194 , 0.561074  ,
         0.56359714, 0.6101416 , 0.5692949 , 0.50750965, 0.5539096 ,
         0.6334105 , 0.5652408 ], dtype=float32),
  'emotion': 'angry'},
 {'mfccs': array([-455.3738    ,   30.516335  ,    4.952711  ,    3.8039374 ,
           -0.61687565,    2.3925638 ,   -1.1002429 ,   -2.3227339 ,
           -3.3577921 ,    2.4469945 ,   -5.4739056 ,    0.68742794,
           -3.9409778 ], dtype=float32),
  'chroma': array([0.5256063 , 0.5412709 , 0.5524142 , 0.56598276, 0.6012675 ,
         0.56337154, 0.40865704, 0.4210122 , 0.47188887, 0.51284426,
         0.51934123, 0.52042276], dtype=float32),
  'emotion': 'surprised'},
 {'mfccs': array([-4.9762607e+02,  3.3997051e+01,  4.

In [None]:
# Flatten the features and create a DataFrame
flattened_features = []
for item in extracted_features:
    features_dict = {'emotion': item['emotion']}
    # Flatten mfccs and add to dictionary
    for i, mfcc_value in enumerate(item['mfccs']):
        features_dict[f'mfcc_{i}'] = mfcc_value
    # Flatten chroma and add to dictionary
    for i, chroma_value in enumerate(item['chroma']):
        features_dict[f'chroma_{i}'] = chroma_value
    flattened_features.append(features_dict)

df_features = pd.DataFrame(flattened_features)

# Save the DataFrame to a CSV file
csv_filename = 'extracted_audio_features.csv'
df_features.to_csv(csv_filename, index=False)

print(f"Extracted features saved to {csv_filename}")

Extracted features saved to extracted_audio_features.csv


In [None]:
df_extracted=pd.read_csv("/content/extracted_audio_features.csv")
df_extracted

Unnamed: 0,emotion,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,...,chroma_2,chroma_3,chroma_4,chroma_5,chroma_6,chroma_7,chroma_8,chroma_9,chroma_10,chroma_11
0,angry,-636.48724,53.280937,5.460449,21.589674,10.845659,8.578912,3.416430,1.260278,4.551249,...,0.519848,0.550719,0.561074,0.563597,0.610142,0.569295,0.507510,0.553910,0.633410,0.565241
1,surprised,-455.37380,30.516335,4.952711,3.803937,-0.616876,2.392564,-1.100243,-2.322734,-3.357792,...,0.552414,0.565983,0.601267,0.563372,0.408657,0.421012,0.471889,0.512844,0.519341,0.520423
2,surprised,-497.62607,33.997050,4.876971,12.641810,1.329894,6.252095,1.439563,0.932855,-0.317947,...,0.584852,0.598310,0.584603,0.516144,0.509712,0.542692,0.534641,0.561271,0.610193,0.597798
3,happy,-484.29184,33.704037,-4.465340,1.582990,-0.680330,1.886181,-10.407587,-8.034545,-4.371917,...,0.426924,0.440919,0.482818,0.486777,0.440686,0.441244,0.478734,0.467584,0.455272,0.418882
4,sad,-579.20750,59.373756,-3.508263,17.097740,2.080728,9.371759,0.295555,-10.199896,2.266109,...,0.440219,0.535456,0.507718,0.416741,0.421550,0.526536,0.587132,0.477471,0.437559,0.386037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,happy,-630.88430,53.284170,-6.611326,22.070906,-1.626653,4.588445,-8.483583,0.009003,-1.741244,...,0.446802,0.453330,0.494336,0.610332,0.601271,0.552302,0.464193,0.445422,0.414330,0.402134
2876,sad,-715.00946,58.274570,13.990718,20.753004,6.839467,8.207243,-7.080389,-5.250118,0.712516,...,0.441957,0.513707,0.497012,0.508376,0.520377,0.575462,0.662095,0.585286,0.501075,0.358456
2877,disgust,-667.24830,69.397840,7.269443,15.230010,-0.112782,4.041139,-9.834456,-5.204443,-0.097808,...,0.429735,0.517270,0.537620,0.624759,0.557804,0.496696,0.469979,0.483323,0.494329,0.480290
2878,surprised,-599.15950,46.924374,-2.810279,10.725401,-4.349322,6.078587,-12.118999,-4.973942,-0.724555,...,0.436206,0.481801,0.509291,0.536966,0.485584,0.503607,0.562100,0.496679,0.438119,0.426763



Build a hybrid LSTM-Transformer model for speech emotion recognition using the extracted features from the CSV file "extracted_features.csv".

#----------------------------------------------------------------------

using a AST pre trained transformer model for SER

preparing dataset for training

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

x = np.array(df_features.drop('emotion', axis=1).values.tolist())
y = np.array(df_features.emotion.tolist())


label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

y_categorical = to_categorical(y)

print("Data prepared for model training.")
print("Shape of x:", x.shape)
print("Shape of y:", y.shape)
print("Shape of y_categorical:", y_categorical.shape)

Data prepared for model training.
Shape of x: (2880, 25)
Shape of y: (2880,)
Shape of y_categorical: (2880, 8)


Define model arch

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout

def create_model(input_data,num_classes):
  model=Sequential([
      Dense(256,activation='relu',input_shape=(input_data,)),
      Dropout(0.5),
      Dense(128,activation='relu'),
      Dropout(0.5),
      Dense(num_classes,activation='softmax')
  ])

  model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
  return model

## K-Fold Cross-Validation with Feature Scaling

Perform K-Fold cross-validation on the extracted features with feature scaling applied within each fold to evaluate the model's performance more robustly.

## Shows a MLP - multi layer perceptron

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input # Import Input layer

# Re-define create_model to be accessible in this cell
def create_model(input_dim, num_classes):
  model=Sequential([
      Input(shape=(input_dim,)), # Use Input layer as the first layer
      Dense(256,activation='relu'), # Remove input_shape from Dense layer
      Dropout(0.5),
      Dense(128,activation='relu'),
      Dropout(0.5),
      Dense(64, activation='relu'), # Added another Dense layer
      Dropout(0.5), # Added Dropout for the new layer
      Dense(num_classes,activation='softmax')
  ])

  model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
  return model

# Add the code to load and prepare the data here
# Assuming 'extracted_audio_features.csv' is available
df_features = pd.read_csv('extracted_audio_features.csv')

# Separate features (X) and labels (y)
X = np.array(df_features.drop('emotion', axis=1))
y = np.array(df_features['emotion'])

# Encode the emotion labels to integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Define the number of classes
num_classes = len(label_encoder.classes_)


# Define the number of folds for K-Fold cross-validation
num_folds = 5
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
accuracy_scores = []
loss_scores = []

print(f"Starting {num_folds}-Fold Cross-Validation with Feature Scaling...")

# --- K-Fold Loop with SCALING ---
for fold, (train_index, test_index) in enumerate(skf.split(X, y_encoded)): # Use y_encoded for stratified splitting
    print(f"--- Starting Fold {fold+1}/{num_folds} ---")

    # 1. Split data for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train_encoded, y_test_encoded = y_encoded[train_index], y_encoded[test_index]


    # 2. Apply Feature Scaling
    scaler = StandardScaler()
    # Fit the scaler ONLY on the training data
    X_train_scaled = scaler.fit_transform(X_train)
    # Apply the SAME scaling to the test data
    X_test_scaled = scaler.transform(X_test)

    # 3. One-hot encode labels
    y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
    y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)

    # 4. Create and train the model using the SCALED data
    model = create_model(X_train_scaled.shape[1], num_classes) # Use scaled data shape for input_dim
    history = model.fit(X_train_scaled, y_train_categorical,
                        epochs=50, # You can adjust the number of epochs per fold
                        batch_size=32,
                        verbose=0) # Set verbose to 0 to reduce output during training

    # 5. Evaluate the model on the SCALED test data
    loss, accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=0)

    print(f"Accuracy for Fold {fold+1}: {accuracy*100:.2f}%")
    print(f"Loss for Fold {fold+1}: {loss:.4f}")


    # 6. Save results
    accuracy_scores.append(accuracy)
    loss_scores.append(loss)

# Calculate and print the average performance across all folds
average_accuracy = np.mean(accuracy_scores)
average_loss = np.mean(loss_scores)
std_accuracy = np.std(accuracy_scores)


print("\n--- K-Fold Cross-Validation Results with Feature Scaling ---")
print(f"Average Accuracy: {average_accuracy*100:.2f}%")
print(f"Standard Deviation of Accuracy: {std_accuracy*100:.2f}%")
print(f"Average Loss: {average_loss:.4f}")

Starting 5-Fold Cross-Validation with Feature Scaling...
--- Starting Fold 1/5 ---
Accuracy for Fold 1: 72.40%
Loss for Fold 1: 0.8204
--- Starting Fold 2/5 ---
Accuracy for Fold 2: 71.35%
Loss for Fold 2: 0.8294
--- Starting Fold 3/5 ---
Accuracy for Fold 3: 72.22%
Loss for Fold 3: 0.8020
--- Starting Fold 4/5 ---
Accuracy for Fold 4: 69.44%
Loss for Fold 4: 0.8650
--- Starting Fold 5/5 ---
Accuracy for Fold 5: 69.27%
Loss for Fold 5: 0.8971

--- K-Fold Cross-Validation Results with Feature Scaling ---
Average Accuracy: 70.94%
Standard Deviation of Accuracy: 1.34%
Average Loss: 0.8428


----- pytorch code -------

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
!pip install -U transformers




In [None]:
#SER WITH AST IN PYTORCH
import os
import pandas as pd
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, TrainingArguments, Trainer, EvalPrediction
from datasets import Dataset as HFDataset
import evaluate



 #LOADING RAVDESS FILES


def create_ravdess_dataset(directory_path):
    emotion_map = {
        '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
        '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
    }
    dataset = []
    print(f"Scanning directory: {directory_path}")
    for subdir, _, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.wav'):
                parts = file.split('-')
                if len(parts) > 2:
                    emo_code = parts[2]
                    if emo_code in emotion_map:
                        emotion = emotion_map[emo_code]
                        full_path = os.path.join(subdir, file)
                        dataset.append({'path': full_path, 'emotion': emotion})
    print(f"Found {len(dataset)} audio files.")
    return pd.DataFrame(dataset)

ravdess_path = './ravdess-emotional-speech-audio'
df = create_ravdess_dataset(ravdess_path)

if df.empty:
    raise ValueError("No audio files found in the given path!")

# Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['emotion'])
num_classes = len(label_encoder.classes_)
print(f"Number of emotion classes: {num_classes}")



# FEATURE EXTRACTION (AST)

model_checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

def preprocess_function(batch):
    # Load audio file
    try:
        audio, sr = librosa.load(batch["path"], sr=feature_extractor.sampling_rate)
    except Exception as e:
        print(f"Error loading {batch['path']}: {e}")

        return {"input_values": np.zeros(160000), "label": -1}

    # Extract features
    inputs = feature_extractor(
        audio,
        sampling_rate=feature_extractor.sampling_rate,
        max_length=160000,
        truncation=True,
        padding="max_length"
    )
    batch["input_values"] = inputs["input_values"][0]
    batch["label"] = int(batch["label"])
    return batch


# MODEL SETUP

id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in id2label.items()}



#METRICS & K-FOLD CONFIG

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred: EvalPrediction):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric.compute(predictions=predictions, references=labels)
    return acc

# Define K-Fold settings
N_SPLITS = 5  # You can change this number
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Store results from each fold
all_fold_accuracies = []

for fold, (train_ids, test_ids) in enumerate(kfold.split(df)):
    print(f"\n--- FOLD {fold + 1}/{N_SPLITS} ---")

    # --- 5.1: Create fold-specific data ---
    train_df = df.iloc[train_ids]
    test_df = df.iloc[test_ids]

    # Convert pandas → Hugging Face Dataset
    train_dataset = HFDataset.from_pandas(train_df)
    test_dataset = HFDataset.from_pandas(test_df)

    print(f"Extracting features for Fold {fold + 1}...")
    train_dataset = train_dataset.map(preprocess_function, batched=False)
    test_dataset = test_dataset.map(preprocess_function, batched=False)

    columns_to_remove = [col for col in train_dataset.column_names if col not in ['label', 'input_values']]

    train_dataset = train_dataset.remove_columns(columns_to_remove)
    test_dataset = test_dataset.remove_columns(columns_to_remove)


    print(f"Train samples: {len(train_dataset)}, Test samples: {len(test_dataset)}")
    print("Feature extraction complete!")

    model = AutoModelForAudioClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_classes,
        label2id=label2id,
        id2label=id2label,
        ignore_mismatched_sizes=True
    )

    training_args = TrainingArguments(
        output_dir=f"./ast-ravdess-results-fold-{fold+1}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=10, # Number of epochs to train *per fold*
        weight_decay=0.01,
        logging_dir=f"./logs-fold-{fold+1}",
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy", # Make sure to set this
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    print(f"Training Fold {fold + 1}...")
    trainer.train()

    print(f"Evaluating Fold {fold + 1}...")
    results = trainer.evaluate()

    fold_accuracy = results['eval_accuracy']
    all_fold_accuracies.append(fold_accuracy)

    print(f"\n Fold {fold + 1} Test Accuracy: {fold_accuracy * 100:.2f}%")

    # Clean up to save space (optional)
    del model
    del trainer
    torch.cuda.empty_cache()

#results
mean_accuracy = np.mean(all_fold_accuracies)
std_accuracy = np.std(all_fold_accuracies)

print("\n--- K-Fold Cross-Validation Complete ---")
print(f"Accuracies across all {N_SPLITS} folds:")
for i, acc in enumerate(all_fold_accuracies):
    print(f"  Fold {i+1}: {acc * 100:.2f}%")

print(f"\n↗↗ Final Model Accuracy (Mean): {mean_accuracy * 100:.2f}%")
print(f"   Accuracy Standard Deviation: {std_accuracy * 100:.2f}%")

Scanning directory: ./ravdess-emotional-speech-audio
Found 2880 audio files.
Number of emotion classes: 8

--- FOLD 1/5 ---
Extracting features for Fold 1...


Map:   0%|          | 0/2304 [00:00<?, ? examples/s]

Map:   0%|          | 0/576 [00:00<?, ? examples/s]

Train samples: 2304, Test samples: 576
Feature extraction complete!


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([8]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training Fold 1...


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maadishbane23022005[0m ([33maadishbane23022005-sardar-vallabhbhai-national-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9097,0.836592,0.715278
2,0.3553,0.391262,0.883681
3,0.0781,0.379652,0.913194
4,0.0362,0.411383,0.928819
5,0.0006,0.229883,0.967014
6,0.0001,0.27257,0.954861
7,0.0,0.271887,0.954861
8,0.0,0.285435,0.961806
