In [1]:
import os
import cv2
import numpy as np
import librosa
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC  # Example classifier (you can use others like RandomForest, XGBoost)
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import random
import cv2
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
import os

In [2]:
# Load the CSV file
train_df = pd.read_csv('/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_train/train_emotion.csv', encoding='ISO-8859-1')

# Define path to video clips
video_dir = '/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_train/train_data'

# Function to get video file path from IDs
def get_video_clip_path(row):
    # Get the Dialogue_ID and Utterance_ID for each row
    dialogue_id = row['Dialogue_ID']
    utterance_id = row['Utterance_ID']

    filename = f"dia{dialogue_id}_utt{utterance_id}.mp4"
    
    # Join the directory path with the filename
    return os.path.join(video_dir, filename)

# Apply the function to get file paths for each sampled clip
train_df['video_clip_path'] = train_df.apply(get_video_clip_path, axis=1)

# Check sample paths
print(train_df.head())


   Sr No.                                          Utterance          Speaker  \
0       8  But therell be perhaps 30 people under you so...  The Interviewer   
1      12  All right then, well have a definite answer f...  The Interviewer   
2      32                                  Can I get a beer.         Chandler   
3      40           He was with her when he wrote this poem.           Phoebe   
4      42  Now that I've touched you, you seem emptier st...           Phoebe   

   Emotion  Dialogue_ID  Utterance_ID  Season  Episode     StartTime  \
0  neutral            0             7       8       21  00:16:48,800   
1  neutral            0            11       8       21  00:17:05,025   
2  neutral            2             8       3        6   0:06:07,367   
3  neutral            3             3       3       12  00:10:21,078   
4  neutral            3             5       3       12  00:10:26,667   

        EndTime                                    video_clip_path  
0  00:16:54

In [3]:
def extract_frames_from_video(video_path, frame_rate=30):
    cap = cv2.VideoCapture(video_path)  # Open video file
    frames = []
    frame_count = 0

    while True:
        ret, frame = cap.read()  # Read a frame
        if not ret:
            break
        
        # Capture frame at specific interval (frame_rate)
        if frame_count % frame_rate == 0:
            frames.append(frame)  # Store frame

        frame_count += 1

    cap.release()  # Release the video capture object
    return frames

# Load ResNet50 model (without the top layer, only the convolutional base)
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Function to extract features from a single frame
def extract_features_from_frame(frame):
    # Resize frame to 224x224 for ResNet input
    img = cv2.resize(frame, (224, 224))
    img = image.img_to_array(img)         # Convert image to array
    img = np.expand_dims(img, axis=0)    # Add batch dimension
    img = preprocess_input(img)          # Preprocess image as per ResNet requirements
    features = model.predict(img)        # Extract features from the frame
    return features.flatten()            # Flatten the features into a 1D vector

# Function to extract features from all frames in a video
def extract_features_from_video(video_path, frame_rate=30):
    frames = extract_frames_from_video(video_path, frame_rate)
    features = [extract_features_from_frame(frame) for frame in frames]
    return features

# Function to get the average feature vector for the video (averaging across all frames)
def extract_avg_features_from_video(video_path, frame_rate=30):
    features = extract_features_from_video(video_path, frame_rate)
    if len(features) > 0:
        avg_features = np.mean(features, axis=0)  # Average features over all frames
    else:
        avg_features = np.zeros((2048,))  # In case no frames were extracted, return a zero vector
    return avg_features

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
# Apply the function to extract features for each video clip
train_df['features'] = train_df['video_clip_path'].apply(lambda x: extract_avg_features_from_video(x, frame_rate=30))

# Check the features extracted for a sample row
print(train_df[['video_clip_path', 'features']].head())

I0000 00:00:1731322907.966543      66 service.cc:145] XLA service 0x78ed340020d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1731322907.966601      66 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1731322907.966608      66 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


I0000 00:00:1731322911.198295      66 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23

In [5]:
# Convert the list of features to a numpy array
X = np.array(train_df['features'].tolist())
print(train_df.head())
# Check the shape of the extracted features
print(X.shape)

   Sr No.                                          Utterance          Speaker  \
0       8  But therell be perhaps 30 people under you so...  The Interviewer   
1      12  All right then, well have a definite answer f...  The Interviewer   
2      32                                  Can I get a beer.         Chandler   
3      40           He was with her when he wrote this poem.           Phoebe   
4      42  Now that I've touched you, you seem emptier st...           Phoebe   

   Emotion  Dialogue_ID  Utterance_ID  Season  Episode     StartTime  \
0  neutral            0             7       8       21  00:16:48,800   
1  neutral            0            11       8       21  00:17:05,025   
2  neutral            2             8       3        6   0:06:07,367   
3  neutral            3             3       3       12  00:10:21,078   
4  neutral            3             5       3       12  00:10:26,667   

        EndTime                                    video_clip_path  \
0  00:16:5

In [6]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np

# Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Ensure the model is in evaluation mode
bert_model.eval()

# Function to extract text features (subtitles) using BERT
def extract_text_features(text):
    # Tokenize the input text and convert it to BERT's input format
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Forward pass through BERT
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Get the embeddings of the [CLS] token (first token) for the entire sequence
    cls_embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # Shape: (768,)
    return cls_embeddings

# Apply the feature extraction to all subtitles in the DataFrame
def extract_all_text_features(df):
    text_features = []
    for text in df['Utterance']:
        features = extract_text_features(text)
        text_features.append(features)
    return np.array(text_features)

# Extract features for all samples in train_df
text_features = extract_all_text_features(train_df)

# Print the shape of the extracted text features
print(f"Extracted text features shape: {text_features.shape}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Extracted text features shape: (1000, 768)


In [7]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pandas as pd

# Early Fusion: Concatenate video and text features
def early_fusion(video_features, text_features):
    # Concatenate video and text features for early fusion
    return np.concatenate([video_features, text_features], axis=1)

# Ensure both video and text feature arrays have the same number of samples
assert X.shape[0] == text_features.shape[0], "Number of samples in video and text features must match."

# Apply early fusion on the complete dataset
X_fusion = early_fusion(X, text_features)

# Normalize features across the entire dataset
scaler = StandardScaler()
X_fusion = scaler.fit_transform(X_fusion)

# Train the classifier on the entire dataset with optimized SVC parameters
classifier = SVC(kernel='rbf', C=1.5, gamma='scale', decision_function_shape='ovr', random_state=42)
classifier.fit(X_fusion, train_df['Emotion'])  # Train using the full dataset

print("Model training complete. Ready for prediction on test data.")


Model training complete. Ready for prediction on test data.


In [8]:
import os
import numpy as np
import pandas as pd

# Load test dataset
df = pd.read_csv('/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_test/test_emotion.csv',encoding='ISO-8859-1')
video_dir = '/kaggle/input/ml-hackathon-ec-campus-set-2/set_2_test/test_data'

# Function to get video file path from Dialogue_ID and Utterance_ID
def get_video_clip_path(row):
    dialogue_id = row['Dialogue_ID']
    utterance_id = row['Utterance_ID']
    filename = f"dia{dialogue_id}_utt{utterance_id}.mp4"
    return os.path.join(video_dir, filename)

# Add video file paths to DataFrame
df['video_clip_path'] = df.apply(get_video_clip_path, axis=1)

# Extract video features for test data
def extract_test_video_features(df):
    features = []
    for path in df['video_clip_path']:
        features.append(extract_avg_features_from_video(path, frame_rate=30))  # replace with your extraction function
    return np.array(features)

# Extract text features for test data
def extract_test_text_features(df):
    text_features = []
    for text in df['Utterance']:
        text_features.append(extract_text_features(text))  # replace with your extraction function
    return np.array(text_features)

# Extract features from the test set
test_video_features = extract_test_video_features(df)
test_text_features = extract_test_text_features(df)

# Apply early fusion on test data
def early_fusion(video_features, text_features):
    return np.concatenate([video_features, text_features], axis=1)

# Ensure features have the same number of samples
assert test_video_features.shape[0] == test_text_features.shape[0], "Mismatch in sample numbers"

# Apply early fusion to test data
X_test_fusion = early_fusion(test_video_features, test_text_features)

# Normalize test features using the same scaler fitted on training data
X_test_fusion = scaler.transform(X_test_fusion)

# Use the trained classifier to predict labels for test data
y_test_pred = classifier.predict(X_test_fusion)

# Prepare submission file
submission_df = pd.DataFrame({
    'Sr No.': df["Sr No."],  # Use 'Sr No.' from test file
    'Emotion': y_test_pred
})

# Save predictions to CSV in the required format
submission_df.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21