In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the train and validation splits for all three views of the data.

# Emoticon view: input strings in 'input_emoticon', targets in 'label'.
train_emoticon_df = pd.read_csv("datasets/train/train_emoticon.csv")
train_emoticon_X, train_emoticon_Y = (
    train_emoticon_df[column].tolist() for column in ("input_emoticon", "label")
)

valid_emoticon_df = pd.read_csv("datasets/valid/valid_emoticon.csv")
valid_emoticon_X, valid_emoticon_Y = (
    valid_emoticon_df[column].tolist() for column in ("input_emoticon", "label")
)

# Text-sequence view: input strings in 'input_str', targets in 'label'.
train_seq_df = pd.read_csv("datasets/train/train_text_seq.csv")
train_seq_X, train_seq_Y = (
    train_seq_df[column].tolist() for column in ("input_str", "label")
)

valid_seq_df = pd.read_csv("datasets/valid/valid_text_seq.csv")
valid_seq_X, valid_seq_Y = (
    valid_seq_df[column].tolist() for column in ("input_str", "label")
)

# Feature view: .npz archives holding a 'features' array and a 'label' array.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code; keep it only while these files come from a trusted source.
train_feat = np.load("datasets/train/train_feature.npz", allow_pickle=True)
train_feat_X, train_feat_Y = train_feat["features"], train_feat["label"]

valid_feat = np.load("datasets/valid/valid_feature.npz", allow_pickle=True)
valid_feat_X, valid_feat_Y = valid_feat["features"], valid_feat["label"]

# Report how many samples each view contributes to the training split.
print(
    "Train dataset size: ",
    f"train_emoticon_X: {len(train_emoticon_X)} train_emoticon_Y: {len(train_emoticon_Y)}",
    f"train_seq_X: {len(train_seq_X)} train_seq_Y: {len(train_seq_Y)}",
    f"train_feat_X: {train_feat_X.shape} train_feat_Y: {train_feat_Y.shape}",
    sep="\n",
)

# Same report for the held-out split (labelled "Test" in the output, but the
# variables printed here are the validation data).
print(
    "\nTest dataset size: ",
    f"valid_emoticon_X: {len(valid_emoticon_X)}",
    f"valid_seq_X: {len(valid_seq_X)}",
    f"valid_feat_X: {valid_feat_X.shape}",
    sep="\n",
)

# One-hot encode the emoticon and text-sequence inputs.
# Every complete input string is treated as a single categorical value.
# handle_unknown='ignore' encodes categories not seen during fit as all zeros.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# OneHotEncoder expects 2-D input, so turn each list into an (n_samples, 1) column.
emoticon_train_col = np.array(train_emoticon_X)[:, np.newaxis]
emoticon_valid_col = np.array(valid_emoticon_X)[:, np.newaxis]
seq_train_col = np.array(train_seq_X)[:, np.newaxis]
seq_valid_col = np.array(valid_seq_X)[:, np.newaxis]

# Emoticon view: fit on the training split, then encode the validation split.
train_emoticon_encoded = onehot_encoder.fit_transform(emoticon_train_col).toarray()
valid_emoticon_encoded = onehot_encoder.transform(emoticon_valid_col).toarray()

# Text-sequence view: the same encoder object is re-fitted here, so from this
# point on `onehot_encoder` holds the text-sequence categories only.
train_seq_encoded = onehot_encoder.fit_transform(seq_train_col).toarray()
valid_seq_encoded = onehot_encoder.transform(seq_valid_col).toarray()

# Standardise the feature arrays.
# Each sample is flattened to a single row because StandardScaler works on
# 2-D (n_samples, n_features) input.
scaler = StandardScaler()

# Fit the scaling statistics on the training split only and apply them to it.
train_feat_scaled = scaler.fit_transform(train_feat_X.reshape(train_feat_X.shape[0], -1))

# Apply the training statistics to the validation split. The row count is taken
# from valid_feat_X itself, so the cell does not depend on any variable left
# over from an earlier kernel session.
valid_feat_scaled = scaler.transform(valid_feat_X.reshape(valid_feat_X.shape[0], -1))

# Build one design matrix per split by placing the three processed views
# side by side (column-wise): emoticon one-hot | text one-hot | scaled features.
train_X_combined = np.concatenate(
    [train_emoticon_encoded, train_seq_encoded, train_feat_scaled],
    axis=1,
)
# The emoticon labels are used as the target for the combined matrix.
train_Y_combined = np.array(train_emoticon_Y)

valid_X_combined = np.concatenate(
    [valid_emoticon_encoded, valid_seq_encoded, valid_feat_scaled],
    axis=1,
)
valid_Y_combined = np.array(valid_emoticon_Y)

# Fit a 100-tree random forest on the combined training matrix.
# random_state is fixed so repeated runs build the same forest.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    train_X_combined, train_Y_combined
)

# Predict on the held-out split. The variable is called `test_pred`, but the
# inputs are the validation data.
test_pred = rf_classifier.predict(valid_X_combined)

# Fraction of validation samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=valid_Y_combined, y_pred=test_pred)
print(f"Test Accuracy: {accuracy:.4f}")


Train dataset size: 
train_emoticon_X: 7080 train_emoticon_Y: 7080
train_seq_X: 7080 train_seq_Y: 7080
train_feat_X: (7080, 13, 768) train_feat_Y: (7080,)

Test dataset size: 
valid_emoticon_X: 489
valid_seq_X: 489
valid_feat_X: (489, 13, 768)
Test Accuracy: 0.9796


In [14]:
# Display the combined training matrix built above; NumPy abbreviates the
# repr of large arrays with "...", so only the corners are shown.
train_X_combined

array([[ 0.        ,  0.        ,  0.        , ..., -0.28983393,
        -0.24607322,  1.4753989 ],
       [ 0.        ,  0.        ,  0.        , ...,  1.64987433,
        -0.70957798,  0.2628164 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.54141045,
        -0.55918396,  0.31773353],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.7908113 ,
         1.15807533, -0.98032582],
       [ 0.        ,  0.        ,  0.        , ...,  0.64058501,
        -0.87167823, -0.86577237],
       [ 0.        ,  0.        ,  0.        , ..., -0.77007782,
         0.22331148, -1.24868262]])

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the train and validation splits for all three views of the data.

# Emoticon view: input strings in 'input_emoticon', targets in 'label'.
train_emoticon_df = pd.read_csv("datasets/train/train_emoticon.csv")
train_emoticon_X, train_emoticon_Y = (
    train_emoticon_df[column].tolist() for column in ("input_emoticon", "label")
)

valid_emoticon_df = pd.read_csv("datasets/valid/valid_emoticon.csv")
valid_emoticon_X, valid_emoticon_Y = (
    valid_emoticon_df[column].tolist() for column in ("input_emoticon", "label")
)

# Text-sequence view: input strings in 'input_str', targets in 'label'.
train_seq_df = pd.read_csv("datasets/train/train_text_seq.csv")
train_seq_X, train_seq_Y = (
    train_seq_df[column].tolist() for column in ("input_str", "label")
)

valid_seq_df = pd.read_csv("datasets/valid/valid_text_seq.csv")
valid_seq_X, valid_seq_Y = (
    valid_seq_df[column].tolist() for column in ("input_str", "label")
)

# Feature view: .npz archives holding a 'features' array and a 'label' array.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code; keep it only while these files come from a trusted source.
train_feat = np.load("datasets/train/train_feature.npz", allow_pickle=True)
train_feat_X, train_feat_Y = train_feat["features"], train_feat["label"]

valid_feat = np.load("datasets/valid/valid_feature.npz", allow_pickle=True)
valid_feat_X, valid_feat_Y = valid_feat["features"], valid_feat["label"]

# Report how many samples each view contributes to the training split.
print(
    "Train dataset size: ",
    f"train_emoticon_X: {len(train_emoticon_X)} train_emoticon_Y: {len(train_emoticon_Y)}",
    f"train_seq_X: {len(train_seq_X)} train_seq_Y: {len(train_seq_Y)}",
    f"train_feat_X: {train_feat_X.shape} train_feat_Y: {train_feat_Y.shape}",
    sep="\n",
)

# Same report for the held-out split (labelled "Test" in the output, but the
# variables printed here are the validation data).
print(
    "\nTest dataset size: ",
    f"valid_emoticon_X: {len(valid_emoticon_X)}",
    f"valid_seq_X: {len(valid_seq_X)}",
    f"valid_feat_X: {valid_feat_X.shape}",
    sep="\n",
)

# One-hot encode the emoticon and text-sequence inputs.
# Every complete input string is treated as a single categorical value.
# handle_unknown='ignore' encodes categories not seen during fit as all zeros.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# OneHotEncoder expects 2-D input, so turn each list into an (n_samples, 1) column.
emoticon_train_col = np.array(train_emoticon_X)[:, np.newaxis]
emoticon_valid_col = np.array(valid_emoticon_X)[:, np.newaxis]
seq_train_col = np.array(train_seq_X)[:, np.newaxis]
seq_valid_col = np.array(valid_seq_X)[:, np.newaxis]

# Emoticon view: fit on the training split, then encode the validation split.
train_emoticon_encoded = onehot_encoder.fit_transform(emoticon_train_col).toarray()
valid_emoticon_encoded = onehot_encoder.transform(emoticon_valid_col).toarray()

# Text-sequence view: the same encoder object is re-fitted here, so from this
# point on `onehot_encoder` holds the text-sequence categories only.
train_seq_encoded = onehot_encoder.fit_transform(seq_train_col).toarray()
valid_seq_encoded = onehot_encoder.transform(seq_valid_col).toarray()

# Standardise the feature arrays.
scaler = StandardScaler()

# Flatten every sample to a single row so the scaler receives 2-D
# (n_samples, n_features) input.
train_feat_flat = train_feat_X.reshape(len(train_feat_X), -1)
valid_feat_flat = valid_feat_X.reshape(len(valid_feat_X), -1)

# Fit the scaling statistics on the training split only ...
train_feat_scaled = scaler.fit_transform(train_feat_flat)

# ... and reuse them unchanged for the validation split.
valid_feat_scaled = scaler.transform(valid_feat_flat)

# Build one design matrix per split by placing the three processed views
# side by side (column-wise): emoticon one-hot | text one-hot | scaled features.
train_X_combined = np.concatenate(
    [train_emoticon_encoded, train_seq_encoded, train_feat_scaled],
    axis=1,
)
# The emoticon labels are used as the target for the combined matrix.
train_Y_combined = np.array(train_emoticon_Y)

valid_X_combined = np.concatenate(
    [valid_emoticon_encoded, valid_seq_encoded, valid_feat_scaled],
    axis=1,
)
valid_Y_combined = np.array(valid_emoticon_Y)

# Fit a linear-kernel support vector classifier on the combined training matrix.
svm_classifier = SVC(kernel='linear', random_state=42).fit(
    train_X_combined, train_Y_combined
)

# Predict on the held-out split. The variable is called `test_pred`, but the
# inputs are the validation data.
test_pred = svm_classifier.predict(valid_X_combined)

# Fraction of validation samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=valid_Y_combined, y_pred=test_pred)
print(f"Test Accuracy: {accuracy:.4f}")


Train dataset size: 
train_emoticon_X: 7080 train_emoticon_Y: 7080
train_seq_X: 7080 train_seq_Y: 7080
train_feat_X: (7080, 13, 768) train_feat_Y: (7080,)

Test dataset size: 
valid_emoticon_X: 489
valid_seq_X: 489
valid_feat_X: (489, 13, 768)
Test Accuracy: 0.9796
