In [1]:
import pandas as pd

# Paths to the three task_informative_text_img split files (train / dev / test)
train_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_train.tsv'
dev_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_dev.tsv'
test_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_test.tsv'

# Parse each tab-separated file into its own DataFrame, in train / dev / test order
train_data, dev_data, test_data = (
    pd.read_csv(split_path, sep='\t')
    for split_path in (train_file_path, dev_file_path, test_file_path)
)

# Cell result: a tuple with the head() preview of every split
train_data.head(), dev_data.head(), test_data.head()

(             event_name            tweet_id              image_id  \
 0  california_wildfires  917791291823591425  917791291823591425_0   
 1  california_wildfires  917791291823591425  917791291823591425_1   
 2  california_wildfires  917793137925459968  917793137925459968_0   
 3  california_wildfires  917793137925459968  917793137925459968_1   
 4  california_wildfires  917793137925459968  917793137925459968_2   
 
                                           tweet_text  \
 0  RT @Cal_OES: PLS SHARE: We’re capturing wild...   
 1  RT @Cal_OES: PLS SHARE: We’re capturing wild...   
 2  RT @KAKEnews: California wildfires destroy mor...   
 3  RT @KAKEnews: California wildfires destroy mor...   
 4  RT @KAKEnews: California wildfires destroy mor...   
 
                                                image            label  \
 0  data_image/california_wildfires/10_10_2017/917...      informative   
 1  data_image/california_wildfires/10_10_2017/917...  not_informative   
 2  data_ima

In [3]:
# Row count of every split, in train / dev / test order
train_count, dev_count, test_count = map(len, (train_data, dev_data, test_data))

# Combined size of the three splits
total_entries = sum((train_count, dev_count, test_count))

# Report the combined total first, then each split on its own line
print(
    f"Total entries in all datasets: {total_entries}",
    f"Total entries in train datasets: {train_count}",
    f"Total entries in validation datasets: {dev_count}",
    f"Total entries in test datasets: {test_count}",
    sep="\n",
)


Total entries in all datasets: 18082
Total entries in train datasets: 13608
Total entries in validation datasets: 2237
Total entries in test datasets: 2237


In [5]:
# Frequency table of the image labels, built once per split
train_image_label_counts = train_data['label_image'].value_counts()
dev_image_label_counts = dev_data['label_image'].value_counts()
test_image_label_counts = test_data['label_image'].value_counts()

# Read both classes out of each table; .get(..., 0) falls back to 0 when a class never occurs
train_informative = train_image_label_counts.get('informative', 0)
train_non_informative = train_image_label_counts.get('not_informative', 0)

dev_informative = dev_image_label_counts.get('informative', 0)
dev_non_informative = dev_image_label_counts.get('not_informative', 0)

test_informative = test_image_label_counts.get('informative', 0)
test_non_informative = test_image_label_counts.get('not_informative', 0)

# One summary line per split
print(
    f"Training Set - Informative: {train_informative}, Non-Informative: {train_non_informative}",
    f"Validation Set - Informative: {dev_informative}, Non-Informative: {dev_non_informative}",
    f"Test Set - Informative: {test_informative}, Non-Informative: {test_non_informative}",
    sep="\n",
)



Training Set - Informative: 7059, Non-Informative: 6549
Validation Set - Informative: 1164, Non-Informative: 1073
Test Set - Informative: 1151, Non-Informative: 1086


In [6]:
# Training split: tally the text labels once, then pick out both classes (0 if a class is absent)
train_text_label_counts = train_data['label_text'].value_counts()
train_text_informative, train_text_non_informative = (
    train_text_label_counts.get('informative', 0),
    train_text_label_counts.get('not_informative', 0),
)

# Validation split
dev_text_label_counts = dev_data['label_text'].value_counts()
dev_text_informative, dev_text_non_informative = (
    dev_text_label_counts.get('informative', 0),
    dev_text_label_counts.get('not_informative', 0),
)

# Test split
test_text_label_counts = test_data['label_text'].value_counts()
test_text_informative, test_text_non_informative = (
    test_text_label_counts.get('informative', 0),
    test_text_label_counts.get('not_informative', 0),
)

# Per-class totals over the three splits
total_text_informative = sum((train_text_informative, dev_text_informative, test_text_informative))
total_text_non_informative = sum(
    (train_text_non_informative, dev_text_non_informative, test_text_non_informative)
)

# One summary line per split, followed by the combined totals
print(
    f"Training Set - Informative: {train_text_informative}, Non-Informative: {train_text_non_informative}",
    f"Validation Set - Informative: {dev_text_informative}, Non-Informative: {dev_text_non_informative}",
    f"Test Set - Informative: {test_text_informative}, Non-Informative: {test_text_non_informative}",
    f"Total - Informative: {total_text_informative}, Non-Informative: {total_text_non_informative}",
    sep="\n",
)


Training Set - Informative: 9638, Non-Informative: 3970
Validation Set - Informative: 1612, Non-Informative: 625
Test Set - Informative: 1612, Non-Informative: 625
Total - Informative: 12862, Non-Informative: 5220


In [7]:
# Function to calculate similarity and dissimilarity
def calculate_similarity(data):
    """Count rows whose text and image labels agree or disagree.

    Args:
        data: DataFrame with 'label_text' and 'label_image' columns.

    Returns:
        A ``(similar, dissimilar)`` pair. ``similar`` counts rows where both
        columns are 'informative' or both are 'not_informative'; ``dissimilar``
        counts rows where one column is 'informative' and the other is
        'not_informative'. Rows holding any other value in either column are
        counted in neither.
    """
    text_labels = data['label_text']
    image_labels = data['label_image']

    # Boolean masks for each (modality, class) combination
    text_pos = text_labels == 'informative'
    text_neg = text_labels == 'not_informative'
    image_pos = image_labels == 'informative'
    image_neg = image_labels == 'not_informative'

    # Agreement: both modalities carry the same class
    similar = (text_pos & image_pos).sum() + (text_neg & image_neg).sum()

    # Disagreement: the two modalities carry opposite classes
    dissimilar = (text_pos & image_neg).sum() + (text_neg & image_pos).sum()

    return similar, dissimilar

# Run the tally on every split; each call yields a (similar, dissimilar) pair
(
    (train_similar, train_dissimilar),
    (dev_similar, dev_dissimilar),
    (test_similar, test_dissimilar),
) = map(calculate_similarity, (train_data, dev_data, test_data))

# Combined counts over the three splits
total_similar = sum((train_similar, dev_similar, test_similar))
total_dissimilar = sum((train_dissimilar, dev_dissimilar, test_dissimilar))

# One summary line per split, followed by the combined totals
print(
    f"Training Set - Similar: {train_similar}, Dissimilar: {train_dissimilar}",
    f"Validation Set - Similar: {dev_similar}, Dissimilar: {dev_dissimilar}",
    f"Test Set - Similar: {test_similar}, Dissimilar: {test_dissimilar}",
    f"Total - Similar: {total_similar}, Dissimilar: {total_dissimilar}",
    sep="\n",
)


Training Set - Similar: 9601, Dissimilar: 4007
Validation Set - Similar: 1573, Dissimilar: 664
Test Set - Similar: 1534, Dissimilar: 703
Total - Similar: 12708, Dissimilar: 5374


In [1]:
import pandas as pd

# Where the train / dev / test split files of the informative task live
train_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_train.tsv'
dev_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_dev.tsv'
test_file_path = 'crisismmd_datasplit_all/crisismmd_datasplit_all/task_informative_text_img_test.tsv'

# Read the tab-separated files into DataFrames, keeping train / dev / test order
train_data, dev_data, test_data = [
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (train_file_path, dev_file_path, test_file_path)
]

# Function to calculate similar and dissimilar labels for informative and non-informative
def calculate_similarity(data):
    """Summarise how the text and image labels of each row relate.

    Args:
        data: DataFrame with 'label_text' and 'label_image' columns.

    Returns:
        dict with six counts:
            "similar": rows where label_text == label_image.
            "dissimilar": rows where label_text != label_image.
            "informative_similar": both labels are 'informative'.
            "non_informative_similar": both labels are 'not_informative'.
            "informative_dissimilar": text is 'informative', image is
                'not_informative'.
            "non_informative_dissimilar": text is 'not_informative', image is
                'informative'.
    """
    text_labels = data['label_text']
    image_labels = data['label_image']

    # Per-class masks, reused by the four-way breakdown below
    text_is_informative = text_labels == 'informative'
    text_is_not_informative = text_labels == 'not_informative'
    image_is_informative = image_labels == 'informative'
    image_is_not_informative = image_labels == 'not_informative'

    return {
        # Overall agreement / disagreement, whatever the label values are
        "similar": (text_labels == image_labels).sum(),
        "dissimilar": (text_labels != image_labels).sum(),
        # Agreement split by the shared class
        "informative_similar": (text_is_informative & image_is_informative).sum(),
        "non_informative_similar": (text_is_not_informative & image_is_not_informative).sum(),
        # Disagreement split by the text label's class
        "informative_dissimilar": (text_is_informative & image_is_not_informative).sum(),
        "non_informative_dissimilar": (text_is_not_informative & image_is_informative).sum(),
    }

# Tally label agreement for each split (one result dict per split)
train_results, dev_results, test_results = (
    calculate_similarity(split) for split in (train_data, dev_data, test_data)
)

# Add the three splits together, counter by counter
overall_results = {
    counter: train_results[counter] + dev_results[counter] + test_results[counter]
    for counter in train_results
}

# Print the same three-line report for every split, then for the combined totals;
# every heading after the first starts with a newline so the reports are separated by a blank line
for heading, results in (
    ("Training Set Results:", train_results),
    ("\nValidation Set Results:", dev_results),
    ("\nTest Set Results:", test_results),
    ("\nOverall Results:", overall_results),
):
    print(heading)
    print(f"Similar: {results['similar']}, Dissimilar: {results['dissimilar']}")
    print(f"Informative Similar: {results['informative_similar']}, Non-informative Similar: {results['non_informative_similar']}")
    print(f"Informative Dissimilar: {results['informative_dissimilar']}, Non-informative Dissimilar: {results['non_informative_dissimilar']}")


Training Set Results:
Similar: 9601, Dissimilar: 4007
Informative Similar: 6345, Non-informative Similar: 3256
Informative Dissimilar: 3293, Non-informative Dissimilar: 714

Validation Set Results:
Similar: 1573, Dissimilar: 664
Informative Similar: 1056, Non-informative Similar: 517
Informative Dissimilar: 556, Non-informative Dissimilar: 108

Test Set Results:
Similar: 1534, Dissimilar: 703
Informative Similar: 1030, Non-informative Similar: 504
Informative Dissimilar: 582, Non-informative Dissimilar: 121

Overall Results:
Similar: 12708, Dissimilar: 5374
Informative Similar: 8431, Non-informative Similar: 4277
Informative Dissimilar: 4431, Non-informative Dissimilar: 943
