In [24]:
def custom_train_test_split(df, test_size=0.2, random_state=None):
    np.random.seed(random_state)
    
    # Extract features and labels
    X = df[['ade', 'soc_code']]
    y = df['label']
    
    # Identify classes and their counts
    classes, counts = np.unique(y, return_counts=True)
    
    # Identify small classes
    small_classes = classes[counts < 5]
    
    # Initialize lists for train and test sets
    X_train_list = []
    y_train_list = []
    X_test_list = []
    y_test_list = []
    train_indices = []
    test_indices = []
    
    # Handle small classes separately
    for cls in small_classes:
        cls_mask = (y == cls)
        cls_X = X[cls_mask]
        cls_y = y[cls_mask]
        cls_idx = df.index[cls_mask].tolist()
        
        if len(cls_X) == 1:
            # If only one instance, put it in test set
            test_indices.append(cls_idx[0])
        else:
            # Randomly choose one instance for testing
            test_idx = np.random.choice(len(cls_X))
            test_indices.append(cls_idx[test_idx])
            
            # Remaining instances go to training
            train_indices.extend(np.delete(cls_idx, test_idx))
    
    # Combine the small class data into test and train sets
    test_indices = np.array(test_indices)
    train_indices = np.array(train_indices)
    
    X_test = df.loc[test_indices]
    y_test = X_test['label']
    
    X_train = df.loc[train_indices]
    y_train = X_train['label']
    
    # Handle large classes with stratified split
    large_class_mask = ~np.isin(y, small_classes)
    X_large = X[large_class_mask]
    y_large = y[large_class_mask]
    
    X_train_large, X_test_large, y_train_large, y_test_large = train_test_split(
        X_large, y_large, test_size=test_size, random_state=random_state, stratify=y_large
    )
    
    # Combine large class data with the small class data
    X_train = pd.concat([X_train, X_train_large], axis=0)
    y_train = pd.concat([y_train, y_train_large], axis=0)
    
    X_test = pd.concat([X_test, X_test_large], axis=0)
    y_test = pd.concat([y_test, y_test_large], axis=0)
    
    return X_train, X_test, y_train, y_test

In [21]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
#Read data from git:
#https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADENormalization/Data/CADEC/3.csv
# URL of the CSV file
cadec_csv_url = "https://raw.githubusercontent.com/FANMISUA/TweetAENormalization/main/ADENormalization/Data/CADEC/3.csv"
# read data from smm4h
smm4h_csv_url = "https://raw.githubusercontent.com/FANMISUA/ADE_Norm/main/Data/smm4h_soc.tsv"

allSMM4H = [10037175, 10018065,10029205, 10017947, 10028395, 10022891, 10027433, 10040785, 10038738, 10022117, 10015919, 10038604, 10047065, 
            10021428,10041244, 10007541, 10038359, 10021881, 10013993, 10019805, 10042613, 10029104, 10077536, 10010331, 10014698]

label_dict = {
    10037175: 0,
    10018065: 1,
    10029205: 2,
    10017947: 3,
    10028395: 4,
    10022891: 5,
    10027433: 6,
    10040785: 7,
    10038738: 8,
    10022117: 9,
    10015919: 10,
    10038604: 11,
    10047065: 12,
    10021428: 13,
    10041244: 14,
    10007541: 15,
    10038359: 16,
    10021881: 17,
    10013993: 18,
    10019805: 19,
    10042613: 20,
    10029104: 21,
    10077536: 22,
    10010331: 23,
    10014698: 24
}


# Read the CSV file into a pandas DataFrame
column_names = ["ade", "soc_code"]
smm4h_all = pd.read_csv(smm4h_csv_url,names=column_names, sep = '\t', header=None)
print("smm4h data:",smm4h_all.shape)

smm4h_all['soc_code'] = pd.to_numeric(smm4h_all['soc_code'], errors='coerce').astype('Int64')
smm4h_all = smm4h_all[smm4h_all['soc_code'] != 0]

smm4h_unique = smm4h_all.drop_duplicates(subset='ade')

print("smm4h data after filtering:",smm4h_all.shape)
smm4h_soc_code_counts = smm4h_unique['soc_code'].value_counts()
# Sort the counts from high to low and print the result
print("SOC count in SMM4H: ",smm4h_soc_code_counts)
# Filter DataFrame
smm4h_filtered_data3 = smm4h_unique[smm4h_unique['soc_code'].isin(allSMM4H)]
# filtered_data6 = cadec_unique[cadec_unique['soc_code'].isin(top6SMM4H)]

# Select only the Term and SOC columns
allinSMM4H = smm4h_filtered_data3[['ade', 'soc_code']]
# CADECtop6inSMM4H = filtered_data6[['ade', 'soc_code']]

# Read the CSV file into a pandas DataFrame
column_names = ["TT", "llt_code", "ade", "soc_code"]
cadec_all = pd.read_csv(cadec_csv_url,names=column_names, header=None)

# Remove duplicate rows based on the 'ade' column
cadec_unique = cadec_all.drop_duplicates(subset='ade')

# Display the resulting DataFrame
# print("clean cadec data:",cadec_unique.shape)
# Count occurrences of each 'soc_code'
cadec_soc_code_counts = cadec_unique['soc_code'].value_counts()
# Sort the counts from high to low and print the result
print("SOC count in CADEC: ",cadec_soc_code_counts)


# Filter DataFrame
cadec_filtered_data3 = cadec_unique[cadec_unique['soc_code'].isin(allSMM4H)]
# filtered_data6 = cadec_unique[cadec_unique['soc_code'].isin(top6SMM4H)]

# Select only the Term and SOC columns
CADECallinSMM4H = cadec_filtered_data3[['ade', 'soc_code']]
# CADECtop6inSMM4H = filtered_data6[['ade', 'soc_code']]


# For SMM4H data
df1 = allinSMM4H.copy()
df1.loc[:, 'label'] = df1['soc_code'].map(label_dict)

# For CADEC data
df2 = CADECallinSMM4H.copy()
df2.loc[:, 'label'] = df2['soc_code'].map(label_dict)

print("SMM4H :",df1)
print("CADEC :",df2)

#smm4h data
df = df1


smm4h data: (1712, 2)
smm4h data after filtering: (1710, 2)
SOC count in SMM4H:  soc_code
10037175    287
10018065    235
10029205    212
10017947     63
10028395     58
10022891     54
10027433     48
10040785     28
10038738     22
10022117     16
10015919     16
10038604     10
10047065     10
10021428      8
10041244      7
10007541      7
10038359      6
10021881      5
10013993      4
10019805      2
10042613      2
10029104      2
10077536      1
10010331      1
10014698      1
Name: count, dtype: Int64
SOC count in CADEC:  soc_code
10028395    962
10018065    654
10037175    401
10017947    300
10029205    286
10040785    184
10007541     92
10038738     91
10022891     82
10015919     67
10038604     59
10038359     50
10022117     35
10047065     25
10013993     16
10019805     15
10041244      7
10027433      6
10021881      5
10021428      4
10014698      3
10005329      3
10029104      1
Name: count, dtype: int64
SMM4H :                             ade  soc_code  label
1  

In [6]:
print(df)

                            ade  soc_code  label
1                     allergies  10021428     13
2               HURT YOUR Liver  10019805     19
3                            AD  10037175      0
4                         focus  10029205      2
5                          died  10018065      1
...                         ...       ...    ...
1703                 chest hurt  10018065      1
1704   got ten minutes of sleep  10037175      0
1706                  Nosebleed  10038738      8
1708  never have another orgasm  10037175      0
1710        gain so much weight  10022891      5

[1105 rows x 3 columns]


In [27]:
    X_train, X_val, y_train, y_val = custom_train_test_split(df, test_size=0.2, random_state=2)
    df['data_type'] = ['not_set'] * df.shape[0]
    df.loc[X_train, 'data_type'] = 'train'
    df.loc[X_val, 'data_type'] = 'val'
    logger.info(df.groupby(['soc_code', 'label', 'data_type']).count())
    print(df.groupby(['soc_code', 'label', 'data_type']).count())

KeyError: "None of [Index([                              ('a', 'd', 'e'),\n            ('s', 'o', 'c', '_', 'c', 'o', 'd', 'e'),\n                           ('l', 'a', 'b', 'e', 'l'),\n       ('d', 'a', 't', 'a', '_', 't', 'y', 'p', 'e')],\n      dtype='object')] are in the [index]"

In [11]:
print(df.label.values)

[13 19  0 ...  8  0  5]
