<a href="https://colab.research.google.com/github/BrianGisemba/MENTAL-HEALTH-TWEETS-CLASSIFICATION/blob/main/Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Read the original tweets CSV into `df` and preview its first rows
import pandas as pd
df = pd.read_csv(filepath_or_buffer="/content/MentalHealth_orig.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
0,0,shock as a young man climbs on top of a hospit...,"nairobi, kenya",4,10
1,1,people tend to hide depression under drinking ...,"nairobi, kenya",4,10
2,2,lily_nganga depression,nairobi,4,9
3,3,itumbi has subjected bloggers in to depression...,"nairobi, kenya",4,8
4,4,mental health isnt just anxiety amp depression...,"nairobi, kenya",4,8


In [None]:
%%capture
!pip install nlpaug
!pip install transformers

In [None]:
# Loading the required augmentation Libraries
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
from tqdm import tqdm
from sklearn.utils import shuffle

from nlpaug.util import Action

In [None]:
# Split into train / validation sets (80/20), stratified on the class label
from sklearn.model_selection import train_test_split
# A fixed random_state makes the split identical on every Restart & Run All.
train, valid = train_test_split(
    df, test_size=0.20, stratify=df['disorder'], random_state=42
)
train.shape, valid.shape

((604, 5), (152, 5))

In [None]:
# Count the training rows per disorder class; these counts are used to decide
# how many augmented rows each class needs.
train['disorder'].value_counts()

4    282
0    222
7     47
2     14
6     11
1     11
3     10
5      7
Name: disorder, dtype: int64

In [None]:
# Take the first training tweet as a sample for checking augmentation quality.
text = train['tweet'].iloc[0]
text

'zablonorina1 hence the name performance anxiety'

In [None]:
# ContextualWordEmbsAug: word-level augmenter driven by contextual word embeddings.
# Here it is built on 'bert-base-uncased' with action="insert"; compare the
# original and augmented lines printed by this cell to see the inserted words.
# `aug` is a notebook-level object: the augment_text cells below reuse it and
# overwrite its `aug_p` attribute.
# NOTE(review): depending on the installed nlpaug version, augment() may return
# a list rather than a str — confirm.

aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action="insert")
augmented_text = aug.augment(text)


print('Original text \n',text,'\n Augmented text\n', augmented_text)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Original text 
 zablonorina1 hence the name performance anxiety 
 Augmented text
 zablonorina1 ( hence inspiring the name performance group anxiety


In [None]:
# Creating a deep copy of the dataset
# (the assignment must start at column 0: an indented first statement in a cell
# raises IndentationError)
df1 = df.copy(deep=True)

In [None]:
import numpy as np
#For anxiety, class = 0,

# Creating augmented text data to increase our training dataset by 78 entries

def augment_text(df1, samples=78, pr=0.2, label=0):
    """Append `samples` augmented tweets of one class to `df1` and shuffle the result.

    Rows whose 'disorder' equals `label` are drawn at random (with replacement),
    each drawn tweet is passed through the notebook-level augmenter `aug`, and the
    generated texts are appended as new rows that carry only 'tweet' and
    'disorder' (every other column is NaN on those rows).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with at least the 'tweet' and 'disorder' columns.
    samples : int, default 78
        Number of augmented rows to generate.
    pr : float, default 0.2
        Value assigned to `aug.aug_p` before augmenting.
    label : int, default 0
        'disorder' value to sample from; also written on the new rows.

    Returns
    -------
    pandas.DataFrame
        `df1` plus the new rows, index reset, then passed through `shuffle`.

    Raises
    ------
    ValueError
        If `df1` has no rows with 'disorder' equal to `label`.
    """
    aug.aug_p = pr  # note: this mutates the shared notebook-level augmenter

    # Source tweets of the class being augmented.
    df_n = df1[df1['disorder'] == label].reset_index(drop=True)
    if df_n.empty:
        raise ValueError(f"no rows with disorder == {label!r} to augment")

    new_text = []
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        text = df_n.iloc[i]['tweet']
        augmented_text = aug.augment(text)
        # Recent nlpaug releases return a list even for a single input string;
        # unwrap it so the 'tweet' column keeps holding plain strings.
        if isinstance(augmented_text, list):
            augmented_text = augmented_text[0] if augmented_text else text
        new_text.append(augmented_text)

    new = pd.DataFrame({'tweet': new_text, 'disorder': label})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df1 = shuffle(pd.concat([df1, new]).reset_index(drop=True))
    return df1
   
# Run the augmentation defined in this cell on the training split, then report
# the new shape and preview the first rows.
train = augment_text(df1=train)
print(train.shape, '\n\n')
train.head(n=5)

100%|██████████| 78/78 [01:38<00:00,  1.26s/it]

(682, 5) 







Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
302,161.0,oyondi_malik being a youth in this country is ...,"nairobi, kenya",4,13.0
155,49.0,dralfredmutua i can see depression is killing...,"nairobi, kenya",4,5.0
237,439.0,anxiety can create a lack of selfesteem if you...,"nairobi, kenya",0,7.0
627,,# collorach _ de hatukupewa? sufficient warnin...,,0,
580,642.0,jemund2 it appears so is anxiety a disease,nairobi,0,7.0


In [None]:
# Show the (rows, columns) of the training split after the previous augmentation step.
print(train.shape)


(682, 5)


In [None]:
#For autism, class = 1

# Creating augmented text data to increase our training dataset by 289 entries

def augment_text(df1, samples=289, pr=0.2, label=1):
    """Append `samples` augmented tweets of one class to `df1` and shuffle the result.

    Rows whose 'disorder' equals `label` are drawn at random (with replacement),
    each drawn tweet is passed through the notebook-level augmenter `aug`, and the
    generated texts are appended as new rows that carry only 'tweet' and
    'disorder' (every other column is NaN on those rows).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with at least the 'tweet' and 'disorder' columns.
    samples : int, default 289
        Number of augmented rows to generate.
    pr : float, default 0.2
        Value assigned to `aug.aug_p` before augmenting.
    label : int, default 1
        'disorder' value to sample from; also written on the new rows.

    Returns
    -------
    pandas.DataFrame
        `df1` plus the new rows, index reset, then passed through `shuffle`.

    Raises
    ------
    ValueError
        If `df1` has no rows with 'disorder' equal to `label`.
    """
    aug.aug_p = pr  # note: this mutates the shared notebook-level augmenter

    # Source tweets of the class being augmented.
    df_n = df1[df1['disorder'] == label].reset_index(drop=True)
    if df_n.empty:
        raise ValueError(f"no rows with disorder == {label!r} to augment")

    new_text = []
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        text = df_n.iloc[i]['tweet']
        augmented_text = aug.augment(text)
        # Recent nlpaug releases return a list even for a single input string;
        # unwrap it so the 'tweet' column keeps holding plain strings.
        if isinstance(augmented_text, list):
            augmented_text = augmented_text[0] if augmented_text else text
        new_text.append(augmented_text)

    new = pd.DataFrame({'tweet': new_text, 'disorder': label})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df1 = shuffle(pd.concat([df1, new]).reset_index(drop=True))
    return df1
   
# Run the augmentation defined in this cell on the training split, then report
# the new shape and preview the first rows.
train = augment_text(df1=train)
print(train.shape, '\n\n')
train.head(n=5)

100%|██████████| 289/289 [07:49<00:00,  1.63s/it]

(971, 5) 







Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
945,,meet the good doctor who is going a very great...,,1,
9,69.0,dralfredmutua the incident confirms that was a...,"mumias, kenya",4,16.0
362,511.0,sht being around some people raises my anxiety...,"nairobi, kenya",0,12.0
331,389.0,for anyone who has been struggling to sleep an...,"nairobi, kenya",0,17.0
923,,mason is 10 years old and initially has autism...,,1,


In [None]:
# Show the (rows, columns) of the training split after the previous augmentation step.
print(train.shape)


(971, 5)


In [None]:
#For bipolar disorder, class = 2

# Creating augmented text data to increase our training dataset by 286 entries

def augment_text(df1, samples=286, pr=0.2, label=2):
    """Append `samples` augmented tweets of one class to `df1` and shuffle the result.

    Rows whose 'disorder' equals `label` are drawn at random (with replacement),
    each drawn tweet is passed through the notebook-level augmenter `aug`, and the
    generated texts are appended as new rows that carry only 'tweet' and
    'disorder' (every other column is NaN on those rows).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with at least the 'tweet' and 'disorder' columns.
    samples : int, default 286
        Number of augmented rows to generate.
    pr : float, default 0.2
        Value assigned to `aug.aug_p` before augmenting.
    label : int, default 2
        'disorder' value to sample from; also written on the new rows.

    Returns
    -------
    pandas.DataFrame
        `df1` plus the new rows, index reset, then passed through `shuffle`.

    Raises
    ------
    ValueError
        If `df1` has no rows with 'disorder' equal to `label`.
    """
    aug.aug_p = pr  # note: this mutates the shared notebook-level augmenter

    # Source tweets of the class being augmented.
    df_n = df1[df1['disorder'] == label].reset_index(drop=True)
    if df_n.empty:
        raise ValueError(f"no rows with disorder == {label!r} to augment")

    new_text = []
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        text = df_n.iloc[i]['tweet']
        augmented_text = aug.augment(text)
        # Recent nlpaug releases return a list even for a single input string;
        # unwrap it so the 'tweet' column keeps holding plain strings.
        if isinstance(augmented_text, list):
            augmented_text = augmented_text[0] if augmented_text else text
        new_text.append(augmented_text)

    new = pd.DataFrame({'tweet': new_text, 'disorder': label})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df1 = shuffle(pd.concat([df1, new]).reset_index(drop=True))
    return df1
   
# Run the augmentation defined in this cell on the training split, then report
# the new shape and preview the first rows.
train = augment_text(df1=train)
print(train.shape, '\n\n')
train.head(n=5)

100%|██████████| 286/286 [05:08<00:00,  1.08s/it]

(1257, 5) 







Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
1223,,adenya _ e we have the arctic and general anta...,,2,
799,734.0,leukoaraiosis ventriculomegaly vascular dement...,"nairobi, kenya",3,14.0
341,382.0,my anxiety is so bad these days khalasssssssss...,lau,0,20.0
841,,mason jones is 10 ¹⁄₂ years old and has autism...,,1,
305,264.0,there is a very thin line between over religio...,"nairobi, kenya",4,12.0


In [None]:
#For dementia, class = 3

# Creating augmented text data to increase our training dataset by 290 entries

def augment_text(df1, samples=290, pr=0.2, label=3):
    """Append `samples` augmented tweets of one class to `df1` and shuffle the result.

    Rows whose 'disorder' equals `label` are drawn at random (with replacement),
    each drawn tweet is passed through the notebook-level augmenter `aug`, and the
    generated texts are appended as new rows that carry only 'tweet' and
    'disorder' (every other column is NaN on those rows).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with at least the 'tweet' and 'disorder' columns.
    samples : int, default 290
        Number of augmented rows to generate.
    pr : float, default 0.2
        Value assigned to `aug.aug_p` before augmenting.
    label : int, default 3
        'disorder' value to sample from; also written on the new rows.

    Returns
    -------
    pandas.DataFrame
        `df1` plus the new rows, index reset, then passed through `shuffle`.

    Raises
    ------
    ValueError
        If `df1` has no rows with 'disorder' equal to `label`.
    """
    aug.aug_p = pr  # note: this mutates the shared notebook-level augmenter

    # Source tweets of the class being augmented.
    df_n = df1[df1['disorder'] == label].reset_index(drop=True)
    if df_n.empty:
        raise ValueError(f"no rows with disorder == {label!r} to augment")

    new_text = []
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        text = df_n.iloc[i]['tweet']
        augmented_text = aug.augment(text)
        # Recent nlpaug releases return a list even for a single input string;
        # unwrap it so the 'tweet' column keeps holding plain strings.
        if isinstance(augmented_text, list):
            augmented_text = augmented_text[0] if augmented_text else text
        new_text.append(augmented_text)

    new = pd.DataFrame({'tweet': new_text, 'disorder': label})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df1 = shuffle(pd.concat([df1, new]).reset_index(drop=True))
    return df1
   
# Run the augmentation defined in this cell on the training split, then report
# the new shape and preview the first rows.
train = augment_text(df1=train)
print(train.shape, '\n\n')
train.head(n=5)

100%|██████████| 290/290 [05:31<00:00,  1.14s/it]

(1547, 5) 







Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
1189,42.0,arasa said mogire aged 38 wrote that the duo h...,"nairobi, kenya",4,13.0
1282,,problems that of adolescents being obese large...,,3,
1398,,if adrenaline is me wonderful it covers pain. ...,,3,
1292,,scientists are testing toward an artificialint...,,3,
1323,,why do i truly have the sex drive of an obviou...,,3,


In [None]:
#For depression, class = 4

# Creating augmented text data to increase our training dataset by 18 entries

def augment_text(df1, samples=18, pr=0.2, label=4):
    """Append `samples` augmented tweets of one class to `df1` and shuffle the result.

    Rows whose 'disorder' equals `label` are drawn at random (with replacement),
    each drawn tweet is passed through the notebook-level augmenter `aug`, and the
    generated texts are appended as new rows that carry only 'tweet' and
    'disorder' (every other column is NaN on those rows).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with at least the 'tweet' and 'disorder' columns.
    samples : int, default 18
        Number of augmented rows to generate.
    pr : float, default 0.2
        Value assigned to `aug.aug_p` before augmenting.
    label : int, default 4
        'disorder' value to sample from; also written on the new rows.

    Returns
    -------
    pandas.DataFrame
        `df1` plus the new rows, index reset, then passed through `shuffle`.

    Raises
    ------
    ValueError
        If `df1` has no rows with 'disorder' equal to `label`.
    """
    aug.aug_p = pr  # note: this mutates the shared notebook-level augmenter

    # Source tweets must come from the class being augmented. Filtering on a
    # different class here would attach this label to another class's tweets.
    df_n = df1[df1['disorder'] == label].reset_index(drop=True)
    if df_n.empty:
        raise ValueError(f"no rows with disorder == {label!r} to augment")

    new_text = []
    # Draw positions from 0 so that every source row can be picked.
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        text = df_n.iloc[i]['tweet']
        augmented_text = aug.augment(text)
        # Recent nlpaug releases return a list even for a single input string;
        # unwrap it so the 'tweet' column keeps holding plain strings.
        if isinstance(augmented_text, list):
            augmented_text = augmented_text[0] if augmented_text else text
        new_text.append(augmented_text)

    new = pd.DataFrame({'tweet': new_text, 'disorder': label})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df1 = shuffle(pd.concat([df1, new]).reset_index(drop=True))
    return df1
   
# Run the augmentation defined in this cell on the training split, then report
# the new shape and preview the first rows.
train = augment_text(df1=train)
print(train.shape, '\n\n')
train.head(n=5)

100%|██████████| 18/18 [00:22<00:00,  1.25s/it]

(1565, 5) 







Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
1381,,weather is occasionally very bipolar,,2,
1402,,lifestyle problems of healthy being obese new ...,,3,
65,,saniegojaccque pill i take that with a pinch o...,,0,
546,395.0,imposter syndrome giving you anxiety again \n\...,"nairobi, kenya",0,15.0
578,,the good doctor is a very great series setting...,,1,


In [None]:
#For paranoia, class = 5

# Creating augmented text data to increase our training dataset by 293 entries

def augment_text(df1, samples=293, pr=0.2, label=5):
    """Append `samples` augmented tweets of one class to `df1` and shuffle the result.

    Rows whose 'disorder' equals `label` are drawn at random (with replacement),
    each drawn tweet is passed through the notebook-level augmenter `aug`, and the
    generated texts are appended as new rows that carry only 'tweet' and
    'disorder' (every other column is NaN on those rows).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with at least the 'tweet' and 'disorder' columns.
    samples : int, default 293
        Number of augmented rows to generate.
    pr : float, default 0.2
        Value assigned to `aug.aug_p` before augmenting.
    label : int, default 5
        'disorder' value to sample from; also written on the new rows.

    Returns
    -------
    pandas.DataFrame
        `df1` plus the new rows, index reset, then passed through `shuffle`.

    Raises
    ------
    ValueError
        If `df1` has no rows with 'disorder' equal to `label`.
    """
    aug.aug_p = pr  # note: this mutates the shared notebook-level augmenter

    # Source tweets of the class being augmented.
    df_n = df1[df1['disorder'] == label].reset_index(drop=True)
    if df_n.empty:
        raise ValueError(f"no rows with disorder == {label!r} to augment")

    new_text = []
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        text = df_n.iloc[i]['tweet']
        augmented_text = aug.augment(text)
        # Recent nlpaug releases return a list even for a single input string;
        # unwrap it so the 'tweet' column keeps holding plain strings.
        if isinstance(augmented_text, list):
            augmented_text = augmented_text[0] if augmented_text else text
        new_text.append(augmented_text)

    new = pd.DataFrame({'tweet': new_text, 'disorder': label})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df1 = shuffle(pd.concat([df1, new]).reset_index(drop=True))
    return df1
   
# Run the augmentation defined in this cell on the training split, then report
# the new shape and preview the first rows.
train = augment_text(df1=train)
print(train.shape, '\n\n')
train.head(n=5)

100%|██████████| 293/293 [06:42<00:00,  1.37s/it]

(1858, 5) 







Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
930,311.0,edward doheny 1st successful oil well in la hu...,"nairobi, kenya",4,7.0
579,660.0,antohlibra schizophrenia has a genetic compone...,"nairobi, kenya",6,14.0
243,283.0,homeboyzradio gmoneyizme gmitm\nglued in from ...,"nairobi,kenya",4,3.0
1299,,absent leukoaraiosis ventriculomegaly primary ...,,3,
881,,adultingadhd to theadhdacademic i just feel mo...,,2,


In [None]:
#For schizophrenia, class = 6

# Creating augmented text data to increase our training dataset by 289 entries

def augment_text(df1, samples=289, pr=0.2, label=6):
    """Append `samples` augmented tweets of one class to `df1` and shuffle the result.

    Rows whose 'disorder' equals `label` are drawn at random (with replacement),
    each drawn tweet is passed through the notebook-level augmenter `aug`, and the
    generated texts are appended as new rows that carry only 'tweet' and
    'disorder' (every other column is NaN on those rows).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with at least the 'tweet' and 'disorder' columns.
    samples : int, default 289
        Number of augmented rows to generate.
    pr : float, default 0.2
        Value assigned to `aug.aug_p` before augmenting.
    label : int, default 6
        'disorder' value to sample from; also written on the new rows.

    Returns
    -------
    pandas.DataFrame
        `df1` plus the new rows, index reset, then passed through `shuffle`.

    Raises
    ------
    ValueError
        If `df1` has no rows with 'disorder' equal to `label`.
    """
    aug.aug_p = pr  # note: this mutates the shared notebook-level augmenter

    # Source tweets of the class being augmented.
    df_n = df1[df1['disorder'] == label].reset_index(drop=True)
    if df_n.empty:
        raise ValueError(f"no rows with disorder == {label!r} to augment")

    new_text = []
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        text = df_n.iloc[i]['tweet']
        augmented_text = aug.augment(text)
        # Recent nlpaug releases return a list even for a single input string;
        # unwrap it so the 'tweet' column keeps holding plain strings.
        if isinstance(augmented_text, list):
            augmented_text = augmented_text[0] if augmented_text else text
        new_text.append(augmented_text)

    new = pd.DataFrame({'tweet': new_text, 'disorder': label})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df1 = shuffle(pd.concat([df1, new]).reset_index(drop=True))
    return df1
   
# Run the augmentation defined in this cell on the training split, then report
# the new shape and preview the first rows.
train = augment_text(df1=train)
print(train.shape, '\n\n')
train.head(n=5)

100%|██████████| 289/289 [06:52<00:00,  1.43s/it]

(2400, 5) 







Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
1254,,not suicidal or anything but sometimes i look ...,,7,
1475,540.0,need more clothes \nscreams in anxiety,"nairobi, kenya",0,17.0
2113,,sam _ [UNK] lulli ( fidel _ ) ke ; huyu ni emb...,,6,
1710,284.0,learn how to cope depression and traumatic con...,"nairobi, kenya",4,21.0
1319,,"at, our current research labs weve looked at t...",,1,


In [None]:
#For suicidal ideation, class = 7

# Creating augmented text data to increase our training dataset by 253 entries

def augment_text(df1, samples=253, pr=0.2, label=7):
    """Append `samples` augmented tweets of one class to `df1` and shuffle the result.

    Rows whose 'disorder' equals `label` are drawn at random (with replacement),
    each drawn tweet is passed through the notebook-level augmenter `aug`, and the
    generated texts are appended as new rows that carry only 'tweet' and
    'disorder' (every other column is NaN on those rows).

    Parameters
    ----------
    df1 : pandas.DataFrame
        Frame with at least the 'tweet' and 'disorder' columns.
    samples : int, default 253
        Number of augmented rows to generate.
    pr : float, default 0.2
        Value assigned to `aug.aug_p` before augmenting.
    label : int, default 7
        'disorder' value to sample from; also written on the new rows.

    Returns
    -------
    pandas.DataFrame
        `df1` plus the new rows, index reset, then passed through `shuffle`.

    Raises
    ------
    ValueError
        If `df1` has no rows with 'disorder' equal to `label`.
    """
    aug.aug_p = pr  # note: this mutates the shared notebook-level augmenter

    # Source tweets of the class being augmented.
    df_n = df1[df1['disorder'] == label].reset_index(drop=True)
    if df_n.empty:
        raise ValueError(f"no rows with disorder == {label!r} to augment")

    new_text = []
    for i in tqdm(np.random.randint(0, len(df_n), samples)):
        text = df_n.iloc[i]['tweet']
        augmented_text = aug.augment(text)
        # Recent nlpaug releases return a list even for a single input string;
        # unwrap it so the 'tweet' column keeps holding plain strings.
        if isinstance(augmented_text, list):
            augmented_text = augmented_text[0] if augmented_text else text
        new_text.append(augmented_text)

    new = pd.DataFrame({'tweet': new_text, 'disorder': label})
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df1 = shuffle(pd.concat([df1, new]).reset_index(drop=True))
    return df1
   
# Run the augmentation defined in this cell on the training split, then report
# the new shape and preview the first rows.
train = augment_text(df1=train)
print(train.shape, '\n\n')
train.head(n=5)

100%|██████████| 253/253 [05:01<00:00,  1.19s/it]

(2111, 5) 







Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
169,,bless those who see life through entering a fo...,,1,
1609,449.0,stressing and anxiety can cause the overproduc...,"kasarani, kenya",0,19.0
1287,,aqusub friends i understand were collectively ...,,0,
1421,,promissory organ coochie hukuwa some form : of...,,0,
1958,,dont become emotionally suicidal seek some pea...,,7,


In [None]:
# Count the rows per disorder label in the augmented train dataset to check
# that the classes are balanced.
train['disorder'].value_counts()

7    300
5    300
3    300
1    300
6    300
4    300
2    300
0    300
Name: disorder, dtype: int64

In [None]:
# Previewing the augmented train dataset (all columns still present at this point)
train

Unnamed: 0.1,Unnamed: 0,tweet,location,disorder,hour
1254,,not suicidal or anything but sometimes i look ...,,7,
1475,540.0,need more clothes \nscreams in anxiety,"nairobi, kenya",0,17.0
2113,,sam _ [UNK] lulli ( fidel _ ) ke ; huyu ni emb...,,6,
1710,284.0,learn how to cope depression and traumatic con...,"nairobi, kenya",4,21.0
1319,,"at, our current research labs weve looked at t...",,1,
...,...,...,...,...,...
1095,420.0,hypervigilance is a problem for me ill keep th...,"nairobi, kenya",0,18.0
875,,being loyal to a country called kenya is now s...,,7,
1346,,my growing ego anxiety or war is might get so ...,,4,
1897,,adenya _ e / we have the arctic and antarctic ...,,2,


In [None]:
# Remove the irrelevant columns so that only the tweet text and its label remain
train = train.drop(['Unnamed: 0', 'location', 'hour'], axis=1)

In [None]:
# Previewing the train dataset after the irrelevant columns were dropped
train

Unnamed: 0,tweet,disorder
1254,not suicidal or anything but sometimes i look ...,7
1475,need more clothes \nscreams in anxiety,0
2113,sam _ [UNK] lulli ( fidel _ ) ke ; huyu ni emb...,6
1710,learn how to cope depression and traumatic con...,4
1319,"at, our current research labs weve looked at t...",1
...,...,...
1095,hypervigilance is a problem for me ill keep th...,0
875,being loyal to a country called kenya is now s...,7
1346,my growing ego anxiety or war is might get so ...,4
1897,adenya _ e / we have the arctic and antarctic ...,2


In [None]:
# Write the augmented training set to CSV; index=False keeps the row index out of the file.
# NOTE(review): the file name is spelled "Augmentaed" — left unchanged in case other code reads this exact path.
train.to_csv(path_or_buf="Augmentaed.csv", index=False)