In [None]:
! pip install transformers torchtext 

In [None]:
! pip install nlpaug nltk 

In [None]:
# Colab-only cell: prompts for a file upload through google.colab.
# A later cell reads 'train.csv' from the working directory, so that is the
# file expected here.
from google.colab import files 
files.upload()

In [None]:
import re
import string 
import random
import warnings  
import numpy as np
import pandas as pd 
import seaborn as sns 
from tqdm.notebook import tqdm 
import matplotlib.pyplot as plt

In [None]:
import nltk
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

In [None]:
# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Fetch the NLTK resources needed by the lemmatizer and the stopword filter.
for nltk_package in ('omw-1.4', 'wordnet', 'stopwords'):
    nltk.download(nltk_package)

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the training set from the working directory. Later cells rely on its
# `text`, `reason` and `label` columns.
data = pd.read_csv('train.csv')

In [None]:
# Preview the first rows of the raw (not yet cleaned) data.
data.head()

Unnamed: 0,text,reason,label
0,this is an amazing app for online classes!but,good app for conducting online classes,1
1,very practical and easy to use,app is user-friendly,1
2,this app is very good for video conferencing.,good for video conferencing,1
3,i can not download this zoom app,unable to download zoom app,1
4,i am not able to download this app,want to download the app,1


In [None]:
# some utility functions to clean the text 

# Single WordNetLemmatizer instance, created once and used by process_text.
lemmatizer = WordNetLemmatizer()

def decontract(text):
    """Expand common English contractions in ``text`` and return the result.

    The rewrites run one after another in the order listed, so the two
    irregular forms ("won't", "can't") are handled before the generic "n't"
    rule, and the bare "'t" rule only sees what "n't" left behind.
    Matching is case-sensitive.
    """
    rewrites = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = text
    for pattern, replacement in rewrites:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

def process_text(text):
    """Clean one raw string and return its remaining tokens joined by spaces.

    Steps, in order:
      1. Remove ``$``-prefixed tokens (e.g. ``$GE``), a leading ``RT`` marker,
         everything from an http(s) URL to the end of its line, and ``#``.
      2. Remove every whitespace-delimited token that contains a digit.
      3. Expand contractions with ``decontract``.
      4. Tokenize with ``TweetTokenizer`` (configured with
         ``preserve_case=False``, ``strip_handles=True``, ``reduce_len=True``).
      5. Drop English stopwords and punctuation, then lemmatize each
         remaining token as a verb with the module-level ``lemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        A single string of cleaned, lemmatized tokens separated by spaces;
        empty if nothing survives the filters.
    """
    # The corpus reader returns a list; a frozenset gives O(1) membership tests.
    stopwords_english = frozenset(stopwords.words('english'))
    # Kept as one string on purpose: `in` below is a substring test, so it
    # rejects single punctuation marks as well as the '...' token.
    punctuation = string.punctuation + '...'

    # `$` must be escaped: unescaped it is the end-of-string anchor, and the
    # substitution would never remove anything.
    text = re.sub(r'\$\w*', '', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'#', '', text)
    # Raw string avoids the invalid-escape warning for \S and \d.
    text = re.sub(r"\S*\d\S*", "", text).strip()
    text = decontract(text)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(text)

    texts_clean = [
        lemmatizer.lemmatize(word, "v")
        for word in tokens
        if word not in stopwords_english and word not in punctuation
    ]
    return " ".join(texts_clean)

In [None]:
# Clean both free-text columns in place with process_text.
data['text'] = data['text'].apply(process_text)
data['reason'] = data['reason'].apply(process_text)

#### **Here starts the augmentation strategy**

### **Sample 1**

In [None]:
# Draw 1000 rows for the first augmentation strategy. The fixed random_state
# makes the split identical on every Restart & Run All; without it each run
# would augment a different subset.
sample1 = data.sample(1000, random_state=42)
# Index labels of the drawn rows, used to build the complementary sample2.
sample1_index = sample1.index

In [None]:
# Number of rows that remain once the sampled rows are taken out.
left = len(data) - len(sample1_index)
# sample2 is every row of `data` whose index label was not drawn into sample1.
drawn_into_sample1 = data.index.isin(sample1_index)
sample2 = data[~drawn_into_sample1]

In [None]:
# Renumber both samples from 0, discarding the index labels inherited from `data`.
sample1 = sample1.reset_index(drop=True)
sample2 = sample2.reset_index(drop=True)

In [None]:
# Display sample1 after cleaning and index reset (before augmentation).
sample1

Unnamed: 0,text,reason,label
0,improve mobile tablet app lot,mobile tablet app need improvement,1
1,watch ipl match app keep crash every,app crash watch ipl match,1
2,take long time log manage log,app take long login,1
3,make dark mode ios,need dark mode ios,1
4,sync another feature work,sync work,1
...,...,...,...
995,even let create account,unable create account,1
996,try create account,want create account,1
997,pick profile,want select profile,1
998,stream functionality awful,worst app stream,1


In [None]:
# Display sample2: the cleaned rows that were not drawn into sample1.
sample2

Unnamed: 0,text,reason,label
0,amaze app online class,good app conduct online class,1
1,practical easy use,app user-friendly,1
2,download zoom app,unable download zoom app,1
3,zoom excellent meet app,good app conduct online meet,1
4,video quality poor,video quality poor,1
...,...,...,...
1056,bad even update worst,frequent update annoy,1
1057,pretty good video player suck,video player work,1
1058,good quality picture graphics,good quality picture,1
1059,learn netflix interface,netflix ui better,1


In [None]:
import nlpaug
import nlpaug.augmenter.word as naw

In [None]:
# Word-level augmenter backed by bert-base-uncased, configured to substitute
# words. Used by augment_text below.
# NOTE(review): aug_max=3 presumably caps the number of substituted words per
# text — confirm against the installed nlpaug version.
aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', 
    action="substitute",aug_max=3)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def augment_text(text):
    """Return one augmented variant of ``text`` produced by the global ``aug``.

    Args:
        text: The string to augment.

    Returns:
        The augmented string. ``text`` is returned unchanged when it is not a
        string, when it is empty or whitespace-only, and when the augmenter
        yields no result.
    """
    # process_text can return "" (e.g. a text made only of stopwords). There is
    # nothing to substitute there, and indexing the augmenter's empty result
    # would raise IndexError, so skip the model call.
    if not isinstance(text, str) or not text.strip():
        return text

    augmented = aug.augment(text, n=1)
    # Depending on the nlpaug release, n=1 yields either a bare string or a
    # one-element list; indexing a string with [0] would return one character.
    if isinstance(augmented, str):
        return augmented
    return augmented[0] if augmented else text

In [None]:
# Using sample 1 

# Replace each text in sample1 with its augmented variant (progress bar via tqdm).
sample1['text'] = sample1['text'].progress_apply(augment_text)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Augment the reason column the same way, then relabel every row of sample1 as 0.
sample1['reason'] = sample1['reason'].progress_apply(augment_text)
sample1['label'] = 0

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# Save the augmented, relabelled sample1 to 'sample1.csv' in the working directory.
# No `index` argument is passed, so pandas' default index handling applies.

sample1.to_csv('sample1.csv')

### **Sample 2**

In [None]:
# Using sample 2 

# Build a Hugging Face text-generation pipeline backed by the 'gpt2' model.
# The captured output below shows the model files being downloaded when this runs.
from transformers import pipeline  

generator = pipeline(
    'text-generation', model='gpt2'
)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
'''
- Github Actions
- Jenkins
- Prometheus
- Terraform
- AWS/GCP
- K8S
- Kubeflow
- Airflow
- Kafka
'''

'\n- Github Actions\n'

In [None]:
# Work on a copy so the transformation below does not modify sample1.
# NOTE(review): this cell sits under the "Sample 2" heading but copies sample1,
# which was already augmented and relabelled 0 above — confirm sample2 was not
# the intended source.
ps = sample1.copy()

In [None]:
# Row-wise transform of the (text, reason) pair through `fun`.
# NOTE(review): `fun` is not defined in any cell shown above this one, so on a
# fresh kernel this line raises NameError unless it is defined elsewhere — confirm.
ps[['text', 'reason']] = ps[['text', 'reason']].apply(fun, axis=1)