# EDA


## Loading big dataset

### Imports

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import string
import tensorflow as tf
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

  from .autonotebook import tqdm as notebook_tqdm


### Loading data

In [3]:
data=pd.read_csv('../raw_data/training_data.csv',  header=None, encoding='latin-1')
data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
#set names to columns
data.columns=['label','id', 'date','query','username','tweet']

In [5]:
#set names to columns
data.columns=['label','id', 'date','query','username','tweet']
# drop columns
data=data.drop(columns=['id', 'date','query','username'])
#check the balance of the classes
data.label.value_counts()

0    800000
4    800000
Name: label, dtype: int64

In [6]:
#check the balance of the classes
data.label.value_counts()

0    800000
4    800000
Name: label, dtype: int64

## Little split


In [7]:
small=data.sample(n=500)
small.label=small.label*0.25
small.label=small.label.astype(int)

# Preprocessing

## Basic cleaning

In [8]:
#In this function we lower case everything, remove numbers puntuation and stopwords and strip the text
def basic_cleaning(sentence, stop_words):
    sentence = sentence.lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') 
    sentence = sentence.strip()
    word_tokens = word_tokenize(sentence)
    sentence = [w for w in word_tokens if not w in stop_words]
    sentence= [WordNetLemmatizer().lemmatize(word, pos = "v")  # v --> verbs
              for word in sentence]
    sentence=[WordNetLemmatizer().lemmatize(word, pos = "n")  # v --> verbs
              for word in sentence]
    return ' '.join(word for word in sentence)

def chunk_cleaning(chunk):
    stop_words = stopwords.words('english')
    stop_words.append('u')
    stop_words.append('r')
    stop_words=set(stop_words)
    small_cleaned=[basic_cleaning(tweet, stop_words) for tweet in chunk]
    return small_cleaned
     


In [9]:
train=pd.DataFrame(data=np.array([chunk_cleaning(small.tweet[:300]),small.label[:300]]).T,columns=['tweet','label'])
val=pd.DataFrame(data=np.array([chunk_cleaning(small.tweet[300:400]),small.label[300:400]]).T,columns=['tweet','label'])

In [10]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

In [11]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

2022-08-30 15:52:01.374010: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Model

In [12]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
    train_InputExamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

    validation_InputExamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
    return train_InputExamples, validation_InputExamples

train_InputExamples, validation_InputExamples = convert_data_to_examples(train, 
                                                                               val, 
                                                                               'tweet', 
                                                                               'label')
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'tweet'
LABEL_COLUMN = 'label'

In [13]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, val, DATA_COLUMN, LABEL_COLUMN)

train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
train_data = train_data.shuffle(100).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)



In [14]:
val

Unnamed: 0,tweet,label
0,watch ep jonasgtgtgt httpbitlypeqqg,1
1,aileenu permanent bad hair whatever weather,1
2,dannymcfly cant wait see chile,1
3,another series screw dream rain wonder dislike...,0
4,willieday omg love ya voice sooooo much chitow...,1
...,...,...
95,really look forward school today barely day le...,0
96,deepakkasi thank succes meet je website,1
97,httpbitlyyfr lt stand front row,1
98,kenseto saw video kimchi cute want chinchilla,0


In [15]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2


KeyboardInterrupt: 

In [None]:
pred_sentences=chunk_cleaning(small.tweet[400:500].values.tolist())
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
    print(pred_sentences[i], ": \n", labels[label[i]], np.array(small.label[400:500])[i])


NameError: name 'chunk_cleaning' is not defined

In [None]:
first_model=model