In [None]:
# link this notebook to my google drive so I can retrieve data files from there
# Import the dedicated library
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data loading & handling
import numpy as np
import pandas as pd
import json
import os



# NLP-dedicated libraries
import string
import re
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize, sent_tokenize
from nltk.tree import Tree
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources BEFORE anything reads them: on a fresh runtime
# `stopwords.words('english')` raises LookupError if the corpus has not been
# downloaded yet.
for nltk_resource in ('stopwords', 'punkt', 'averaged_perceptron_tagger',
                      'maxent_ne_chunker', 'words'):
    nltk.download(nltk_resource)

# For stopwords, we will only drop those of small length that could mess with
# abbreviations (e.g. "who" being confused with the World Health Organization)
stop_words = [word for word in stopwords.words('english') if len(word) <= 3]
%pip install emot
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
import spacy
# efficient library for spell checking
!pip install autocorrect
from autocorrect import Speller




# NER-Specific
from spacy import displacy
from spacy.tokens import DocBin
nlp = spacy.load('en_core_web_lg')



# Miscellaneous utilities
from sklearn.preprocessing import OneHotEncoder
import pickle
import warnings
import unicodedata
from scipy import sparse as sp_sparse
from itertools import combinations
from tqdm import tqdm # for progress bars
from tqdm.notebook import tqdm  # -//-
import random
from collections import Counter

# For the implementation of parallel computing
from joblib import Parallel, delayed

# Supress FutureWarning messages
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
!pip install google-cloud-bigquery


In [None]:
!pip install db-dtypes


In [None]:
!python -m spacy download en_core_web_lg

# Importing the raw data that has been loaded to BigQuery

In [None]:
from google.colab import auth
from google.cloud import bigquery
from google.oauth2 import service_account

# Interactive Colab authentication for the current user.
auth.authenticate_user()

# BigQuery client, built from the service-account key file stored on the mounted Drive.
# NOTE(review): `project_id` is assigned here but the client below is created with
# `credentials.project_id` - presumably both name the same project; confirm.
project_id = 'nlp-project-427710'
credentials = service_account.Credentials.from_service_account_file('/content/drive/MyDrive/NLP_project/preprocessing/nlp-project-427710-3e1a48df3dba.json')
client = bigquery.Client(credentials=credentials, project=credentials.project_id)

# SQL query to fetch the whole raw table from BigQuery
query = """
    SELECT *
    FROM `nlp-project-427710.person_emotion.c_emotions1`
"""

# Run the query through the BigQuery client and materialise the result as a pandas DataFrame
df = client.query(query).to_dataframe()

In [None]:
df.shape

(416809, 2)

In [None]:
df.head()

Unnamed: 0,text,label
0,ive enjoyed being able to slouch about relax a...,0
1,i dont know i feel so lost,0
2,i was beginning to feel quite disheartened,0
3,i can still lose the weight without feeling de...,0
4,im feeling a little like a damaged tree and th...,0


### Checking for missing values
Drop any row containing missing values in either column, as that would render it useless in making predictions or training a model to make predictions. We have enough data samples so as not to worry about a slight drop in their volume.

### Checking for duplicates
Drop all duplicates, keep only the first occurrences. Multiple entries with the exact same characteristics could contribute to overfitting, and if there are many of them they could also slow down training.

In [None]:
### Checking for missing values
# One pass over the frame: per-column count of NaN/None entries.
missing_per_column = df.isna().sum()
for col, n_missing in missing_per_column.items():
  print(f'Column "{col}" contains {n_missing} missing values')

Column "text" contains 0 missing values
Column "label" contains 0 missing values


In [None]:
# No missing values to drop

In [None]:
### Checking for duplicate values
# True only when de-duplicating would not remove a single row.
len(df.drop_duplicates()) == len(df)

False

In [None]:
# There are duplicate values - report how many rows de-duplication would remove
n_rows_total = df.shape[0]
n_rows_unique = df.drop_duplicates().shape[0]
print(f'There are {n_rows_total - n_rows_unique} duplicate values.')

There are 686 duplicate values.


In [None]:
# Drop exact duplicate rows; keep='first' (the pandas default) retains the first occurrence of each.
df = df.drop_duplicates(keep='first')
print('Dropped duplicates.')
# Sanity check - de-duplicating again must not change the number of rows.
df.drop_duplicates().shape[0] == df.shape[0]

Dropped duplicates.


True

# Text Pre-Processing
### Pre-Processing taken, together with simultaneous feature extraction:
- **Normalization:**
    - Convert all text into the **lower case** (ignore short words of less than 3 chars, in order to prevent counting-in abbreviations or other initials).
    - **Expanding contractions** (we also account for their existence - we suppose that tweets in informal language -which typically features the use of contractions- is more likely to include emotional use of language). **Before we do this, we proceed by replacing '\`' characters with "'", since we noticed in our raw data that many contractions used '\`' instead of the single apostrophe.**
    - We decided not to convert **numbers** into words, but rather to drop all of them, as they are typically used to express pandemic-related metrics and do not contribute to the semantic meaning of the text. NOTE: for NER, we have kept those, so in that part of the project we use the initial 'text' column of the table and apply only soft preprocessing. Here, we have also decided to keep some symbols that may pertain to numerical values, like currency signs (of the most popular currencies: euro, dollar, Swiss franc, ruble, yuan, British pound, Indian rupee), since they may typically be part of tweets expressing -negative- sentiment.
    - We proceed to the correction of **spelling errors**. This made text pre-processing very computationally expensive and time-consuming, but we took steps to maximize the efficiency of our code and tackled that issue.
    - Replacing **emojis & emoticons** - encoding their word representation inside of vector representations of words was deemed as enough, no need for extra features to be extracted.
    - Removing **usernames & hashtags** - however, we account for their existence in a tweet (using a boolean var), making the assumption that tweets directly referring to specific people are more likely to express sentiment. We keep them both in the data used for NER (applied only on the covid-related dataset), as user mentions are typically used to directly refer to people and organizations, while hashtags were extensively used for generic pandemic references.
    - Removing **URLs & HTML elements** as they do not bear any contextual meaning. We perform this before removing punctuation, else we would not match them due to the lack of '://' or '.' chars.
    - Removing words with **length less than or equal to 2 characters** - we suppose that those mainly make for slang contractions of the type of 'ur' bearing no contextual meaning and increasing the randomness in our data.
    - Removing **punctuation and special characters** - we will keep the basic punctuation marks that are more likely to hint at the sentiment behind tweets, and drop other such characters which mainly play a syntactic role, like ":", ";", parentheses. We will drop '@' and '#' after we've dealt with usernames and hashtags, but we will keep symbols typically referred to in some emotional tweets (at least from our experience inspecting the datasets used in this project), like '$'. NLTK's word_tokenize() is well-suited for our use case, since it treats such characters as separate tokens, and information on their prevalence and combination with other tokens will be integrated in the vector representations of words.
    - Removing **stopwords** - we opt for NLTK's 'stopwords' container for the english language, and deal only with short stopwords (up to 3 characters in length).
    - Removing **repetitive characters** (replace them with a double utterance to still account for their existence but in a consistent way across the dataset). We pay special attention to the retaining of '...' cases, by replacing them with '_ELIPSIS_' utterances and also accounting for their frequency in each tweet.
    - Remove any **escaped characters**.
    - Replace **repetitive whitespace** chars with a single space character.
- **Tokenization & Lemmatization** based on the POS tagging of tokens.
- **Feature extraction:** as mentioned in the comments on pre-processing steps.

In [None]:
# very first step - some tweets may contain absurd characters
def remove_non_ascii(text):
    """Return `text` with every character that cannot be encoded as ASCII dropped."""
    ascii_bytes = text.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

# function to drop numbers
num_pat = re.compile(r'\d+')
def drop_numbers(text):
    """Delete every run of digits from `text` (surrounding characters are left untouched)."""
    return num_pat.sub('', text)

# function to convert text to lower case AND count no of fully uppercase words
def convert_to_lower_case(text):
    """Lower-case `text` and collect three simple surface statistics.

    Returns a 4-tuple:
      - the lower-cased text,
      - the number of whitespace-separated tokens for which `str.isupper()` is True,
      - the length of the text in characters,
      - the number of '.' characters (used as a rough sentence count).
    """
    upper_words = 0
    for token in text.split():
        if token.isupper():
            upper_words += 1
    return text.lower(), upper_words, len(text), text.count('.')

In [None]:
# function to expand contractions
# All keys are lower-case: this step is meant to run AFTER the text has been lower-cased.
contractions_dict = {
    "ain't": "are not",  # was "are_not"; every other expansion is space-separated
    "'s": " is",
    "aren't": "are not",
    "isn't": "is not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "wasn't": "was not",
    "weren't": "were not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "i've": "I have",
    "i'd": "I would",  # since casting to lower-case typically precedes the step of contraction fixing
    "i'd've": "I would have",
    "i'll": "I will",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

# Longest keys first: regex alternation takes the first alternative that matches, so
# "he'd" listed before "he'd've" would turn "he'd've" into "he would've".
_contraction_alternatives = '|'.join(
    re.escape(key) for key in sorted(contractions_dict, key=len, reverse=True)
)
# Leading anchor: a word boundary OR "not preceded by a word character". The second
# branch is what lets keys that start with an apostrophe (e.g. "'cause") match at the
# start of a word, where a plain \b cannot hold (both neighbours are non-word chars).
contractions_pattern = re.compile(r'(?:\b|(?<!\w))(' + _contraction_alternatives + r')\b')

def replace_contractions(text):
    """Expand every contraction listed in `contractions_dict` found in `text`.

    `text` is expected to be lower-cased already; matching is case-sensitive.
    Returns the text with each matched contraction replaced by its expansion.
    """
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_pattern.sub(replace, text)

In [None]:
# functions to replace emoticons and emojis
# NOTE(review): the last range below (U+24C2 - U+1F251) is very wide and also covers
# many non-emoji characters, so a match is not guaranteed to be a key of UNICODE_EMOJI.
emojis_pattern = re.compile("["
		u"\U0001F600-\U0001F64F"  # emoticons
		u"\U0001F300-\U0001F5FF"  # symbols & pictographs
		u"\U0001F680-\U0001F6FF"  # transport & map symbols
		u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
		u"\U00002702-\U000027B0"
		u"\U000024C2-\U0001F251"
	"]",
	flags=re.UNICODE)

# Longest emoticons first, so that a short emoticon never pre-empts a longer one it is a prefix of.
emoticons_pattern = re.compile(u'(' + u'|'.join(re.escape(k) for k in sorted(EMOTICONS_EMO.keys(), key=len, reverse=True)) + u')',
	flags=re.UNICODE)

def replace_emoticons(match):
    """`re.sub` callback: return the underscore-joined description of the matched emoticon."""
    emoticon = match.group(0)  # retrieve the matched emoticon
    # the replace & split are needed because some emoticons correspond to multiple words
    return "_".join(EMOTICONS_EMO[emoticon].replace(",", " ").split())

# function to replace emojis
def replace_emojis(match):
    """`re.sub` callback: return the underscore-joined description of the matched emoji.

    Characters matched by `emojis_pattern` that have no entry in UNICODE_EMOJI are
    returned unchanged instead of raising KeyError.
    """
    emoji = match.group(0)  # retrieve the matched character
    description = UNICODE_EMOJI.get(emoji)
    if description is None:
        return emoji
    return "_".join(description.replace(",", " ").replace(":", " ").split())

In [None]:
# Fixing spelling errors in raw text

# There is a MUCH more (computationally) efficient library to use than PySpellChecker -> 'autocorrect'
spell = Speller(lang="en")  # we deal exclusively with tweets in English - even if there are
# tweets in other languages (in some cases denoted by a 'lang' column, like in the 'uns_covid1' dataset,
# we only consider those written in English)

def fix_spelling(text):
    """Spell-correct every word of `text` and return the words joined by single spaces.

    Words are extracted with the regex `\\w+`, each is passed through `spell` and
    lower-cased.

    NOTE(review): because only `\\w+` runs are kept, every punctuation mark and symbol
    (including '!', '?', '$' and '...') is dropped by this function - confirm this is
    intended, given that later steps are written to retain some of them.
    """
    # (An earlier, whitespace-splitting definition of this function was shadowed by this
    # one and therefore never ran; it has been removed.)
    words = re.findall(r'\w+', text)
    corrected_words = [spell(word).lower() for word in words]
    return ' '.join(corrected_words)

In [None]:
username_pat = re.compile(r'@\w+')
def remove_usernames(text):
    """Strip '@mention' tokens from `text`.

    Returns `(cleaned_text, has_username)`, where `has_username` is True only if at
    least one mention was actually removed (a bare '@' no longer sets the flag).
    """
    cleaned, n_removed = username_pat.subn('', text)
    return cleaned, n_removed > 0

hashtag_pat = re.compile(r'#\w+')
def remove_hashtags(text):
    """Strip '#hashtag' tokens from `text`.

    Returns `(cleaned_text, has_hashtag)`, where `has_hashtag` is True only if at
    least one hashtag was actually removed (a bare '#' no longer sets the flag).
    """
    cleaned, n_removed = hashtag_pat.subn('', text)
    return cleaned, n_removed > 0

url_pat = re.compile(r'https?://\S+|www\.\S+')
def remove_urls(text):
    """Delete http(s):// and www. links (up to the next whitespace) from `text`."""
    return url_pat.sub('', text)

html_pat = re.compile(r'<.*?>')
def remove_html(text):
    """Delete every '<...>' span (shortest match) from `text`."""
    return html_pat.sub('', text)

def remove_short_words(text, min_length=3):
    """Keep only the whitespace-separated tokens of `text` that have at least
    `min_length` characters, and return them joined by single spaces."""
    kept_tokens = [token for token in text.split() if not len(token) < min_length]
    return ' '.join(kept_tokens)

# Punctuation we retain. '?', '!' and '...' are - based on the explanations in
# https://www.oxbridgeediting.co.uk/blog/what-are-the-14-punctuation-marks/ - the punctuation marks most
# relevant to expressing emotions and sentiment. We retain '"' because it might be used in directly quoting
# someone's words, which is possible to be followed by a comment and potential criticism or approval, and '*'
# because it is typically used in social media posts to obscure foul language.
# The ellipsis cannot be kept as-is (single dots are removed), so it is replaced by the capitalised
# marker 'ELIPSIS' instead - after lower-casing it is the only capitalised text remaining.

# currency symbols to keep other than the us dollar
currency_symbols = ['\u20AC', '\u20BD', '\u20B9', '\u00A3', 'CHF']  # other than the USD, we keep the Euro, Russian Ruble, Indian Rupee, British Pound, and Swiss Franc symbols.
special_chars_to_keep = ['!', '?', '"', '*', '$', '%'] + currency_symbols

# Create a pattern for punctuation characters to remove
punct_to_remove = ''.join([ch for ch in string.punctuation if ch not in special_chars_to_keep])
punct_pattern = re.compile(f"[{re.escape(punct_to_remove)}]")

def remove_punctuation(text):
    """Replace ellipses with the marker 'ELIPSIS' and blank out unwanted punctuation.

    Three or more consecutive dots, or the single '…' character, become ' ELIPSIS '
    (space-padded so the marker is always its own token, never glued to a neighbouring
    word). Every other character of `punct_to_remove` is replaced by a single space.
    Whitespace is NOT collapsed here.
    """
    text = re.sub(r'(\.)\1\1+|…', ' ELIPSIS ', text)
    return re.sub(punct_pattern, ' ', text)

escape_char_patt = re.compile(r"[" + ''.join(map(re.escape, ['\n', '\t', '\\', '/'])) + "]")
def remove_escape_chars(text):
    """Replace each newline, tab, backslash and forward slash in `text` with a space."""
    return escape_char_patt.sub(' ', text)

exc_whitespace_pat = re.compile(r'\s+')
def remove_excessive_whitespace(text):
    """Collapse every run of whitespace to one space and trim both ends of `text`."""
    collapsed = exc_whitespace_pat.sub(' ', text)
    return collapsed.strip()

# Pre-compiled once (with a raw-string pattern) for efficiency reasons - this pattern is going to be
# applied many thousand times as part of our preprocessing efforts.
rep_chars = re.compile(r'(.)\1{2,}')
def replace_repetitive_chars(text):
    """Shorten every run of 3+ identical characters in `text` to exactly two of them."""
    return rep_chars.sub(r'\1\1', text)


In [None]:
def initial_feature_extraction(text):
    """Run the spaCy pipeline `nlp` on `text` and count POS tags and named entities.

    Returns `(features, doc)`:
      - `features` maps '<POS>_instances' to the number of tokens with that coarse POS
        tag (only tags that occur in `text` get a key) and '<LABEL>_instances' to the
        number of entities with that label (every label of the pipeline's "ner"
        component gets a key, 0 when absent);
      - `doc` is the spaCy document produced for `text`.
    """
    # Use spaCy for tokenization, POS tagging, NER, and lemmatization
    doc = nlp(text)
    features = {}

    # POS tags: one entry per tag actually present in the document
    for pos, count in Counter(token.pos_ for token in doc).items():
        features[f'{pos}_instances'] = count

    # Named entities: start every known label at zero, then tally the entities found
    ner_counts = dict.fromkeys(nlp.get_pipe("ner").labels, 0)
    for ent in doc.ents:
        label = ent.label_
        if label in ner_counts:
            ner_counts[label] += 1
    for label, count in ner_counts.items():
        features[f'{label}_instances'] = count

    return features, doc

def preprocess(text, features):
    """Clean one raw text and record surface features about it.

    `features` (a dict) is updated in place and returned. Keys added, in this order:
    'upper_words', 'tweet_length', 'no_sentences', 'has_username', 'has_hashtag',
    'ELIPSIS_instances', 'light_clean_text' (text after the light cleaning steps) and
    'clean_text' (fully cleaned, spell-corrected and lemmatized text).
    """
    # Count ellipses ('...' or longer, or the single '…' character) on the RAW text.
    # Previously the literal marker 'ELIPSIS' was counted in the lower-cased text before
    # any ellipsis had been replaced by it, so the feature was always 0.
    ellipsis_count = len(re.findall(r'\.{3,}|…', text))

    # remove non-ASCII characters - some tweets may contain bizarre characters
    # NOTE(review): this also strips every emoji before the emoji replacement below,
    # since `emojis_pattern` only contains non-ASCII ranges - confirm the intended order.
    text = remove_non_ascii(text)

    # remove urls and html - happens very early on to make sure it is not influenced by number or
    # short word or symbol removal
    text = remove_urls(text)
    text = remove_html(text)

    # Drop numbers
    text = drop_numbers(text)

    # Backticks are used as apostrophes in many contractions of the raw data
    text = text.replace('`', "'")

    # converting to lower case and counting the frequency of fully upper-cased words
    text, upper_words, tweet_length, no_sentences = convert_to_lower_case(text)
    features['upper_words'] = upper_words
    features['tweet_length'] = tweet_length
    features['no_sentences'] = no_sentences

    # replacing contractions [AFTER we've converted to lower case!]
    text = replace_contractions(text)

    # replacing emoticons and emojis
    text = re.sub(emoticons_pattern, replace_emoticons, text)
    text = re.sub(emojis_pattern, replace_emojis, text)

    # removing and flagging the existence of usernames and hashtags
    text, has_username = remove_usernames(text)
    features['has_username'] = has_username
    text, has_hashtag = remove_hashtags(text)
    features['has_hashtag'] = has_hashtag

    # number of ellipses in the original text (computed at the top of this function)
    features['ELIPSIS_instances'] = ellipsis_count

    ### ADDED: SIMPLER TEXT PREPROCESSING CASE ###
    features['light_clean_text'] = text
    ###                     ###                ###

    # Remove short words - this is prior to fixing spelling, as it could catch utterances like 'ur' which
    # are slang and typically abbreviations of pronouns or similar linguistic features, that may not bear
    # much meaning.
    text = remove_short_words(text)

    # Fix spelling - doing this here so it does not interfere with usernames, hashtags, etc.
    text = fix_spelling(text)

    # Remove irrelevant/special punctuation chars.
    text = remove_punctuation(text)

    # Remove escaped chars.
    text = remove_escape_chars(text)

    # Remove excessive whitespace.
    text = remove_excessive_whitespace(text)

    # Replace any repeated (3+) chars with double utterances.
    text = replace_repetitive_chars(text)

    # Lemmatization - a new doc is created after all previous transformations have taken
    # place so it is up-to-date.
    doc = nlp(text)
    text = ' '.join([token.lemma_ for token in doc])

    features['clean_text'] = text
    return features

In [None]:
# Apply tqdm to pandas
tqdm.pandas()

# Function to preprocess a single text entry
def preprocess_single_entry(text):
    """Extract the spaCy-based count features for `text`, run the cleaning pipeline on it,
    and return all resulting features as one pandas Series."""
    extracted_features = initial_feature_extraction(text)[0]  # the spaCy doc is not needed here
    all_features = preprocess(text, extracted_features)
    return pd.Series(all_features)

# Preprocess in batches and use parallel processing for efficiency
batch_size = 10000  # we can adjust batch size based on memory and performance
n_batches = (len(df) + batch_size - 1) // batch_size  # ceiling division
#n_batches = 1  # this line is just to be used when testing (while commenting out the previous line)

# List to store the preprocessed data (one pandas Series per input text, in input order)
preprocessed_data = []

# Each occurrence of the task of pre-processing different text sample inputs is independent from each other.
# Thus we can perform the corresponding computations on different cores simultaneously to maximize the
# efficiency with which we utilize our computational resources.
for batch_num in tqdm(range(n_batches), desc="Processing batches"):
    batch_start = batch_num * batch_size
    batch_end = min(batch_start + batch_size, len(df))
    batch_texts = df['text'].iloc[batch_start:batch_end].tolist()

    # Apply the preprocess function in parallel for the current batch
    batch_preprocessed = Parallel(n_jobs=-1, backend="multiprocessing")(
        delayed(preprocess_single_entry)(text) for text in batch_texts
    )

    # Collect the batch results
    preprocessed_data.extend(batch_preprocessed)


preprocessed_data_df = pd.DataFrame(preprocessed_data)

# Ensure the index of preprocessed_data_df matches the index of df. Important because it will be used for joins !!!
# The explicit length check replaces a bare `except:` around the index assignment: it still fails loudly
# when samples were lost during preprocessing, but no longer swallows unrelated errors or interrupts.
#df = df[:batch_size]  # !!! LINE FOR TESTING PURPOSES !!!
if len(preprocessed_data_df) != len(df):
    raise ValueError("Index mismatch between preprocessed_data_df and df. I have missed samples in the process.")
preprocessed_data_df.index = df.index

# Combine the cleaned text and features into the DataFrame
data = pd.concat([df, preprocessed_data_df], axis=1)
# Display the resulting DataFrame (rich display as the cell's last expression)
data.head()

Processing batches: 100%|██████████| 42/42 [12:21<00:00, 17.65s/it]


                                                text  label  PRON_instances  \
0  ive enjoyed being able to slouch about relax a...      0             5.0   
1                         i dont know i feel so lost      0             2.0   
2         i was beginning to feel quite disheartened      0             1.0   
3  i can still lose the weight without feeling de...      0             1.0   
4  im feeling a little like a damaged tree and th...      0             2.0   

   AUX_instances  VERB_instances  ADJ_instances  PART_instances  \
0            4.0             8.0            5.0             2.0   
1            1.0             2.0            1.0             1.0   
2            1.0             2.0            1.0             1.0   
3            1.0             3.0            NaN             NaN   
4            2.0             2.0            3.0             NaN   

   ADP_instances  CCONJ_instances  ADV_instances  ...  has_hashtag  \
0            4.0              3.0            4.0  ..

In [None]:
data.columns

Index(['text', 'label', 'PRON_instances', 'AUX_instances', 'VERB_instances',
       'ADJ_instances', 'PART_instances', 'ADP_instances', 'CCONJ_instances',
       'ADV_instances', 'DET_instances', 'NOUN_instances', 'PROPN_instances',
       'CARDINAL_instances', 'DATE_instances', 'EVENT_instances',
       'FAC_instances', 'GPE_instances', 'LANGUAGE_instances', 'LAW_instances',
       'LOC_instances', 'MONEY_instances', 'NORP_instances',
       'ORDINAL_instances', 'ORG_instances', 'PERCENT_instances',
       'PERSON_instances', 'PRODUCT_instances', 'QUANTITY_instances',
       'TIME_instances', 'WORK_OF_ART_instances', 'upper_words',
       'tweet_length', 'no_sentences', 'has_username', 'has_hashtag',
       'ELIPSIS_instances', 'light_clean_text', 'clean_text',
       'SCONJ_instances', 'NUM_instances', 'X_instances', 'INTJ_instances',
       'PUNCT_instances', 'SYM_instances'],
      dtype='object')

In [None]:
# A '<POS>_instances' column is only filled for rows in which that POS tag occurs, so the
# combined frame contains NaN wherever a tag was absent; treat those as a count of 0.
data = data.fillna(0)

In [None]:
# Delete all rows whose 'clean_text' is the empty string - could be the case with tweets
# only containing a single url, which leave nothing behind after cleaning.
data = data[data['clean_text'] != '']
data.head()

Unnamed: 0,text,label,PRON_instances,AUX_instances,VERB_instances,ADJ_instances,PART_instances,ADP_instances,CCONJ_instances,ADV_instances,...,has_hashtag,ELIPSIS_instances,light_clean_text,clean_text,SCONJ_instances,NUM_instances,X_instances,INTJ_instances,PUNCT_instances,SYM_instances
0,ive enjoyed being able to slouch about relax a...,0,5.0,4.0,8.0,5.0,2.0,4.0,3.0,4.0,...,False,0,ive enjoyed being able to slouch about relax a...,I ve enjoy be able such about relax and wind a...,0.0,0.0,0.0,0.0,0.0,0.0
1,i dont know i feel so lost,0,2.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,...,False,0,i dont know i feel so lost,do not know feel lose,0.0,0.0,0.0,0.0,0.0,0.0
2,i was beginning to feel quite disheartened,0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,...,False,0,i was beginning to feel quite disheartened,be begin feel quite disheartened,0.0,0.0,0.0,0.0,0.0,0.0
3,i can still lose the weight without feeling de...,0,1.0,1.0,3.0,0.0,0.0,1.0,0.0,1.0,...,False,0,i can still lose the weight without feeling de...,can still lose the weight without feel deprive,0.0,0.0,0.0,0.0,0.0,0.0
4,im feeling a little like a damaged tree and th...,0,2.0,2.0,2.0,3.0,0.0,3.0,1.0,0.0,...,False,0,im feeling a little like a damaged tree and th...,feel little like damaged tree and that root be...,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
data.head()

Unnamed: 0,text,label,PRON_instances,AUX_instances,VERB_instances,ADJ_instances,PART_instances,ADP_instances,CCONJ_instances,ADV_instances,...,has_hashtag,ELIPSIS_instances,light_clean_text,clean_text,SCONJ_instances,NUM_instances,X_instances,INTJ_instances,PUNCT_instances,SYM_instances
0,ive enjoyed being able to slouch about relax a...,0,5.0,4.0,8.0,5.0,2.0,4.0,3.0,4.0,...,False,0,ive enjoyed being able to slouch about relax a...,I ve enjoy be able such about relax and wind a...,0.0,0.0,0.0,0.0,0.0,0.0
1,i dont know i feel so lost,0,2.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,...,False,0,i dont know i feel so lost,do not know feel lose,0.0,0.0,0.0,0.0,0.0,0.0
2,i was beginning to feel quite disheartened,0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,1.0,...,False,0,i was beginning to feel quite disheartened,be begin feel quite disheartened,0.0,0.0,0.0,0.0,0.0,0.0
3,i can still lose the weight without feeling de...,0,1.0,1.0,3.0,0.0,0.0,1.0,0.0,1.0,...,False,0,i can still lose the weight without feeling de...,can still lose the weight without feel deprive,0.0,0.0,0.0,0.0,0.0,0.0
4,im feeling a little like a damaged tree and th...,0,2.0,2.0,2.0,3.0,0.0,3.0,1.0,0.0,...,False,0,im feeling a little like a damaged tree and th...,feel little like damaged tree and that root be...,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Turn all float columns to integer (the count features became float only because of the NaNs filled earlier).
float_cols = data.select_dtypes(include=['float64']).columns
data = data.astype({float_col: 'int64' for float_col in float_cols})


In [None]:
data.head()

Unnamed: 0,text,label,PRON_instances,AUX_instances,VERB_instances,ADJ_instances,PART_instances,ADP_instances,CCONJ_instances,ADV_instances,...,has_hashtag,ELIPSIS_instances,light_clean_text,clean_text,SCONJ_instances,NUM_instances,X_instances,INTJ_instances,PUNCT_instances,SYM_instances
0,ive enjoyed being able to slouch about relax a...,0,5,4,8,5,2,4,3,4,...,False,0,ive enjoyed being able to slouch about relax a...,I ve enjoy be able such about relax and wind a...,0,0,0,0,0,0
1,i dont know i feel so lost,0,2,1,2,1,1,0,0,1,...,False,0,i dont know i feel so lost,do not know feel lose,0,0,0,0,0,0
2,i was beginning to feel quite disheartened,0,1,1,2,1,1,0,0,1,...,False,0,i was beginning to feel quite disheartened,be begin feel quite disheartened,0,0,0,0,0,0
3,i can still lose the weight without feeling de...,0,1,1,3,0,0,1,0,1,...,False,0,i can still lose the weight without feeling de...,can still lose the weight without feel deprive,0,0,0,0,0,0
4,im feeling a little like a damaged tree and th...,0,2,2,2,3,0,3,1,0,...,False,0,im feeling a little like a damaged tree and th...,feel little like damaged tree and that root be...,1,0,0,0,0,0


In [None]:
# Now I create a new bigquery table with the preprocessed dataframe, called not `c_emotions` but `c_prep_emotions`.
# This table we will create not by uploading a .csv file in a bucket of our bigquery project, but by directly 'feeding' the dataframe into a bigquery table
def load_to_bigquery(df, table_ref):
    """Load `df` into the BigQuery table referenced by `table_ref` and wait for completion.

    Relies on the module-level `client` and `job_config`, which must both be defined
    before this function is called.
    """
    load_job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    load_job.result()  # blocks until the load job has finished (raises if it failed)
    print(f'Loaded DataFrame into {table_ref.table_id}.')


In [None]:
# Google Cloud project id and dataset information
project_id = 'nlp-project-427710'
dataset_id = 'person_emotion'
table_id = 'c_prep_emotions1'

# Initializing the Cloud Storage client
# NOTE(review): `storage_client` is not used in the cells visible here - confirm it is still needed.
from google.cloud import storage
storage_client = storage.Client(project=project_id, credentials=credentials)

# Creating a reference to the bigquery dataset.
# `Client.dataset()` is deprecated; build the DatasetReference directly, using the
# client's own project exactly as `client.dataset(dataset_id)` did.
dataset_ref = bigquery.DatasetReference(client.project, dataset_id)


In [None]:
### DEFINING AND CHECKING THE SCHEMA
# Columns specific to this dataset
case_specific = [("text", "STRING"), ("label", "INTEGER"), ("clean_text", "STRING"), ("light_clean_text", "STRING")]

# Every '<something>_instances' count column produced by preprocessing, plus the three surface counts
count_columns = [col for col in preprocessed_data_df.columns if '_instanc' in col]
extra_features_numeric = [
    (f"{col}", "INTEGER")
    for col in count_columns + ['upper_words', 'tweet_length', 'no_sentences']
]
extra_features_boolean = [(f"{col}", "BOOL") for col in ['has_username', 'has_hashtag']]

# Generate the SchemaField objects from the (name, type) pairs
schema = [
    bigquery.SchemaField(name, dtype)
    for name, dtype in case_specific + extra_features_numeric + extra_features_boolean
]

job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,  # Overwrite the table if it exists
)

# Print the schema for a visual check
for field in schema:
    print(f"Field: {field.name}, Type: {field.field_type}")

Field: text, Type: STRING
Field: label, Type: INTEGER
Field: clean_text, Type: STRING
Field: light_clean_text, Type: STRING
Field: PRON_instances, Type: INTEGER
Field: AUX_instances, Type: INTEGER
Field: VERB_instances, Type: INTEGER
Field: ADJ_instances, Type: INTEGER
Field: PART_instances, Type: INTEGER
Field: ADP_instances, Type: INTEGER
Field: CCONJ_instances, Type: INTEGER
Field: ADV_instances, Type: INTEGER
Field: DET_instances, Type: INTEGER
Field: NOUN_instances, Type: INTEGER
Field: PROPN_instances, Type: INTEGER
Field: CARDINAL_instances, Type: INTEGER
Field: DATE_instances, Type: INTEGER
Field: EVENT_instances, Type: INTEGER
Field: FAC_instances, Type: INTEGER
Field: GPE_instances, Type: INTEGER
Field: LANGUAGE_instances, Type: INTEGER
Field: LAW_instances, Type: INTEGER
Field: LOC_instances, Type: INTEGER
Field: MONEY_instances, Type: INTEGER
Field: NORP_instances, Type: INTEGER
Field: ORDINAL_instances, Type: INTEGER
Field: ORG_instances, Type: INTEGER
Field: PERCENT_insta

In [None]:
# Upload the preprocessed frame to the `table_id` table of the dataset referenced by `dataset_ref`.
load_to_bigquery(data, dataset_ref.table(table_id))

Loaded DataFrame into c_prep_emotions1.
