In [2]:
import sqlite3
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# In this notebook we only read, clean and save data as csv. 
We don't tokenise it because, with a tokenised column, the data would take more than 100 MB and we would not be able to save it to git.

Also because there are multiple ways to tokenise data.

So it's up to every separate model training file to tokenise the data.

## 1. Read data

In [3]:
# Path to the SQLite database.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
DB_FILE = '/local/DSPT/data/nlp-data.db'

# Join every document with its raw text and its label.
QUERY = """
    SELECT 
        Documents.DocumentID, 
        RawTexts.Text, 
        Labels.NumericValue AS Label, 
        Labels.Type AS LabelType, 
        Labels.StringValue AS LabelName,  
        Documents.Type AS DocumentType,
        RawTexts.LengthCharacters,
        RawTexts.HasEmoji
    FROM Documents
    INNER JOIN RawTexts ON Documents.RawTextID = RawTexts.RawTextID
    INNER JOIN Labels ON Documents.LabelID = Labels.LabelID;
"""

# Execute the query and load the result into a DataFrame.
# try/finally guarantees the connection is closed even if the read fails;
# no cursor is created because pandas drives the connection itself.
con = sqlite3.connect(DB_FILE)
try:
    reviews = pd.read_sql_query(QUERY, con)
finally:
    con.close()

In [4]:
# Display the DataFrame loaded from the database
reviews

Unnamed: 0,DocumentID,Text,Label,LabelType,LabelName,DocumentType,LengthCharacters,HasEmoji
0,1,Arrived broken. Manufacturer defect. Two of th...,1,StarRating,1 Star,Review,612,0
1,2,the cabinet dot were all detached from backing...,1,StarRating,1 Star,Review,56,0
2,3,I received my first order of this product and ...,1,StarRating,1 Star,Review,206,0
3,4,This product is a piece of shit. Do not buy. D...,1,StarRating,1 Star,Review,145,0
4,5,went through 3 in one day doesn't fit correct ...,1,StarRating,1 Star,Review,91,0
...,...,...,...,...,...,...,...,...
255078,255079,@Racalto_SK ok good to know. Punting at MetLif...,1,Sentiment,Positive,Tweet,94,0
255079,255080,everyone who sat around me at metlife was so a...,0,Sentiment,Neutral,Tweet,99,0
255080,255081,what giants or niners fans would wanna go to t...,0,Sentiment,Neutral,Tweet,78,0
255081,255082,Anybody want a ticket for tomorrow Colombia vs...,1,Sentiment,Positive,Tweet,63,0


## 2. Clean data

In [36]:
def clean_text(text):
    """Normalise a raw document string for downstream modelling.

    Steps, in order: strip HTML-like tags, lowercase, remove digits, remove
    punctuation and underscores, then collapse whitespace runs to a single
    space and trim the ends.

    Parameters
    ----------
    text : str or None
        Raw text. Any non-string value (e.g. ``None`` or NaN from a NULL
        database field) is treated as missing.

    Returns
    -------
    str or None
        The cleaned text (possibly the empty string), or ``None`` when the
        input is not a string.
    """
    # Missing values cannot be cleaned; return None so the caller can drop them.
    if not isinstance(text, str):
        return None

    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    # Remove punctuation; '_' counts as a word character for \w, so it is
    # listed explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)
    # Normalise whitespace last, so gaps left by the removals above are
    # collapsed and no leading/trailing space survives.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean the text data
reviews['cleaned_text'] = reviews['Text'].apply(clean_text)

# Drop rows whose cleaned text is missing (None/NaN) or empty.
# The explicit .copy() makes the result an independent frame, so adding
# columns to it later cannot trigger pandas' SettingWithCopyWarning.
reviews = reviews.dropna(subset=['cleaned_text'])
reviews = reviews[reviews['cleaned_text'] != ''].copy()

### 3. Map Amazon rating to -1/0/1

In [38]:
def map_amazon_rating(rating):
    """Collapse a star rating into a three-way sentiment value.

    Ratings of 2 or lower give -1, a rating of exactly 3 gives 0, and
    anything else gives 1.
    """
    if rating <= 2:
        return -1
    return 0 if rating == 3 else 1

# Star ratings are collapsed to -1/0/1 via map_amazon_rating; every other
# label type keeps its original value. This is done column-wise instead of
# with a row-wise apply(axis=1), which is slow on large frames and returns a
# DataFrame (not a Series) when the frame is empty.
is_star_rating = reviews['LabelType'] == 'StarRating'
reviews['LabelMapped'] = reviews['Label'].mask(
    is_star_rating, reviews['Label'].map(map_amazon_rating)
)

In [39]:
# List the distinct values present in the LabelMapped column
reviews['LabelMapped'].unique()

array([-1,  0,  1])

### 4. Save to csv

In [40]:
# Write the prepared frame to disk; the row index is left out of the file.
OUTPUT_CSV = 'data/prepared_data.csv'
reviews.to_csv(OUTPUT_CSV, index=False)