In [11]:
import sqlite3
import pandas as pd
import re
import nltk
import numpy as np
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

In [12]:
# Lift every pandas display limit (columns, rows, cell width, console width)
# so frames are rendered in full.
for display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.max_colwidth',
    'display.width',
):
    pd.set_option(display_option, None)

# Print NumPy arrays in full instead of summarising them with "...".
np.set_printoptions(threshold=np.inf)

# In this notebook we only read, clean and save data as csv. 
We don't tokenise it here because a tokenised column would push the data over 100 MB, and we would not be able to save it to git.

Another reason is that there are multiple ways to tokenise the data.

So it's up to every separate model training file to tokenise the data.

## 1. Read data

In [15]:
# Location of the SQLite database that holds the raw documents and labels
DB_FILE = '/local/DSPT/data/nlp-data.db'

# Define the SQL query: one row per document, joined with its raw text and label
QUERY = """
    SELECT 
        Documents.DocumentID, 
        RawTexts.Text, 
        Labels.NumericValue AS Label, 
        Labels.Type AS LabelType, 
        Labels.StringValue AS LabelName,  
        Documents.Type AS DocumentType,
        RawTexts.LengthCharacters,
        RawTexts.HasEmoji
    FROM Documents
    INNER JOIN RawTexts ON Documents.RawTextID = RawTexts.RawTextID
    INNER JOIN Labels ON Documents.LabelID = Labels.LabelID;
"""

# Execute the query and load data into a DataFrame.
# try/finally guarantees the connection is closed even if the query fails,
# so a failed run does not leave an open handle behind in the kernel.
con = sqlite3.connect(DB_FILE)
try:
    reviews = pd.read_sql_query(QUERY, con)
finally:
    con.close()

## 2. Clean data

In [16]:
def clean_text(text):
    """Normalise a raw review string.

    The steps, in order: strip HTML tags, lowercase, remove digits, remove
    punctuation and underscores, then collapse runs of whitespace and trim.

    Args:
        text: Raw text. Non-string values (e.g. None or NaN coming from a NULL
            database field) are accepted.

    Returns:
        The cleaned string (possibly empty), or None when ``text`` is not a
        string so that such rows can be dropped afterwards.
    """
    if not isinstance(text, str):
        return None
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    # Remove punctuation; "_" counts as a word character for \w, so it is
    # listed explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)
    # Whitespace is normalised last so that the removals above cannot leave
    # doubled, leading or trailing spaces behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add a cleaned copy of every raw text
reviews['cleaned_text'] = reviews['Text'].apply(clean_text)

# Drop rows whose cleaned text is either empty or missing (None/NaN)
cleaned_column = reviews['cleaned_text']
has_usable_text = (cleaned_column != '') & cleaned_column.notna()
reviews = reviews[has_usable_text]

### 3. Map amazon rating to -1/0/1

In [17]:
def map_amazon_rating(rating):
    """Map a star rating onto a three-way sentiment label.

    Args:
        rating: Numeric star rating.

    Returns:
        -1 when the rating is at most 2, 0 when it equals 3, and 1 otherwise.
    """
    if rating == 3:
        return 0
    return -1 if rating <= 2 else 1

def _mapped_label(row):
    """Return the -1/0/1 label for one row.

    Rows whose LabelType is 'StarRating' are converted with
    map_amazon_rating; every other row keeps its Label unchanged.
    """
    if row['LabelType'] == 'StarRating':
        return map_amazon_rating(row["Label"])
    return row["Label"]


reviews['LabelMapped'] = reviews.apply(_mapped_label, axis=1)

In [18]:
# Sanity check: list the distinct values present in the mapped label column
pd.unique(reviews['LabelMapped'])

array([-1,  0,  1])

### 4. Add emoticons as columns

In [20]:
# Output column -> literal emoticon looked for in the raw (uncleaned) text
EMOTICON_COLUMNS = {
    'emoticon1': ':-)',
    'emoticon2': ':-/',
    'emoticon3': ':-(',
}

# regex=False matches the emoticon literally, so no backslash escapes are
# needed (":-\)" in a normal string is an invalid escape sequence).
# na=False counts a missing text as "no emoticon" instead of yielding NaN,
# which astype(int) cannot convert.
for column_name, emoticon in EMOTICON_COLUMNS.items():
    reviews[column_name] = (
        reviews['Text'].str.contains(emoticon, regex=False, na=False).astype(int)
    )

### 5. Extract hashtags

### 6. Save to csv

In [21]:
# Write the prepared frame to disk without the DataFrame index column
OUTPUT_CSV = 'data/prepared_data.csv'
reviews.to_csv(OUTPUT_CSV, index=False)