In [40]:
import csv
import sys
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Uncomment the downloads below if these NLTK resources are not installed yet
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# English stop words from NLTK's stopwords corpus, kept in a set so the
# `word not in stop_words` checks further down are cheap.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance shared by the preprocessing functions.
lemmatizer = WordNetLemmatizer()


# Increase the maximum field size limit as far as the platform allows.
# sys.maxsize does not fit in a C long on some platforms (e.g. Windows), where
# csv.field_size_limit raises OverflowError, so back off until a value is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10


def _read_csv_rows(path):
    """Return every row of the UTF-8 CSV file at `path` as a list of string lists."""
    # newline='' is what the csv module asks for: without it, line breaks
    # embedded in quoted fields are translated before the parser sees them.
    with open(path, 'r', encoding='utf-8', newline='') as file:
        return list(csv.reader(file))


trainData = _read_csv_rows('fulltrain.csv')
df = pd.DataFrame(trainData, columns=['Label', 'Text'])


testData = _read_csv_rows('balancedtest.csv')

# Convert to DataFrame
df2 = pd.DataFrame(testData, columns=['Label', 'Text'])

# Row counts of the training and test frames.
print(df.shape[0])
print(df2.shape[0])

48854
3000


## Lowercase Only

In [41]:
# Preprocessing way 1: lowercase only
def preprocess_text(text):
    """Return `text` converted to lowercase; nothing else is changed."""
    lowered = text.lower()
    return lowered

In [42]:
# Way 1 export: add the lowercased text as 'text_lowercase' to a copy of each
# frame and write the copies to CSV without the index.
dfl = df.assign(text_lowercase=df['Text'].apply(preprocess_text))
dfl.to_csv('way1_train.csv', index=False)

dfl2 = df2.assign(text_lowercase=df2['Text'].apply(preprocess_text))
dfl2.to_csv('way1_test.csv', index=False)

## Uppercase only

In [43]:
# Preprocessing way 2: uppercase only
def preprocess_text(text):
    """Return `text` converted to uppercase; nothing else is changed."""
    uppered = text.upper()
    return uppered

In [44]:
# Way 2 export: add the uppercased text as 'text_uppercase' to a copy of each
# frame and write the copies to CSV without the index.
dfu = df.assign(text_uppercase=df['Text'].apply(preprocess_text))
dfu.to_csv('way2_train.csv', index=False)

dfu2 = df2.assign(text_uppercase=df2['Text'].apply(preprocess_text))
dfu2.to_csv('way2_test.csv', index=False)

## Lowercase + lemmatized

In [47]:
# Preprocessing way 3: lowercase + lemmatization
def preprocess_text(text):
    """Lowercase `text`, tokenize it, lemmatize every token and re-join.

    Returns a single string in which the lemmatized tokens are separated by
    one space each.
    """
    tokens = word_tokenize(text.lower())
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [48]:
# Way 3 export: add the lowercased + lemmatized text as 'text_way3' to a copy
# of each frame and write the copies to CSV without the index.
dfll = df.assign(text_way3=df['Text'].apply(preprocess_text))
dfll.to_csv('way3_train.csv', index=False)

dfll2 = df2.assign(text_way3=df2['Text'].apply(preprocess_text))
dfll2.to_csv('way3_test.csv', index=False)

## Lowercase + punctuation removed

In [53]:
# Preprocessing way 4: lowercase + punctuation removal
def remove_punc(text):
    """Lowercase and tokenize `text`, trimming punctuation off each token's ends.

    Only leading and trailing characters from `string.punctuation` are
    stripped, so punctuation inside a token is kept. Tokens that become empty
    after trimming are dropped.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    tokens = word_tokenize(text.lower())
    trimmed = (token.strip(string.punctuation) for token in tokens)
    return [token for token in trimmed if token]


In [50]:
# Way 4 export: add the token lists produced by remove_punc as 'text_way4' to a
# copy of each frame and write the copies to CSV without the index.
dflp = df.assign(text_way4=df['Text'].apply(remove_punc))
dflp.to_csv('way4_train.csv', index=False)

dflp2 = df2.assign(text_way4=df2['Text'].apply(remove_punc))
dflp2.to_csv('way4_test.csv', index=False)

# Row counts of the source frames.
print(len(df))
print(len(df2))

48854
3000


## Lowercase + lemmatized + removal of stopwords and punctuation

In [54]:
# Preprocessing way 5: lowercase + lemmatization + stop-word removal + punctuation removal
def preprocess_text(text):
    """Lowercase `text`, strip punctuation, drop stop words and lemmatize.

    `remove_punc` does the lowercasing, tokenizing and punctuation trimming
    and hands back a flat list of token strings. That list is walked token by
    token: looping one level deeper would iterate over the individual
    characters of each token, so stop-word filtering and lemmatization would
    run on single letters instead of words.

    Args:
        text: The raw document string.

    Returns:
        A list of lemmatized tokens, in order, excluding every token found in
        `stop_words`.
    """
    tokens = remove_punc(text)
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

In [55]:
# Way 5 export: add the output of the way-5 preprocess_text as 'text_way5' to a
# copy of each frame and write the copies to CSV without the index.
dflsp = df.assign(text_way5=df['Text'].apply(preprocess_text))
dflsp.to_csv('way5_train.csv', index=False)

dflsp2 = df2.assign(text_way5=df2['Text'].apply(preprocess_text))
dflsp2.to_csv('way5_test.csv', index=False)


## Lowercase + lemmatized + removal of stopwords, punctuation, and numbers

In [56]:
# Preprocessing way 6: lowercase + lemmatization + removal of stop words, punctuation and numbers
def preprocess_text(text):
    """Lowercase `text`, strip punctuation and digits, drop stop words, lemmatize.

    `remove_punc` returns a flat list of token strings, so each token is
    processed as a whole word. Iterating one level deeper would walk the
    characters of every token and apply the digit stripping, stop-word check
    and lemmatization to single letters.

    Per token, in this order:
      1. every run of digits is removed (a token made only of digits vanishes);
      2. the remainder is skipped if it is empty or in `stop_words`;
      3. the remainder is lemmatized.

    Args:
        text: The raw document string.

    Returns:
        A list of the lemmatized, digit-free, non-stop-word tokens in order.
    """
    result = []
    for token in remove_punc(text):
        # Digits are removed before the stop-word check so that e.g. a token
        # reduced to a stop word by digit stripping is filtered as well.
        word = re.sub(r'\d+', '', token)
        if word and word not in stop_words:
            result.append(lemmatizer.lemmatize(word))
    return result

In [57]:
# Way 6 export: add the output of the way-6 preprocess_text as 'text_way6' to a
# copy of each frame and write the copies to CSV without the index.
dfspn = df.assign(text_way6=df['Text'].apply(preprocess_text))
dfspn.to_csv('way6_train.csv', index=False)

dfspn2 = df2.assign(text_way6=df2['Text'].apply(preprocess_text))
dfspn2.to_csv('way6_test.csv', index=False)


### Additional functions coming soon...

In [31]:
# Split both frames into model inputs ('New_Text') and targets ('category').
# NOTE(review): train_data / test_data are not created in any visible cell --
# confirm they exist before this cell on a fresh Restart & Run All.
X_train, X_test = (frame['New_Text'] for frame in (train_data, test_data))

y_train, y_test = (frame['category'] for frame in (train_data, test_data))

In [37]:
# Persist both frames with pandas' default CSV options (index=False is not
# passed here, so the index column is written as well).
for _frame, _csv_name in ((train_data, 'train_data.csv'), (test_data, 'test_data.csv')):
    _frame.to_csv(_csv_name)

In [32]:
#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary (at most 10000 features) from the training texts
# only, then project the test texts onto that same vocabulary. Both sparse
# results are converted to dense arrays.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_train_tfidf = X_train_tfidf.toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test)
X_test_tfidf = X_test_tfidf.toarray()


In [33]:
# TF-IDF-weighted GloVe features
# Location of the GloVe embedding text file.
# NOTE(review): hard-coded absolute path that looks like a Colab Google Drive
# mount -- confirm and adjust when running in another environment.
# The file name suggests 300-dimensional vectors -- TODO confirm.
glove_path = "/content/drive/MyDrive/dsml/cs4248/as2/glove.6B.300d.txt"

def load_glove_dict(path):
    """Load whitespace-separated word embeddings from a text file.

    Every non-blank line is split on whitespace; the first field is the word
    and the remaining fields are parsed as the float32 components of its
    vector. Blank or whitespace-only lines are skipped rather than crashing
    on the missing word field. If a word occurs more than once, the last
    occurrence wins.

    Args:
        path: Path to a UTF-8 encoded embedding file.

    Returns:
        A dict mapping each word to a 1-D float32 numpy array.
    """
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line: there is no word to index by.
                continue
            embeddings_dict[values[0]] = np.asarray(values[1:], dtype="float32")
    return embeddings_dict

# Read the whole embedding file into an in-memory dict once; it is passed to
# combine_glove_tfidf for both the training and the test texts.
glove_dict = load_glove_dict(glove_path)

def combine_glove_tfidf(texts, tfidf, tfidf_vectorizer, glove_dict, embedding_dim=None):
    """Build one TF-IDF-weighted average embedding per document.

    Each document is split on whitespace. A token contributes
    `glove_dict[token] * tfidf[i, column]` when it is present both in
    `glove_dict` and in the vectorizer's vocabulary; any other token
    contributes a zero vector. The document vector is the mean over ALL of its
    tokens, so tokens without a contribution still count in the denominator.
    Documents with no contributing token (including empty ones) stay all-zero.

    Args:
        texts: Iterable of document strings with a length; row `i` of `tfidf`
            must belong to the i-th document.
        tfidf: 2-D TF-IDF matrix indexable as `tfidf[row, column]`.
        tfidf_vectorizer: Fitted vectorizer exposing `vocabulary_`
            (token -> column index).
        glove_dict: Mapping token -> embedding vector.
        embedding_dim: Width of the embedding vectors. When None it is taken
            from the first vector in `glove_dict`, falling back to 300 if the
            mapping is empty.

    Returns:
        A float array of shape `(len(texts), embedding_dim)`.
    """
    if embedding_dim is None:
        first_vector = next(iter(glove_dict.values()), None)
        # 300 keeps the previously hard-coded width when nothing can be inferred.
        embedding_dim = 300 if first_vector is None else len(first_vector)

    vocabulary = tfidf_vectorizer.vocabulary_
    vectors = np.zeros((len(texts), embedding_dim))
    for i, text in enumerate(texts):
        tokens = text.split()
        token_vectors = np.zeros((len(tokens), embedding_dim))
        for j, token in enumerate(tokens):
            glove_vector = glove_dict.get(token)
            tfidf_index = vocabulary.get(token)
            if glove_vector is None or tfidf_index is None:
                continue
            token_vectors[j] = glove_vector * tfidf[i, tfidf_index]
        # .any() is False for an empty array too, which avoids taking the mean
        # of zero rows for empty documents.
        if token_vectors.any():
            vectors[i] = np.mean(token_vectors, axis=0)
    return vectors

# Build the TF-IDF-weighted GloVe document vectors for the training split
# first, then for the test split, reusing the same vectorizer and embeddings.
X_train_combined, X_test_combined = (
    combine_glove_tfidf(texts, weights, tfidf_vectorizer, glove_dict)
    for texts, weights in ((X_train, X_train_tfidf), (X_test, X_test_tfidf))
)

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit the label encoder on the training labels only and reuse it for the test
# labels so both splits share one label -> integer mapping.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Pin the one-hot width to the number of classes seen in training. Left to
# infer it, to_categorical uses max(label) + 1 per call, so the test matrix
# would come out narrower whenever the highest class id is absent from it.
num_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_categorical = to_categorical(y_test_encoded, num_classes=num_classes)