In [15]:
import csv
import sys
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# One-time NLTK resource setup: uncomment the downloads below on an environment
# where these packages are not downloaded yet.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# English stopword list as a set, built once so the membership tests in the
# preprocess_text variants below are cheap.
stop_words = set(stopwords.words('english'))
# Single shared lemmatizer instance, used by the lemmatization variants (way9, way10).
lemmatizer = WordNetLemmatizer()


# Increase the maximum field size limit so very long text fields can be parsed.
# sys.maxsize does not fit into a C long on every platform, in which case
# csv.field_size_limit raises OverflowError; back off until a value is accepted.
_max_field_size = sys.maxsize
while True:
    try:
        csv.field_size_limit(_max_field_size)
        break
    except OverflowError:
        _max_field_size //= 10


def read_csv_rows(path):
    """Return every row of the UTF-8 CSV file at ``path`` as a list of lists.

    The file is opened with ``newline=''`` as the csv module asks for, so line
    breaks embedded inside quoted fields are handed to the parser untouched.
    """
    with open(path, 'r', encoding='utf-8', newline='') as file:
        return list(csv.reader(file))


# Training split; each row is expected to hold exactly two fields (label, text).
trainData = read_csv_rows('fulltrain.csv')
df = pd.DataFrame(trainData, columns=['Label', 'Text'])

# Test split, same two-column layout.
testData = read_csv_rows('balancedtest.csv')

# Convert to DataFrame
df2 = pd.DataFrame(testData, columns=['Label', 'Text'])

print(df.shape[0])
print(df2.shape[0])

48854
3000


In [62]:
# Sanity check: reload one of the pickled DataFrames produced by this notebook
# and print it.
# NOTE(review): 'way11_train.pkl' is written by the way11 cell further down
# (dff.to_pickle), so on a fresh kernel this cell only works if that file
# already exists from an earlier run.
import pickle

file_path = 'way11_train.pkl'

# pickle.load can execute arbitrary code from the file; only load pickles that
# were produced by this notebook or another trusted source.
with open(file_path, 'rb') as file:
    your_object = pickle.load(file)

print(your_object)

      Label                                               Text  \
0         1  A little less than a decade ago, hockey fans w...   
1         1  The writers of the HBO series The Sopranos too...   
2         1  Despite claims from the TV news outlet to offe...   
3         1  After receiving 'subpar' service and experienc...   
4         1  After watching his beloved Seattle Mariners pr...   
...     ...                                                ...   
48849     4  The ruling Kuomintang (KMT) has claimed owners...   
48850     4  The Taipei city government has encouraged the ...   
48851     4  President Ma Ying-jeou said Friday that a park...   
48852     4  The families of the four people who were kille...   
48853     4  The Ministry of Finance will make public on Sa...   

                                      tokenized_sentence  \
0      [A little less than a decade ago, hockey fans ...   
1      [The writers of the HBO series The Sopranos to...   
2      [Despite claims from

## Lowercase + remove stopwords

In [16]:
# way6 
def preprocess_text(text):
    """Split ``text`` into sentences and tokenize each one into lowercase words.

    Tokens whose lowercase form is in ``stop_words`` are dropped. The result is
    a list with one list of remaining lowercase tokens per sentence.
    """
    return [
        [
            lowered
            for lowered in (token.lower() for token in word_tokenize(sentence))
            if lowered not in stop_words
        ]
        for sentence in sent_tokenize(text)
    ]

In [17]:
# Training split: add the plain sentence split and the way6 tokens as new
# columns on a copy of df, then cache the result to disk.
dfl = df.assign(
    tokenized_sentence=df['Text'].apply(sent_tokenize),
    way6_text=df['Text'].apply(preprocess_text),
)
dfl.to_pickle('way6_train.pkl')

# Test split: same processing, written to its own pickle.
dfl2 = df2.assign(
    tokenized_sentence=df2['Text'].apply(sent_tokenize),
    way6_text=df2['Text'].apply(preprocess_text),
)
dfl2.to_pickle('way6_test.pkl')

## Normal case + remove stopwords & punctuation

In [56]:
# way7
def remove_punc(text):
    """Tokenize ``text`` into words and strip punctuation from every token.

    All characters listed in ``string.punctuation`` are deleted from each
    token; tokens that end up empty are left out of the returned list.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    cleaned_tokens = []
    for token in word_tokenize(text):
        stripped = token.translate(strip_table)
        if stripped:
            cleaned_tokens.append(stripped)
    return cleaned_tokens


def preprocess_text(text):
    """Split ``text`` into sentences of punctuation-free, non-stopword tokens.

    Each sentence is tokenized with ``remove_punc``. Tokens keep their original
    casing in the output, but the stopword test is done on the lowercased
    token: NLTK's English stopword list is lowercase, so comparing the raw
    token would let capitalized stopwords such as "The" or "It" slip through.

    Returns a list with one list of kept tokens per sentence.
    """
    new_text = []
    for sentence in sent_tokenize(text):
        new_sentence = [
            word
            for word in remove_punc(sentence)
            if word.lower() not in stop_words
        ]
        new_text.append(new_sentence)
    return new_text

In [57]:
# Training split with the way7 tokens (uses whichever preprocess_text was
# defined most recently in the kernel), cached to disk.
dfb = df.assign(
    tokenized_sentence=df['Text'].apply(sent_tokenize),
    way7_text=df['Text'].apply(preprocess_text),
)
dfb.to_pickle('way7_train.pkl')

# Test split, processed the same way.
dfb2 = df2.assign(
    tokenized_sentence=df2['Text'].apply(sent_tokenize),
    way7_text=df2['Text'].apply(preprocess_text),
)
dfb2.to_pickle('way7_test.pkl')

## Lowercase + remove stopwords & punctuation

In [50]:
# way8
def remove_punc(text):
    """Lowercase ``text``, tokenize it into words and strip punctuation.

    Characters from ``string.punctuation`` are deleted from every token and
    tokens that become empty are discarded.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    stripped_tokens = (
        token.translate(strip_table) for token in word_tokenize(text.lower())
    )
    return [token for token in stripped_tokens if token]


def preprocess_text(text):
    """Return, per sentence of ``text``, its ``remove_punc`` tokens minus stopwords.

    The result is a list of token lists, one per sentence found by
    ``sent_tokenize``.
    """
    def keep_content_words(sentence):
        # Tokens come from remove_punc; anything found in stop_words is dropped.
        return [token for token in remove_punc(sentence) if token not in stop_words]

    return [keep_content_words(sentence) for sentence in sent_tokenize(text)]

In [51]:
# Training split with the way8 tokens, cached to disk.
dfc = df.assign(
    tokenized_sentence=df['Text'].apply(sent_tokenize),
    way8_text=df['Text'].apply(preprocess_text),
)
dfc.to_pickle('way8_train.pkl')

# Test split, processed the same way.
dfc2 = df2.assign(
    tokenized_sentence=df2['Text'].apply(sent_tokenize),
    way8_text=df2['Text'].apply(preprocess_text),
)
dfc2.to_pickle('way8_test.pkl')

## Normal case + remove stopwords & lemmatization

In [39]:
# way9
def preprocess_text(text):
    """Split ``text`` into sentences of lemmatized, non-stopword tokens.

    Tokens keep their original casing, but the stopword test is done on the
    lowercased token: NLTK's English stopword list is lowercase, so comparing
    the raw token would keep capitalized stopwords such as "The" or "It".
    Surviving tokens are passed through ``lemmatizer.lemmatize`` with its
    default arguments.

    Returns a list with one list of lemmatized tokens per sentence.
    """
    new_text = []
    for sentence in sent_tokenize(text):
        new_text.append([
            lemmatizer.lemmatize(word)
            for word in word_tokenize(sentence)
            if word.lower() not in stop_words
        ])
    return new_text

In [40]:
# Training split with the way9 tokens, cached to disk.
dfd = df.assign(
    tokenized_sentence=df['Text'].apply(sent_tokenize),
    way9_text=df['Text'].apply(preprocess_text),
)
dfd.to_pickle('way9_train.pkl')

# Test split, processed the same way.
dfd2 = df2.assign(
    tokenized_sentence=df2['Text'].apply(sent_tokenize),
    way9_text=df2['Text'].apply(preprocess_text),
)
dfd2.to_pickle('way9_test.pkl')

## Normal case + lemmatization

In [44]:
# way10
def preprocess_text(text):
    """Lemmatize every word token of ``text``, sentence by sentence.

    No lowercasing or stopword filtering is applied. Returns a list with one
    list of lemmatized tokens per sentence.
    """
    lemmatize = lemmatizer.lemmatize
    return [
        list(map(lemmatize, word_tokenize(sentence)))
        for sentence in sent_tokenize(text)
    ]

In [45]:
# Training split with the way10 tokens, cached to disk.
dfe = df.assign(
    tokenized_sentence=df['Text'].apply(sent_tokenize),
    way10_text=df['Text'].apply(preprocess_text),
)
dfe.to_pickle('way10_train.pkl')

# Test split, processed the same way.
dfe2 = df2.assign(
    tokenized_sentence=df2['Text'].apply(sent_tokenize),
    way10_text=df2['Text'].apply(preprocess_text),
)
dfe2.to_pickle('way10_test.pkl')

## Normal case + expand contractions

In [60]:
# way11

# Lookup table mapping an English contraction to its expanded form; it is
# passed to expand_contractions_sentence by the way11 preprocess_text.
# Several expansions are ambiguous and list every reading separated by " / "
# (e.g. "he'd" -> "he had / he would"); the whole string, slashes included, is
# what gets substituted into the text.
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"here's": "here is",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

def expand_contractions_sentence(sentence, contractions):
    """Replace every contraction in ``sentence`` with its expansion.

    Matching is case-insensitive and limited to whole words, so "it's" is
    expanded but the "it's" inside "summit's" (or the "he'd" inside "she'd")
    is left alone. All contractions are matched in a single pass with the
    longest alternatives tried first, so "can't've" is expanded as a unit
    instead of being broken up by the shorter "can't". When the matched text
    starts with an uppercase letter, the first letter of the expansion is
    capitalized as well ("Don't" -> "Do not").

    Args:
        sentence: Text to process.
        contractions: Mapping from contraction to the replacement text, which
            is inserted literally.

    Returns:
        The sentence with all known contractions expanded; ``sentence`` itself
        when ``contractions`` is empty.
    """
    if not contractions:
        return sentence

    # Case-insensitive lookup table; setdefault keeps the first entry when two
    # keys differ only by case.
    lookup = {}
    for contraction, expansion in contractions.items():
        lookup.setdefault(contraction.lower(), expansion)

    # Longest first: regex alternation takes the first alternative that
    # matches, so longer contractions must precede their own prefixes.
    alternatives = sorted(lookup, key=len, reverse=True)
    # Lookarounds rather than \b because some keys start with an apostrophe
    # ("'cause"), where \b would demand a word character before it.
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(item) for item in alternatives) + r")(?!\w)",
        re.IGNORECASE,
    )

    def _expand(match):
        found = match.group(0)
        expansion = lookup.get(found.lower())
        if expansion is None:
            # Case-insensitive match whose lowercase form is not a key
            # (unusual Unicode case mappings): leave the text unchanged.
            return found
        if found[:1].isupper() and expansion[:1].islower():
            expansion = expansion[0].upper() + expansion[1:]
        return expansion

    return pattern.sub(_expand, sentence)

def preprocess_text(text):
    """Split ``text`` into sentences and expand the contractions in each one.

    Uses the module-level ``contractions`` table. Returns a list of sentence
    strings (sentences are not tokenized into words here).
    """
    expanded = []
    for sentence in sent_tokenize(text):
        expanded.append(expand_contractions_sentence(sentence, contractions))
    return expanded

In [61]:
# Training split with the way11 (contraction-expanded) sentences, cached to disk.
dff = df.assign(
    tokenized_sentence=df['Text'].apply(sent_tokenize),
    way11_text=df['Text'].apply(preprocess_text),
)
dff.to_pickle('way11_train.pkl')

# Test split, processed the same way.
dff2 = df2.assign(
    tokenized_sentence=df2['Text'].apply(sent_tokenize),
    way11_text=df2['Text'].apply(preprocess_text),
)
dff2.to_pickle('way11_test.pkl')

---------------------------------------------------------------------------------

In [32]:
#tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): X_train / X_test are not defined in the visible part of this
# notebook - presumably one text column from the pickles above; confirm where
# they come from so the notebook survives Restart & Run All.
# The vocabulary is capped at 10,000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
# Fit on the training texts only, then convert the result to a dense array.
# NOTE(review): with ~48k training rows x 10k features the dense array is
# large; confirm that memory use is acceptable.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
# transform (not fit_transform): the test texts reuse the vectorizer fitted on
# the training texts.
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()


In [33]:
#tf-idf + Glove
# Path of the GloVe embeddings text file read by load_glove_dict.
# NOTE(review): hard-coded absolute path that looks like a Colab Google Drive
# mount; the file name suggests 300-dimensional vectors - confirm and consider
# moving the path into a configurable constant.
glove_path = "/content/drive/MyDrive/dsml/cs4248/as2/glove.6B.300d.txt"

def load_glove_dict(path):
    """Load word embeddings from a whitespace-separated UTF-8 text file.

    Every non-blank line is expected to hold a token followed by the
    components of its vector. Blank lines (for instance a trailing empty line
    at the end of the file) are skipped instead of raising IndexError.

    Args:
        path: Location of the embeddings file.

    Returns:
        dict mapping each token to a 1-D float32 numpy array. If a token
        occurs on several lines, the last one wins.
    """
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line to parse.
                continue
            embeddings_dict[values[0]] = np.asarray(values[1:], dtype="float32")
    return embeddings_dict

# Read the whole embeddings file into memory once; the resulting dict is what
# combine_glove_tfidf looks tokens up in.
glove_dict = load_glove_dict(glove_path)

def combine_glove_tfidf(texts, tfidf, tfidf_vectorizer, glove_dict):
    """Build one TF-IDF-weighted average embedding per text.

    Each text is split on whitespace. A token contributes its embedding scaled
    by the text's TF-IDF value for that token when it is present both in
    ``glove_dict`` and in the vectorizer vocabulary; every other token
    contributes a zero vector. The document vector is the mean over ALL
    tokens of the text, so unmatched tokens pull the average towards zero.

    Args:
        texts: Sequence of strings, in the same row order as ``tfidf``.
        tfidf: 2-D matrix indexable as ``tfidf[row, column]``.
        tfidf_vectorizer: Fitted vectorizer exposing ``vocabulary_``
            (token -> column index of ``tfidf``).
        glove_dict: Mapping token -> 1-D embedding vector.

    Returns:
        Array of shape ``(len(texts), dim)`` where ``dim`` is the embedding
        length found in ``glove_dict`` (300 when the dict is empty). Rows for
        texts without any matched token stay all-zero.
    """
    vocabulary = tfidf_vectorizer.vocabulary_
    # Take the dimensionality from the embeddings themselves so that files
    # other than the 300-d one also work.
    dim = len(next(iter(glove_dict.values()))) if glove_dict else 300
    vectors = np.zeros((len(texts), dim))
    for i, text in enumerate(texts):
        tokens = text.split()
        token_vectors = np.zeros((len(tokens), dim))
        for j, token in enumerate(tokens):
            tfidf_index = vocabulary.get(token)
            glove_vector = glove_dict.get(token)
            if tfidf_index is None or glove_vector is None:
                continue
            token_vectors[j] = glove_vector * tfidf[i, tfidf_index]
        # Also guards the empty-text case, where a mean over zero rows would be NaN.
        if token_vectors.any():
            vectors[i] = np.mean(token_vectors, axis=0)
    return vectors

# Build the TF-IDF-weighted embedding features for both splits, each paired
# with its own TF-IDF matrix and the shared vectorizer and embedding dict.
X_train_combined = combine_glove_tfidf(X_train, X_train_tfidf, tfidf_vectorizer, glove_dict)
X_test_combined = combine_glove_tfidf(X_test, X_test_tfidf, tfidf_vectorizer, glove_dict)

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# NOTE(review): y_train / y_test are not defined in the visible part of this
# notebook - presumably the 'Label' columns of the train/test frames; confirm.
label_encoder = LabelEncoder()
# The encoder is fitted on the training labels only.
y_train_encoded = label_encoder.fit_transform(y_train)
y_train_categorical = to_categorical(y_train_encoded)
# Test labels go through the already-fitted encoder (transform, not fit_transform).
y_test_encoded = label_encoder.transform(y_test)
y_test_categorical = to_categorical(y_test_encoded)