In [None]:
import csv
import sys
import pandas as pd
import numpy as np
import re
import nltk
import string
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources this notebook relies on (tokenizer models,
# stopword lists, WordNet data). Each call is a no-op download check
# when the resource is already present.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)
del _resource  # do not leave the loop variable in the notebook namespace

# Shared lookups used by the preprocessing cells.
punctuations = set(string.punctuation)        # characters from string.punctuation
stop_words = set(stopwords.words('english'))  # NLTK English stopword list
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
# This import is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw train/test CSV files.
# NOTE: adjust these paths to wherever the data lives in your environment.
train_data_path = '/content/drive/MyDrive/preprocessed_data/raw_data/fulltrain.csv'
test_data_path = '/content/drive/MyDrive/preprocessed_data/raw_data/balancedtest.csv'

# header=None: the first row is read as data, so column names are given here.
train_data, test_data = (
    pd.read_csv(csv_path, header=None, names=['category', 'text'])
    for csv_path in (train_data_path, test_data_path)
)

In [None]:
# Number of rows in the training and test frames, one per line.
print(len(train_data))
print(len(test_data))

48854
3000


In [None]:
# Sentence splitting
def split_sentence(text):
    """Split one document into sentences.

    Parameters
    ----------
    text : str
        The document text to split.

    Returns
    -------
    The result of ``sent_tokenize`` called on ``text`` with
    ``language='english'``.
    """
    return sent_tokenize(text, language='english')

# Add a 'tokenized_sentence' column to both frames by running
# split_sentence over every entry of the 'text' column.
for _frame in (train_data, test_data):
    _frame['tokenized_sentence'] = _frame['text'].apply(split_sentence)
del _frame  # do not leave the loop variable in the notebook namespace

In [None]:
# Drop the raw 'text' column from both frames.
train_data = train_data.drop(columns='text')
test_data = test_data.drop(columns='text')

In [None]:
# Display the training frame after the 'text' column has been dropped.
train_data

Unnamed: 0,category,tokenized_sentence
0,1,"[A little less than a decade ago, hockey fans ..."
1,1,[The writers of the HBO series The Sopranos to...
2,1,[Despite claims from the TV news outlet to off...
3,1,[After receiving 'subpar' service and experien...
4,1,[After watching his beloved Seattle Mariners p...
...,...,...
48849,4,[The ruling Kuomintang (KMT) has claimed owner...
48850,4,[The Taipei city government has encouraged the...
48851,4,[President Ma Ying-jeou said Friday that a par...
48852,4,[The families of the four people who were kill...


# way 1 Normal case

In [None]:
# Preprocessing way 1: normal case (tokenization only)
def preprocess_text(sentences):
    """Word-tokenize every sentence, leaving the tokens untouched.

    Note: this notebook defines several variants under the same name; the
    definition executed most recently is the one in effect.

    Parameters
    ----------
    sentences : iterable of str
        The sentences of one document.

    Returns
    -------
    list
        One token list per input sentence, in the same order.
    """
    return list(map(word_tokenize, sentences))

# way 2 Lower case

In [None]:
# Preprocessing way 2: lower case
def preprocess_text(sentences):
    """Word-tokenize every sentence and lowercase each token.

    Note: this notebook defines several variants under the same name; the
    definition executed most recently is the one in effect.

    Parameters
    ----------
    sentences : iterable of str
        The sentences of one document.

    Returns
    -------
    list
        One list of lowercased tokens per input sentence, in the same order.
    """
    return [
        [token.lower() for token in word_tokenize(sentence)]
        for sentence in sentences
    ]

# way 3 Normal case, no punctuation

In [None]:
# Preprocessing way 3: normal case, punctuation removed
def preprocess_text(sentences):
    """Word-tokenize every sentence and drop punctuation-only tokens.

    A token is kept when at least one of its characters is not in
    ``punctuations``; tokens that merely contain punctuation (for example
    "n't") are therefore kept. Casing is left unchanged.

    Note: this notebook defines several variants under the same name; the
    definition executed most recently is the one in effect.

    Parameters
    ----------
    sentences : iterable of str
        The sentences of one document.

    Returns
    -------
    list
        One filtered token list per input sentence, in the same order.
    """
    filtered = []
    for sentence in sentences:
        kept = [
            token
            for token in word_tokenize(sentence)
            if any(ch not in punctuations for ch in token)
        ]
        filtered.append(kept)
    return filtered

# way 4 Lower case, no punctuation

In [None]:
# Preprocessing way 4: lower case, punctuation removed
def preprocess_text(sentences):
    """Word-tokenize every sentence, drop punctuation-only tokens, lowercase the rest.

    A token is kept when at least one of its characters is not in
    ``punctuations``; kept tokens are lowercased.

    Note: this notebook defines several variants under the same name; the
    definition executed most recently is the one in effect.

    Parameters
    ----------
    sentences : iterable of str
        The sentences of one document.

    Returns
    -------
    list
        One filtered, lowercased token list per input sentence, in order.
    """
    filtered = []
    for sentence in sentences:
        kept = [
            token.lower()
            for token in word_tokenize(sentence)
            if any(ch not in punctuations for ch in token)
        ]
        filtered.append(kept)
    return filtered

# way 5 Normal case, no stopwords


Caveat: NLTK's English stopword list is all lowercase, so when tokens are not lowercased, a capitalized stopword such as 'A' does not match the entry 'a' in an exact comparison.

In [None]:
# Preprocessing way 5: normal case, stopwords removed
def preprocess_text(sentences):
    """Word-tokenize every sentence and drop stopwords, keeping token casing.

    ``stop_words`` is expected to hold lowercase entries (as NLTK's English
    list does). An exact comparison would therefore let capitalized
    stopwords such as 'A' or 'The' through. To catch them, each token is
    also looked up with only its first character lowercased. Tokens that
    are kept are returned unchanged.

    Only the first character is folded on purpose: all-caps tokens such as
    'US' or 'IT' are not treated as the stopwords 'us' / 'it'.

    Note: this notebook defines several variants under the same name; the
    definition executed most recently is the one in effect.

    Parameters
    ----------
    sentences : iterable of str
        The sentences of one document.

    Returns
    -------
    list
        One token list per input sentence, in order, with stopwords removed.
    """
    def _is_stopword(token):
        # Exact match first, then a match ignoring initial capitalization.
        return token in stop_words or (token[:1].lower() + token[1:]) in stop_words

    return [
        [token for token in word_tokenize(sentence) if not _is_stopword(token)]
        for sentence in sentences
    ]

# apply and save

In [None]:
# Remove a 'way4_text' column left over from an earlier preprocessing run,
# if there is one. errors='ignore' keeps this cell from raising KeyError
# when the column does not exist (for example after Restart & Run All).
train_data = train_data.drop(columns='way4_text', errors='ignore')
test_data = test_data.drop(columns='way4_text', errors='ignore')

In [None]:
# Display the training frame to check its columns before applying a preprocessing variant.
train_data

Unnamed: 0,category,tokenized_sentence
0,1,"[A little less than a decade ago, hockey fans ..."
1,1,[The writers of the HBO series The Sopranos to...
2,1,[Despite claims from the TV news outlet to off...
3,1,[After receiving 'subpar' service and experien...
4,1,[After watching his beloved Seattle Mariners p...
...,...,...
48849,4,[The ruling Kuomintang (KMT) has claimed owner...
48850,4,[The Taipei city government has encouraged the...
48851,4,[President Ma Ying-jeou said Friday that a par...
48852,4,[The families of the four people who were kill...


In [None]:
# Display the test frame to check its columns before applying a preprocessing variant.
test_data

Unnamed: 0,category,tokenized_sentence
0,1,[When so many actors seem content to churn out...
1,1,[ In what football insiders are calling an une...
2,1,[In a freak accident following Game 3 of the N...
3,1,[North Koreas official news agency announced t...
4,1,[The former Alaska Governor Sarah Palin would ...
...,...,...
2995,4,[The Air Force mistakenly gave rival companies...
2996,4,[The United Nations climate chief on Friday ch...
2997,4,[River Plate midfielder Diego Buonanotte has u...
2998,4,[Lawmakers were on the brink Tuesday of exempt...


In [None]:
# Run the currently defined preprocess_text over each row's sentence list
# and store the result in a new 'way5_text' column on both frames.
for _frame in (train_data, test_data):
    _frame['way5_text'] = _frame['tokenized_sentence'].apply(preprocess_text)
del _frame  # do not leave the loop variable in the notebook namespace

In [None]:
# Write both frames to pickle files (paths are relative to the working directory).
for _frame, _target in (
    (train_data, 'way5_train.pkl'),
    (test_data, 'way5_test.pkl'),
):
    _frame.to_pickle(_target)
del _frame, _target  # do not leave loop variables in the notebook namespace

In [None]:
# Read the saved test pickle back and display it to check the round trip.
df_loaded = pd.read_pickle('way5_test.pkl')
df_loaded

Unnamed: 0,category,tokenized_sentence,way5_text
0,1,[When so many actors seem content to churn out...,"[[When, many, actors, seem, content, churn, pe..."
1,1,[ In what football insiders are calling an une...,"[[In, football, insiders, calling, unexpectedl..."
2,1,[In a freak accident following Game 3 of the N...,"[[In, freak, accident, following, Game, 3, N.B..."
3,1,[North Koreas official news agency announced t...,"[[North, Koreas, official, news, agency, annou..."
4,1,[The former Alaska Governor Sarah Palin would ...,"[[The, former, Alaska, Governor, Sarah, Palin,..."
...,...,...,...
2995,4,[The Air Force mistakenly gave rival companies...,"[[The, Air, Force, mistakenly, gave, rival, co..."
2996,4,[The United Nations climate chief on Friday ch...,"[[The, United, Nations, climate, chief, Friday..."
2997,4,[River Plate midfielder Diego Buonanotte has u...,"[[River, Plate, midfielder, Diego, Buonanotte,..."
2998,4,[Lawmakers were on the brink Tuesday of exempt...,"[[Lawmakers, brink, Tuesday, exempting, nation..."
