In [None]:
import json
import pandas as pd
import os
import nltk
import string
import re
from nltk.corpus import wordnet
import tarfile
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Creating dataframes

In [None]:
# Resolve the review CSV relative to the parent of the current working directory.
curPath = os.getcwd()
parentDir = os.path.abspath(os.path.join(curPath, os.pardir))
dockerDatasetPath = os.path.join(parentDir, "datasets/yelp/yelp_review.csv")

# Report what was resolved so a missing file is noticed before loading it.
print("Parent Directory", parentDir)
print("Path:", dockerDatasetPath)
print("Exists:", os.path.exists(dockerDatasetPath))

In [None]:
# Absolute path to a copy of the review CSV on one developer's machine.
# NOTE(review): no other cell visible here reads this variable, and the path
# will not resolve on other machines — confirm whether it is still needed.
DivyaPath = "/Users/divyashekar/Documents/Semester 5/Mini Project/GitHub/Deep-Co-Training/Deep-Co-Training/data/yelp_review.csv"

In [None]:
# Load the review CSV found at dockerDatasetPath into a DataFrame.
df_review = pd.read_csv(dockerDatasetPath)

In [None]:
# Inspect the frame as loaded from the CSV.
df_review

# Text Preprocessing

## Removing numbers

In [None]:
# Delete every run of digits from the review text.
# The pattern is a raw string so "\d" reaches the regex engine unchanged instead
# of being parsed as an invalid Python string escape (a SyntaxWarning on
# recent Python versions).
df_review["text"] = df_review["text"].replace(r"\d+", "", regex=True)

In [None]:
# Inspect the "text" column after digit removal.
df_review

## Removing punctuation

In [None]:
# Delete every ASCII punctuation character from the review text.
# re.escape keeps characters such as "\", "]" and "-" literal inside the
# character class; without it the backslash was consumed as an escape for "]"
# and therefore never removed.
punctuation_pattern = "[{}]".format(re.escape(string.punctuation))
# regex=True is required: pandas 2.x treats the pattern as a literal string by
# default, in which case nothing would be replaced.
df_review["text"] = df_review["text"].str.replace(punctuation_pattern, "", regex=True)

In [None]:
# Inspect the "text" column after punctuation removal.
df_review

## Removing leading and trailing whitespace

In [None]:
# Trim whitespace from both ends of each review; interior spacing is untouched.
df_review["text"] = df_review["text"].str.strip()

In [None]:
# Inspect the "text" column after trimming.
df_review

## Converting to lowercase

In [None]:
# Lower-case all review text.
df_review["text"] = df_review["text"].str.lower()

In [None]:
# Inspect the "text" column after lower-casing.
df_review

In [None]:
# Limit the frame to its first 500,000 rows (all columns).
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning on a slice.
df_review = df_review.iloc[:500000, :].copy()

In [None]:
# Inspect the frame after limiting it to the first 500,000 rows.
df_review

## Tokenising and removing stop words

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# Shared tokenizer instance used by token(): letter case is preserved, @handles
# are stripped, and long character repeats are shortened (reduce_len).
# NOTE(review): the text is lower-cased and stripped of punctuation in earlier
# cells, so preserve_case / strip_handles likely have no effect here — confirm.
tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)

In [None]:
def token(tweet):
    """Split one review string into a list of tokens with the shared tokenizer."""
    tokens = tokenizer.tokenize(tweet)
    return tokens

In [None]:
# Replace each review string with the list of tokens returned by token().
df_review["text"] = df_review["text"].apply(token)

In [None]:
# Inspect the "text" column after tokenisation (each cell is now a list).
df_review

In [None]:
# NLTK's English stop-word list, held as a set for fast membership tests in remove().
# NOTE(review): punctuation was stripped in an earlier cell, so contractions reach
# this step without apostrophes (e.g. "dont") and may not match the list's
# apostrophe forms — confirm this is acceptable.
stop_words = set(stopwords.words("english"))

In [None]:
def remove(tweet):
    """Return the tokens of ``tweet`` that are not in ``stop_words``, order preserved."""
    kept = []
    for word in tweet:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

In [None]:
# Drop stop words from every token list via remove().
df_review["text"] = df_review["text"].apply(remove)

In [None]:
# Inspect the token lists after stop-word removal.
df_review

## Lemmatising

In [None]:
from nltk.stem import WordNetLemmatizer

# Single WordNet lemmatizer instance reused for every token in lemm().
lemmatizer = WordNetLemmatizer()

In [None]:
def get_wordnet_POS(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter of
    the resulting tag selects the WordNet category (J = adjective, N = noun,
    V = verb, R = adverb).

    Args:
        word: A single token (string).

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``;
        ``wordnet.NOUN`` when the tag's first letter maps to none of them.
    """
    # [:1] instead of [0] so an empty tag falls through to the NOUN default
    # rather than raising IndexError.
    tag = nltk.pos_tag([word])[0][1][:1].upper()

    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # Look up by the tag letter, not by the word: keying on the word almost
    # never matches, which made every token fall back to NOUN.
    return tag_dict.get(tag, wordnet.NOUN)

In [None]:
def lemm(tweet):
    """Lemmatise each token in ``tweet`` using the POS from ``get_wordnet_POS``."""
    lemmas = []
    for word in tweet:
        pos = get_wordnet_POS(word)
        lemmas.append(lemmatizer.lemmatize(word, pos))
    return lemmas

In [None]:
# Lemmatise every token list via lemm().
df_review["text"] = df_review["text"].apply(lemm)

In [None]:
# Inspect the token lists after lemmatisation.
df_review

## Saving processed data

In [None]:
# Write the processed frame to processed_data.csv (relative to the working directory).
# NOTE(review): "text" holds token lists at this point, so the CSV will contain
# their string representation, and the row index is written too because
# index=False is not passed — confirm downstream readers expect both.
df_review.to_csv("processed_data.csv")