## Importing Libraries

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer
#from nltk.stem import WordNetLemmatizer
import re
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Loading the data previously extracted

In [2]:
# Load the previously extracted posts; later cells read the 'title', 'body'
# and 'comments' columns of this frame.
data = pd.read_csv('r_india_with_top_comment.csv')

## Defining the functions for text pre-processing

### The "." punctuation mark is also replaced by a space so that every URL is broken down into separate tokens that may carry useful information
### Contracted verb forms (e.g. "won't", "they're") are expanded so that the resulting stopwords can be removed properly

In [5]:
# Token-separating characters (slashes, brackets, pipes, "@", commas, dots,
# semicolons, underscores); cleaner() replaces each of them with a space.
# Raw strings are used so that the regex escapes (\[ \] \|) are not parsed as
# invalid Python string escapes, which newer Python versions warn about.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,.;_]')
# Any character that is not a digit, a lowercase ASCII letter, a space, "#",
# "+" or "_"; cleaner() deletes these characters outright.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop-word list from NLTK, held as a set so the per-word membership
# test in cleaner() is a hash lookup.
STOPWORDS = set(stopwords.words('english'))
# Stemming / lemmatization helpers are left disabled.
#STEMMER = PorterStemmer()
#lemmatizer = WordNetLemmatizer()

# Ordered (compiled pattern, replacement) rules applied by decontracted().
# Order matters: the specific rules must run before the general suffix rules
# (e.g. "won't" has to be handled before the generic "n't" rule sees it).
# All rules are case-insensitive because callers may pass text that has not
# been lowercased yet.
_DECONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        # specific
        (r"won't", "will not"),
        (r"can't", "can not"),
        # Apostrophe-less contractions are matched on word boundaries so they
        # work for a single word as well as inside a longer phrase, and never
        # fire inside other words ("five", "riches").
        (r"\bive\b", "i have"),
        (r"\bhes\b", "he is"),
        (r"\bshes\b", "she is"),
        # URL boilerplate that carries no information
        (r"http", ""),
        (r"www", ""),
        (r"\.com", ""),
        # general
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
)


def decontracted(phrase):
    """Expand English contractions and strip URL boilerplate from ``phrase``.

    Applies every rule in ``_DECONTRACTION_RULES`` in order: first the
    specific cases ("won't", "can't", "ive", "hes", "shes", and removal of
    "http", "www", ".com"), then the general apostrophe suffixes ("n't",
    "'re", "'s", "'d", "'ll", "'t", "'ve", "'m").

    Parameters
    ----------
    phrase : str
        A single word or a longer piece of text.

    Returns
    -------
    str
        ``phrase`` with the contractions expanded. Replacement text is always
        lowercase, regardless of the case of the matched contraction.
    """
    for pattern, replacement in _DECONTRACTION_RULES:
        phrase = pattern.sub(replacement, phrase)
    return phrase

def conv_str(text):
    """Coerce a cell value to ``str``, mapping missing values to ``''``.

    A plain ``str()`` call would turn a missing value (``None`` or a float
    NaN) into the literal text "None" / "nan", which would then survive
    cleaning as a bogus word.

    Parameters
    ----------
    text : object
        Any scalar cell value.

    Returns
    -------
    str
        ``''`` for ``None`` or NaN, otherwise ``str(text)``.
    """
    # NaN is the only float value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return str(text)

def cleaner(text):
    """Normalise one piece of raw text into space-separated lowercase tokens.

    Steps, in order:
      1. Strip HTML markup with BeautifulSoup (lxml parser).
      2. Lowercase the text.
      3. Expand contractions / drop URL boilerplate word by word via
         ``decontracted``.
      4. Replace the separator characters in ``REPLACE_BY_SPACE_RE`` with
         spaces.
      5. Delete every character matched by ``BAD_SYMBOLS_RE``.
      6. Drop words found in ``STOPWORDS``.

    Parameters
    ----------
    text : str
        Raw text (may contain HTML).

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    text = BeautifulSoup(text, "lxml").text
    # Lowercase BEFORE expanding contractions: the rules in decontracted() are
    # written in lowercase, so "Won't" / "Can't" would otherwise skip the
    # specific rules and leave junk tokens such as "wo" / "ca" behind.
    text = text.lower()
    text = ' '.join(decontracted(word) for word in text.split())
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    # Lemmatization / stemming are currently disabled.
    #text = ' '.join(lemmatizer.lemmatize(word) for word in text.split())
    #text = ' '.join(STEMMER.stem(word) for word in text.split())
    return text

## Function call for text pre-processing

In [6]:
# Coerce every text column to str, then push it through the cleaning pipeline.
for column in ('title', 'body', 'comments'):
    data[column] = data[column].apply(conv_str).apply(cleaner)

  ' that document to Beautiful Soup.' % decoded_markup


In [7]:
# Preview the first rows to inspect the cleaned text columns.
data.head()

Unnamed: 0,flair,title,url,comms_num,body,author,comments
0,Non-Political,indian states uts renamed countries similar po...,https://i.redd.it/hk9o11b8dun41.png,20,,schadenfeuder,nice name uttar pradesh ajay bisht gonna happy...
1,Non-Political,hotstar finally uploads sundays last week toni...,https://www.hotstar.com/in/tv/last-week-tonigh...,12,,TimeVendor,first turning internet entire state refusing b...
2,Non-Political,best nonpolitical stand comedian,https://www.reddit.com/r/india/comments/g3zbrt...,9,thing related politics gives anxiety recommend...,daredevil005,abhishek upmanyu kenny sebastian biswa anubhav...
3,Non-Political,icse isc exams postponed due covid19 nonpolitical,https://cisce.org//UploadedFiles/PDF/COVID%201...,0,,DSMalhotra,
4,Non-Political,nonpolitical friend wrote first book week givi...,https://www.reddit.com/r/india/comments/ezqw6i...,0,friend mine wrote first book music men volume ...,bitswreck,


## Storing the pre-processed data for further use

In [8]:
# Persist the cleaned frame; index=False keeps the DataFrame index out of the file.
data.to_csv('pre_processed_data_with_top_comment_latest_one.csv', index = False)