# Imports

In [21]:
import pandas as pd
import numpy as np
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
     

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Csv to Dataframe

In [22]:
data = pd.read_csv('reddit_data.csv')
data.head()

Unnamed: 0,parent_id,text,topic,length,size_range
0,t1_crojgfu,Thanks! Not sure if those links were up there ...,pcmasterrace,103,101 to 200
1,t1_cquq97y,I think its unlikely someone would kill them s...,news,163,101 to 200
2,t1_cr92xnl,Hoult is another one that's important.\n\nBut ...,movies,99,0 to 100
3,t1_crk6sin,Can have my opinion.\n\nThey're noisy and hot....,pcmasterrace,1556,501 +
4,t1_cr9398p,Nice! That reminds me of a more recent article...,news,252,201 to 500


#Checking the number or rows and column of the dataframe

In [23]:
print(data.shape)

(40001, 5)


# Fetching Column containing documents for classification

In [24]:
data['text'].head()

0    Thanks! Not sure if those links were up there ...
1    I think its unlikely someone would kill them s...
2    Hoult is another one that's important.\n\nBut ...
3    Can have my opinion.\n\nThey're noisy and hot....
4    Nice! That reminds me of a more recent article...
Name: text, dtype: object

# Data Preprocessing ( lowercase, special character removal, white space removal, digits removal)

In [37]:
def preprocess(row):
    text=re.sub(r'[^\w\s]', '', str(row).lower().strip())
    text =re.compile('<.*?>').sub('', str(row)) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', str(row))
    text = re.sub('\s+', ' ', str(row)) 
    text = re.sub(r'\[[0-9]*\]',' ',str(row))
    
    return text
   

#Applying preprocessing function to the fetched column

In [38]:
data['text'] = data['text'].apply(preprocess)
data['text'].head()

0    thanks not sure if those links were up there b...
1    i think its unlikely someone would kill them s...
2    hoult is another one thats important\n\nbut mo...
3    can have my opinion\n\ntheyre noisy and hot th...
4    nice that reminds me of a more recent article ...
Name: text, dtype: object

#Removing stop words

In [39]:
import nltk
nltk.download('stopwords')

# STOPWORD REMOVAL
def stopword(row):
    a= [i for i in str(row).split() if i not in stopwords.words('english')]
    return ' '.join(a)

data['text'] = data['text'].apply(stopword)
data['text'].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0      thanks sure links im confused whether feel dumb
1    think unlikely someone would kill selves blunt...
2    hoult another one thats important fassbender m...
3    opinion theyre noisy hot thats issues shit dri...
4    nice reminds recent article doctoral student v...
Name: text, dtype: object

#Lemmatizing 

In [42]:
from nltk.stem import WordNetLemmatizer
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def lemmatizing(row):
  splits = str(row).split()
  text = " ".join(lemmatizer.lemmatize(w) for w in splits)
  
  return text
  

data['text'] = data['text'].apply(lemmatizing)
data['text'].head()

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


0       thanks sure link im confused whether feel dumb
1    think unlikely someone would kill self blunt f...
2    hoult another one thats important fassbender m...
3    opinion theyre noisy hot thats issue shit driv...
4    nice reminds recent article doctoral student v...
Name: text, dtype: object

#Dropping duplicate rows 

In [45]:
data = data.drop_duplicates()
print(data.shape)
data.head()

(40001, 5)


Unnamed: 0,parent_id,text,topic,length,size_range
0,t1_crojgfu,thanks sure link im confused whether feel dumb,pcmasterrace,103,101 to 200
1,t1_cquq97y,think unlikely someone would kill self blunt f...,news,163,101 to 200
2,t1_cr92xnl,hoult another one thats important fassbender m...,movies,99,0 to 100
3,t1_crk6sin,opinion theyre noisy hot thats issue shit driv...,pcmasterrace,1556,501 +
4,t1_cr9398p,nice reminds recent article doctoral student v...,news,252,201 to 500


#Fetching the text data column

In [46]:
comments = data['text']
print(comments.shape)
comments.head()

(40001,)


0       thanks sure link im confused whether feel dumb
1    think unlikely someone would kill self blunt f...
2    hoult another one thats important fassbender m...
3    opinion theyre noisy hot thats issue shit driv...
4    nice reminds recent article doctoral student v...
Name: text, dtype: object

#Keeping only first 10 words in each row 

In [47]:
def getfirstten(row):
  text = str(row)
  text = text.split()[:10]

  return text

comments = comments.apply(getfirstten)
comments.head()


0    [thanks, sure, link, im, confused, whether, fe...
1    [think, unlikely, someone, would, kill, self, ...
2    [hoult, another, one, thats, important, fassbe...
3    [opinion, theyre, noisy, hot, thats, issue, sh...
4    [nice, reminds, recent, article, doctoral, stu...
Name: text, dtype: object

#Splitting into train and test data

In [61]:
traindata,testdata = train_test_split(comments,train_size=0.7)
traindata.head()



38148    [everything, youve, mentioned, making, respons...
3681     [even, vega, doesnt, know, hell, would, happen...
11313    [petty, officer, enlisted, worth, waaaaay, off...
1123     [would, downplay, said, something, something, ...
37731    [gt, better, live, society, equally, poor, one...
Name: text, dtype: object

#Counting total words in train data

In [67]:
wordcount = traindata.size
print(wordcount)

28000


#Creating vocabulary dictionary

In [66]:
dictionary = dict()
for row in traindata:
 for i in range(len(row)):
  k = row[i]  
  if k not in dictionary:
    dictionary[k] = len(dictionary)
    
print(dictionary)

print(len(dictionary))

23961


#One hot vector 

In [68]:
index = []
for x,y in dictionary.items():
    index.append(y)

index = np.asarray(index)
x= len(dictionary)
onehotvec = np.zeros((x,wordcount))

onehotvec[np.arange(x), index] = 1

print(onehotvec)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
