In [98]:
# Dataset: https://www.kaggle.com/rtatman/blog-authorship-corpus

In [94]:
# data manipulation
import pandas as pd
# for balancing the classes in the dataset
from imblearn.datasets import make_imbalance
from collections import Counter

# library for NLP tasks, 
from textblob import TextBlob
import nltk
nltk.download('wordnet')
from textblob import Word 
# bag of words and tf-idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer




[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Adarsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
df = pd.read_csv(r'dataset/blogtext/blogtext.csv')

In [3]:
df.shape

(681284, 7)

In [4]:
df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
0,2059027,male,15,Student,Leo,"14,May,2004","Info has been found (+/- 100 pages,..."
1,2059027,male,15,Student,Leo,"13,May,2004",These are the team members: Drewe...
2,2059027,male,15,Student,Leo,"12,May,2004",In het kader van kernfusie op aarde...
3,2059027,male,15,Student,Leo,"12,May,2004",testing!!! testing!!!
4,3581210,male,33,InvestmentBanking,Aquarius,"11,June,2004",Thanks to Yahoo!'s Toolbar I can ...


In [5]:
df.topic.value_counts()

indUnk                     251015
Student                    153903
Technology                  42055
Arts                        32449
Education                   29633
Communications-Media        20140
Internet                    16006
Non-Profit                  14700
Engineering                 11653
Law                          9040
Publishing                   7753
Science                      7269
Government                   6907
Consulting                   5862
Religion                     5235
Fashion                      4851
Marketing                    4769
Advertising                  4676
BusinessServices             4500
Banking                      4049
Chemicals                    3928
Telecommunications           3891
Accounting                   3832
Military                     3128
Museums-Libraries            3096
Sports-Recreation            3038
HumanResources               3010
RealEstate                   2870
Transportation               2326
Manufacturing 

In [11]:
df.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [17]:
df.topic = df.topic.astype(str)

In [34]:
# datasize is huge. Let's keep only few topics which are our target classe 
# after that perform downsampling to balance the classes
df_student = df[ (df.topic == 'Student') ] 
df_tech =   df[ (df.topic == 'Technology') ]
df_arts     = df[(df.topic == 'Arts')]       
df_non_profit = df[(df.topic == 'Non-Profit')]
df_law = df[df.topic == 'Law']
df_gov = df[(df.topic == 'Government')]
df_rel = df[(df.topic == 'Religion')]
df_fashion = df[(df.topic == 'Fashion')]

In [36]:
    # use resample method from scikit-learn
    from sklearn.utils import resample

df_student = resample(df_student, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_tech = resample(df_tech, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_arts = resample(df_arts, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_non_profit = resample(df_non_profit, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_law = resample(df_law, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_gov = resample(df_gov, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results

df_rel = resample(df_rel, 
                      replace=True,    # sample with replacement
                      n_samples=5000,     # to match number of values in each class
                      random_state=123) # reproducible results




In [37]:
# Combine all the class with equal number of values
df = pd.concat([df_student, df_tech, df_arts, df_non_profit, df_law, df_gov, df_rel, df_fashion ])

# check for the class balance
df['topic'].value_counts()

Law           5000
Non-Profit    5000
Government    5000
Technology    5000
Religion      5000
Arts          5000
Student       5000
Fashion       4851
Name: topic, dtype: int64

In [38]:
df.columns

Index(['id', 'gender', 'age', 'topic', 'sign', 'date', 'text'], dtype='object')

In [39]:
df.head(5)

Unnamed: 0,id,gender,age,topic,sign,date,text
72398,479019,male,24,Student,Gemini,"02,July,2004",Ive been watching a lot of movies...
132682,3622174,male,16,Student,Leo,"25,June,2004","Zuma, by Pop Cap games, is one o..."
82810,1478632,male,17,Student,Leo,"30,May,2003","[Step Up Day? WTF?] Yeah, the eighth ..."
543401,2973911,male,17,Student,Sagittarius,"01,September,2003","Ok, so i haven't posted for some ti..."
650855,891544,male,27,Student,Libra,"28,July,2004",hi gang... the hunt may have to be pos...


In [40]:
df.to_csv(r'dataset/blogtext/balanced_blogtext.csv', index= False)

In [41]:
text = df[['text', 'topic']]

In [None]:
text.head(5)

In [54]:
print(text.topic.value_counts())
print("\n")
print(text.shape)

Law           5000
Non-Profit    5000
Government    5000
Technology    5000
Religion      5000
Arts          5000
Student       5000
Fashion       4851
Name: topic, dtype: int64


(39851, 2)


In [44]:
pd.set_option('display.max_colwidth' , 100)

In [45]:
# 1. Lower casing  - change all the words to lower case to avoid duplication. Because "Python" and "python" considered 2 words
text['text'] = text['text'].apply( lambda t : ' '.join( word.lower() for word in t.split()  ) )
text['text'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


72398     ive been watching a lot of movies lately. in the month of may ive watched a better tomorrow , th...
132682    zuma, by pop cap games, is one of the most addicting web games i have ever played. here, you can...
82810     [step up day? wtf?] yeah, the eighth graders come up to my school on monday. woohoo. that means ...
543401    ok, so i haven't posted for some time... sorry `bout that. anyways, ayan nagparamdam lang po ako...
650855    hi gang... the hunt may have to be postponed. there is a possibility that i'll be going out of t...
Name: text, dtype: object

In [47]:
# 2. Remove punctuations
# the [^\w\s] means remove everything, keep only words(w) and spaces(s)
# this step should be done after feature extraction like hashtags, user tagged
text['text'] = text['text'].str.replace( '[^\w\s]' , '' )
text['text'].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


72398     ive been watching a lot of movies lately in the month of may ive watched a better tomorrow  the ...
132682    zuma by pop cap games is one of the most addicting web games i have ever played here you can see...
82810     step up day wtf yeah the eighth graders come up to my school on monday woohoo that means we get ...
543401              ok so i havent posted for some time sorry bout that anyways ayan nagparamdam lang po ako 
650855    hi gang the hunt may have to be postponed there is a possibility that ill be going out of town t...
584378    wow havent been here in a few weeks thanks for the comments guys  i apologize for lack of update...
229176    shots and shots welll today was an interesting day it was no ordinary day it was activity day ya...
679363    when she loved me by sarah mclachlan when somebody loved me everything was beautiful every hour ...
317983    urllink so i think i found the last can of tab on earth i found the old boynbspin a back alley c...
541092    

In [49]:
# 3 . Remove stop words - the, a , and etc. These are most commonly occuring words, and may created irrevelent 
# baises to our model
from nltk.corpus import stopwords
stop = stopwords.words('english')

text['text'] = text['text'].apply( lambda t : " ".join( word for word in t.split() 
                                                       if word not in stop ) )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [50]:
text['text'].head(10)

72398     ive watching lot movies lately month may ive watched better tomorrow one iron monkey ichi killer...
132682    zuma pop cap games one addicting web games ever played see happens clear level pop cap games res...
82810     step day wtf yeah eighth graders come school monday woohoo means get bunch snotnosed losers idol...
543401                                  ok havent posted time sorry bout anyways ayan nagparamdam lang po ako
650855    hi gang hunt may postponed possibility ill going town weekend leaving thursday afternoon anyothe...
584378    wow havent weeks thanks comments guys apologize lack updates seem fallen someone hes asian hes r...
229176    shots shots welll today interesting day ordinary day activity day yay actually went pretty well ...
679363    loved sarah mclachlan somebody loved everything beautiful every hour spent together lives within...
317983    urllink think found last tab earth found old boynbspin back alley convenience store new yorknbsp...
541092    

In [57]:
# 4. Frequent word removal from the text; text which are not stopwords
# first we will take whole tweets data and split into words and then calculate their frequency
#  join words only with strings so, there needs to be some string to join other string
all_words = ' '.join( text['text'] ).split()
freqeuncy = pd.Series(all_words).value_counts()[:20]

In [58]:
freqeuncy

like       29810
im         28567
one        25231
urllink    23038
get        20922
dont       20262
time       19569
know       19153
would      18050
really     17057
people     16701
think      16306
go         15725
well       15431
good       15114
see        13314
back       13231
got        13217
going      13036
day        13003
dtype: int64

In [61]:
# remove frequenct words
text['text'] = text['text'].apply( lambda t : ' '.join( word for word in t.split() 
                                                      if word not in freqeuncy) )
text['text'].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


72398     ive watching lot movies lately month may ive watched better tomorrow iron monkey ichi killer shr...
132682    zuma pop cap games addicting web games ever played happens clear level pop cap games responsible...
82810     step wtf yeah eighth graders come school monday woohoo means bunch snotnosed losers idolize fres...
543401                                       ok havent posted sorry bout anyways ayan nagparamdam lang po ako
650855    hi gang hunt may postponed possibility ill town weekend leaving thursday afternoon anyother nigh...
584378    wow havent weeks thanks comments guys apologize lack updates seem fallen someone hes asian hes h...
229176    shots shots welll today interesting ordinary activity yay actually went pretty first went laser ...
679363    loved sarah mclachlan somebody loved everything beautiful every hour spent together lives within...
317983    found last tab earth found old boynbspin alley convenience store new yorknbsp dated 1993probably...
541092    

In [62]:
# you can also remove "None/none" or some other values which are not relevant.

In [72]:
# 5. Remove Rare words, which will not contribute to our model
all_words = ' '.join(text['text'] ).split()
rarely = pd.Series(all_words).value_counts()[-160000:]
rarely.sort_values

<bound method Series.sort_values of parameters                                                                         23
motorway                                                                           23
indianapolis                                                                       23
allstar                                                                            23
doses                                                                              23
brb                                                                                23
org                                                                                23
rejects                                                                            23
camcorder                                                                          23
grips                                                                              23
ailments                                                                           23
perseverance      

In [73]:
# remove rare words
# remove frequenct words
text['text'] = text['text'].apply( lambda t : ' '.join( word for word in t.split() 
                                                      if word not in rarely) )
text['text'].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


72398     ive watching lot movies lately month may ive watched better tomorrow iron monkey killer shrek 2 ...
132682    pop cap games web games ever played happens clear level pop cap games responsible many big name ...
82810     step wtf yeah eighth graders come school monday woohoo means bunch losers freshmen god thats stu...
543401                                                        ok havent posted sorry bout anyways lang po ako
650855    hi gang hunt may postponed possibility ill town weekend leaving thursday afternoon night thats e...
584378    wow havent weeks thanks comments guys apologize lack updates seem fallen someone hes asian hes h...
229176    shots shots today interesting ordinary activity yay actually went pretty first went laser quest ...
679363    loved sarah somebody loved everything beautiful every hour spent together lives within heart sad...
317983    found last tab earth found old alley convenience store new dated year report came outnbsp danger...
541092    

In [74]:
# 6. Remove whitespaces
text['text'] = text['text'].str.strip()
text['text'].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


72398     ive watching lot movies lately month may ive watched better tomorrow iron monkey killer shrek 2 ...
132682    pop cap games web games ever played happens clear level pop cap games responsible many big name ...
82810     step wtf yeah eighth graders come school monday woohoo means bunch losers freshmen god thats stu...
543401                                                        ok havent posted sorry bout anyways lang po ako
650855    hi gang hunt may postponed possibility ill town weekend leaving thursday afternoon night thats e...
584378    wow havent weeks thanks comments guys apologize lack updates seem fallen someone hes asian hes h...
229176    shots shots today interesting ordinary activity yay actually went pretty first went laser quest ...
679363    loved sarah somebody loved everything beautiful every hour spent together lives within heart sad...
317983    found last tab earth found old alley convenience store new dated year report came outnbsp danger...
541092    

In [87]:
# Remove Numberic
import re

def remove_num(t):
    removed_num_text = re.sub(r'\d+', '', t)
    return removed_num_text

text['text'] = text['text'].apply( lambda t : remove_num(t) )
text['text'].head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


72398     ive watching lot movie lately month may ive watched better tomorrow iron monkey killer shrek  va...
132682    pop cap game web game ever played happens clear level pop cap game responsible many big name web...
82810     step wtf yeah eighth grader come school monday woohoo mean bunch loser freshman god thats stupid...
543401                                                        ok havent posted sorry bout anyways lang po ako
650855    hi gang hunt may postponed possibility ill town weekend leaving thursday afternoon night thats e...
584378    wow havent week thanks comment guy apologize lack update seem fallen someone he asian he hot he ...
229176    shot shot today interesting ordinary activity yay actually went pretty first went laser quest fi...
679363    loved sarah somebody loved everything beautiful every hour spent together life within heart sad ...
317983    found last tab earth found old alley convenience store new dated year report came outnbsp danger...
541092    

In [None]:
# 7. Spelling correction  -- 
# !!!!!!!!!!!!!!!!!!!!!!!!  Warning: This step is gonna take too much time maybe a few hours
text['text'].apply( lambda t :  str( TextBlob(t).correct() ) )


In [77]:
# 8 . Lemmatization ; its preferred over stemming because if finds the root word
# perform lemmatization on the title
text['text'] = text['text'].apply( lambda t : " ".join( [Word(word).lemmatize() for word in t.split()  ]) )
text['text'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


72398     ive watching lot movie lately month may ive watched better tomorrow iron monkey killer shrek 2 v...
132682    pop cap game web game ever played happens clear level pop cap game responsible many big name web...
82810     step wtf yeah eighth grader come school monday woohoo mean bunch loser freshman god thats stupid...
543401                                                        ok havent posted sorry bout anyways lang po ako
650855    hi gang hunt may postponed possibility ill town weekend leaving thursday afternoon night thats e...
Name: text, dtype: object

In [88]:
# save the data at this point so that we don't have to do time consuming steps again. Steps like
# spelling correction, lemmatization etc
text.to_csv(r'dataset/blogtext/cleaned_blogtext.csv', index= False)

In [89]:
# 9. Bag of words
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(text.text.values.astype('str'))

data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = df.index


In [91]:
data_dtm.head(50)

Unnamed: 0,aa,aan,aaron,ab,aback,abandon,abandoned,abandonment,abbott,abby,...,zeppelin,zero,zhi,zion,zip,zo,zodiac,zombie,zone,zoo
72398,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82810,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
543401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
650855,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
584378,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
679363,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
317983,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
541092,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
