In [2]:
# Build a fastText model on comment data from
# Kaggle's Toxic Comment Classification Challenge.

In [60]:
! pip install gensim

You should consider upgrading via the '/opt/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [27]:
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import FastText
import io
import collections

import pandas as pd

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aayush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/aayush/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# reading the data into basic structure

In [7]:
# Read the corpus one comment line at a time.
#   data  -> every stripped line (blank lines are kept as '')
#   words -> flat list of all whitespace-delimited tokens across the file
words = []
data = []

# Explicit UTF-8 so decoding does not depend on the platform's default locale encoding.
with io.open('comments.txt', 'r', encoding='utf-8') as file:
    for entry in file:
        entry = entry.strip()
        data.append(entry)
        words.extend(entry.split())
        
       

In [12]:
len(words) # there are about 10 million words 

10737835

In [10]:
len(data)

561808

In [14]:
# Most common words in the corpus:
# build a mapping from each word to the number of times it occurs.

In [15]:
# Map each raw (case-sensitive, uncleaned) token to its number of occurrences.
unique_words = collections.Counter(words)
# Show the ten most frequent tokens with their counts.
unique_words.most_common(10)


[('the', 445892),
 ('to', 288753),
 ('of', 219279),
 ('and', 207335),
 ('a', 201765),
 ('I', 182618),
 ('is', 164602),
 ('you', 157025),
 ('that', 140495),
 ('in', 130244)]

In [18]:
# The data is dominated by stopwords.
# Preprocessing: keep only alphanumeric characters, case-fold, and remove stopwords.

# We don't use stemming or lemmatization because we want the model to handle misspelled words as well.

In [19]:
# Let us preprocess the data using the preprocessing pipeline.

In [39]:
def text_clean(corpus):
    '''
    Purpose : Reduce every row of a text corpus to lowercase letters and digits.

    Input : 'corpus' - iterable of strings. Each row is split on whitespace and, inside every
            resulting word, each character other than a-z, A-Z and 0-9 is replaced by a space
            before the word is lowercased.

    Output : List of cleaned strings, one per input row. The processed words are joined by a
             single space, so replaced characters can leave extra or trailing spaces behind.

    '''
    def _scrub(token):
        # Substitute first, then lowercase - same order as the per-word pipeline requires.
        return re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=token).lower()

    return [' '.join(_scrub(token) for token in row.split()) for row in corpus]

In [22]:
def stopwords_removal(corpus):
    '''
    Purpose : Tokenise each row on whitespace and drop English stopwords, while retaining
              wh-question words ('who', 'what', 'when', ...).

    Input : 'corpus' - iterable of strings

    Output : List of token lists, one per input row. Matching against the stopword set is
             exact (case-sensitive).

    '''
    wh_words = {'who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom'}
    # Set difference instead of set.remove(): a wh-word that is absent from the installed
    # stopword list is simply ignored rather than raising KeyError.
    stop = set(stopwords.words('english')) - wh_words
    return [[token for token in row.split() if token not in stop] for row in corpus]

In [30]:
def lemmatize(corpus):
    '''
    Purpose : Lemmatize every token of a tokenised corpus, treating each token as a verb.

    Input : 'corpus' - iterable of token sequences (one sequence per row)

    Output : List of lists holding the lemmatized tokens, in the same order as the input

    '''
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for tokens in corpus:
        lemmatized.append([lemmatizer.lemmatize(token, pos = 'v') for token in tokens])
    return lemmatized

In [31]:
def stem(corpus, stem_type = None):
    '''
    Purpose : Stem every token of a tokenised corpus.

    Input : 'corpus' - iterable of token sequences (one sequence per row)
            'stem_type' - 'snowball' selects the English Snowball stemmer; any other value
                          (including the default None) selects the Porter stemmer

    Output : List of lists holding the stemmed tokens, in the same order as the input

    '''
    use_snowball = stem_type == 'snowball'
    stemmer = SnowballStemmer(language = 'english') if use_snowball else PorterStemmer()
    return [[stemmer.stem(token) for token in tokens] for tokens in corpus]

In [34]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Function to perform all pre-processing tasks (cleaning, stemming, lemmatization, stopwords removal etc.)

    Input :
    'corpus' - Iterable of strings on which pre-processing tasks will be performed
    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Flags indicating whether a particular task should
                                                                  be performed or not (any truthy value enables it)
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which corresponds to Porter
                  Stemmer. 'snowball' corresponds to Snowball Stemmer

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns the processed corpus as a list of strings, one per input row, with the surviving
             tokens of each row joined by a single space (a row with no tokens becomes '')

    '''
    if cleaning:
        corpus = text_clean(corpus)

    # Both branches leave `corpus` as a list of token lists for the steps below.
    if remove_stopwords:
        corpus = stopwords_removal(corpus)
    else:
        corpus = [row.split() for row in corpus]

    if lemmatization:
        corpus = lemmatize(corpus)

    if stemming:
        corpus = stem(corpus, stem_type)

    # Collapse every token list back into one space-separated string.
    return [' '.join(tokens) for tokens in corpus]

In [35]:
# Preview only the first few raw lines; the full list holds several hundred thousand entries.
data[:10]

['"Explanation',
 'Why the edits made under my username Hardcore Metallica Fan were reverted? They weren\'t vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don\'t remove the template from the talk page since I\'m retired now.89.205.38.27"',
 "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
 "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",
 '"""',
 'More',
 'I can\'t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents""""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for f

In [40]:
# NOTE: this rebinds `data` from the raw lines to the preprocessed lines, so re-running
# this cell feeds already-cleaned text back through preprocess(); re-run the loading
# cell first if the raw text is needed again.
data = preprocess(data)

In [None]:
# data after cleaning

In [41]:
# Preview only the first few cleaned lines instead of dumping the entire list.
data[:10]

['explanation',
 'why edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired 89 205 38 27',
 'aww matches background colour seemingly stuck thanks talk 21 51 january 11 2016 utc',
 'hey man really trying edit war guy constantly removing relevant information talking edits instead talk page seems care formatting actual info',
 '',
 '',
 'make real suggestions improvement wondered section statistics later subsection types accidents think references may need tidying exact format ie date format etc later one else first preferences formatting style references want please let know',
 '',
 'appears backlog articles review guess may delay reviewer turns listed relevant form eg wikipedia good article nominations transport',
 'sir hero chance remember what page',
 '',
 '',
 'congratulations well use tools well talk',
 'cocksucker piss around work',
 'vandalism matt shirvington article reverted please banne

In [42]:
# data conversion into the format expected by fastText

In [45]:
# Tokenise every cleaned line on whitespace, skipping lines that ended up empty.
preprocessed_data = [line.split() for line in data if line != ""]


In [44]:
# for line in data:
#     print(line)

In [48]:
# List of token lists, one per non-empty line; preview the first few rather than the whole corpus.
preprocessed_data[:5]

[['explanation'],
 ['why',
  'edits',
  'made',
  'username',
  'hardcore',
  'metallica',
  'fan',
  'reverted',
  'vandalisms',
  'closure',
  'gas',
  'voted',
  'new',
  'york',
  'dolls',
  'fac',
  'please',
  'remove',
  'template',
  'talk',
  'page',
  'since',
  'retired',
  '89',
  '205',
  '38',
  '27'],
 ['aww',
  'matches',
  'background',
  'colour',
  'seemingly',
  'stuck',
  'thanks',
  'talk',
  '21',
  '51',
  'january',
  '11',
  '2016',
  'utc'],
 ['hey',
  'man',
  'really',
  'trying',
  'edit',
  'war',
  'guy',
  'constantly',
  'removing',
  'relevant',
  'information',
  'talking',
  'edits',
  'instead',
  'talk',
  'page',
  'seems',
  'care',
  'formatting',
  'actual',
  'info'],
 ['make',
  'real',
  'suggestions',
  'improvement',
  'wondered',
  'section',
  'statistics',
  'later',
  'subsection',
  'types',
  'accidents',
  'think',
  'references',
  'may',
  'need',
  'tidying',
  'exact',
  'format',
  'ie',
  'date',
  'format',
  'etc',
  'later

## Initialize fastText Model

In [49]:
# min_n and max_n -- helps us by setting the minimum and maximum lengths of the character
# n-grams so that we can build representations

In [57]:
# 300-dimensional vectors with a context window of 3; min_count=1 keeps even tokens
# that occur once, and character n-grams of length 1 through 5 are used for the
# subword representations.
model = FastText(vector_size=300, window=3, min_count=1, min_n=1, max_n=5)

In [58]:
# Let us build the vocabulary and check the size of the built vocabulary.

In [63]:
# Build the model's vocabulary from the tokenised sentences before any training happens.
model.build_vocab(corpus_iterable=preprocessed_data)

In [64]:
len(model.wv.index_to_key)

182228

In [65]:
# The vocabulary would have been smaller if we had applied lemmatization or stemming.

In [67]:
# let us train our model

In [None]:
# Train for 10 epochs on the same tokenised sentences the vocabulary was built from;
# total_examples is the number of sentences in that corpus.
model.train(corpus_iterable=preprocessed_data,total_examples=len(preprocessed_data),epochs=10)