In [None]:
#Install the extra packages once: uncomment the conda lines for a local Jupyter environment, or the pip lines for Google Colab
#!conda install contractions
#!conda install pattern

#!pip install contractions
#!pip install pattern

In [None]:
import pandas as pd
import re
import nltk
nltk.download('omw-1.4')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import contractions
from pattern.text.en import singularize
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Read the raw movie table from disk, then take an independent deep copy;
# the copy is the frame whose 'summaries' column is overwritten further down,
# so the raw frame stays untouched.
movieset = pd.read_csv(filepath_or_buffer='movieset.csv')
moviesetstem = movieset.copy(deep=True)

# Removing Contractions
Expanding contractions into the full words they stand for: <br>
didn't -> did not <br>
you're -> you are

In [7]:
def expand_contractions(text):
  """Expand contractions in ``text`` one space-separated word at a time.

  The text is split on single spaces, every piece is passed through
  ``contractions.fix`` and the pieces are joined back with single spaces,
  so the spacing between words is left as it was.
  """
  return " ".join(contractions.fix(word) for word in text.split(' '))


#Build a new series of expanded summaries from the 'summaries' column.
#map() works value by value, so it does not depend on the index being
#0..n-1 the way a manual position counter used as a label would, and it
#returns a new series, leaving movieset itself unchanged.
setsummaries = movieset['summaries'].map(expand_contractions)

# Removing punctuation
Removing punctuation (and then numbers) so that only plain words remain.

In [8]:
#Cleaning up words (Removing punctuation and numbers)

#Single-character punctuation handled in one pass: a hyphen becomes a space
#(so hyphenated words are split in two), every other mark is deleted.
PUNCTUATION_TABLE = str.maketrans({
  '-': ' ',
  ',': None,
  '.': None,
  '!': None,
  '?': None,
  '\'': None,
  ':': None,
  '(': None,
  ')': None,
  '\"': None,
})

#Digit-free stand-in that shields "007" from the number-removal patterns
BOND_PLACEHOLDER = 'doubleOseven'


def remove_punctuation(text):
  """Drop every ``'s`` and the punctuation marks in PUNCTUATION_TABLE."""
  #"'s" has to be removed first, before lone apostrophes are deleted
  return text.replace('\'s', '').translate(PUNCTUATION_TABLE)


def remove_numbers(text):
  """Remove ordinals (1st, 4th, ...) and digit runs, but keep a stand-alone 007.

  str.replace returns a new string, so the protected text must be
  re-assigned; calling it without using the result has no effect.
  Only a whole-word "007" is protected: swapping it inside a longer number
  such as "2007" would leave half of the placeholder behind once the
  patterns below run.
  """
  protected = re.sub(r'\b007\b', BOND_PLACEHOLDER, text) #This number has meaning (James Bond films)
  protected = re.sub('[0-9]+[a-z]+', '', protected) #removes positions (1st, 4th, etc)
  protected = re.sub('[0-9]+', '', protected) #removes series of numbers
  return protected.replace(BOND_PLACEHOLDER, '007') #Bringing back the 007


setsummaries = setsummaries.map(remove_punctuation).map(remove_numbers)

# Stemming words
Removing unnecessary parts of words. <br>

This process uses WordNetLemmatizer from nltk. This section:

* Turns each summary in the "summaries" column into an array of words.
* Makes every word lowercase.
* Cycles through every possible tag in lemmatize (verb, noun, etc.).
* Stops cycling when the word is stemmed.
* Turns plural words into singular
* Puts sentence back together.

However, despite using different methods, some words still slip through.

In [9]:
tags = ['v','s','n','a','r'] #part-of-speech tags tried, in this order, for every word
sentences = pd.Series(setsummaries)
stem = WordNetLemmatizer()


def lemmatize_word(word):
  """Return the lower-cased, lemmatized and singularized form of ``word``.

  Every tag in ``tags`` is passed to ``stem.lemmatize`` in turn and the first
  lemma that differs from the lower-cased word is kept. Comparing lengths
  instead would skip lemmas that are as long as, or longer than, the word
  and would fall through to whatever the last tag returned. When no tag
  changes the word, the word itself is kept. The result is finally passed
  through ``singularize``.
  """
  word = word.lower()
  lemma = word
  for tag in tags:
    candidate = stem.lemmatize(word, tag)
    if candidate != word:
      lemma = candidate
      break #Stop at the first tag that actually changes the word
  return singularize(lemma)


for position, sentence in enumerate(sentences):
  #Split into words, process each one, then put the sentence back together
  words = [lemmatize_word(word) for word in sentence.split(" ")]
  #Write the rebuilt sentence back into the series
  sentences[position] = " ".join(words)


# Transforming into bag of words / n-gram
Use CountVectorizer() to drop stop words and count the number of times each word appears. <br>
* We change the parameter ngram_range to determine whether we want a bag of words model, or an n-gram. <br>

When we start running models, we will then tune the minimum number of occurrences a word needs in order to be kept. <br>
After that, we will add back the rest of the attributes to the bag of words for model usage.

In [None]:
#Getting ready to turn the summaries column into a bag of words
#The lemmatized text is written into `sentences`, so read it from there
#instead of relying on `setsummaries` sharing its data with `sentences`.
moviesetstem['summaries'] = sentences
moviesetstem.to_csv("stemmedmovieset.csv",index = False)

ngram = int(input('What number for n? (1 = bag of words model, 2+ = n-gram model): '))
mindf = int(input('What should be the minimum occurances of words?: '))

#Splitting Summaries, making a dictionary, forming it into a bag of words/n-gram
#ngram_range=(ngram, ngram) keeps only n-grams of exactly that length
CountSum = CountVectorizer(ngram_range=(ngram,ngram),stop_words='english', min_df = mindf)
bow = CountSum.fit_transform(moviesetstem['summaries'])

#get_feature_names() no longer exists in recent scikit-learn releases;
#prefer get_feature_names_out() and only fall back on older installs.
if hasattr(CountSum, 'get_feature_names_out'):
  featurenames = CountSum.get_feature_names_out()
else:
  featurenames = CountSum.get_feature_names()
bagofwords = pd.DataFrame(bow.toarray(),columns=featurenames)

#removing the index so both frames line up row by row
bagofwords = bagofwords.reset_index(drop=True)
moviesetcopy = moviesetstem.reset_index(drop=True)

#Attaching the remaining attributes by row position rather than merging on
#'titles': a merge duplicates and mismatches rows whenever two movies share
#a title. Column order stays: counts, 'titles', then every other attribute
#except 'summaries'.
#NOTE(review): a vocabulary term identical to an attribute column name would
#show up as a repeated column label in the output - confirm this cannot
#happen with the chosen data.
othercolumns = [column for column in moviesetcopy.columns if column not in ('titles', 'summaries')]
bagofwords = pd.concat([bagofwords, moviesetcopy[['titles'] + othercolumns]], axis=1)

bagofwords.to_csv('BagOfWords.csv',index = False)