In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from collections import Counter
from nltk.corpus import stopwords

In [6]:
# Load the cleaned Reddit posts; the `selftext` column is what gets tokenized below
reddit = pd.read_csv('./data/reddit_clean.csv')

In [33]:
# Tokenizer/stemmer to use in my CountVectorizer/TfidfVectorizer in the next notebook.
# This is stored in my_functions.py
def tokenize_and_stem(text):
    '''
    Clean a single document and return its stemmed word tokens.

    This does more than just tokenize and stem: it also removes links and
    HTML artifacts, which makes it suitable for dirty text scraped from Reddit.

    Steps, in order:
      1. Lowercase the text and drop every whitespace-separated chunk that
         contains 'http' (links).
      2. Run the result through BeautifulSoup and keep only the text content.
      3. Tokenize on runs of word characters, which discards punctuation.
      4. Reduce every token to its Porter stem.

    The function handles ONE document per call. To process a whole corpus,
    call it once per document (e.g. in a list comprehension).

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list of str
        Stemmed, lowercased tokens in their original order.
    '''
    # Getting rid of links: any chunk containing 'http' is dropped entirely
    text = ' '.join(word for word in text.lower().split() if 'http' not in word)

    # Remove HTML artifacts. The parser is named explicitly so the result does
    # not depend on which optional parsers happen to be installed, and so
    # BeautifulSoup does not emit a GuessedAtParserWarning on every call.
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Tokenize clean text by separating out all word characters.
    # Raw string: a bare '\w' in a normal string literal is an invalid escape.
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Stem the tokens
    p_stemmer = PorterStemmer()
    return [p_stemmer.stem(token) for token in tokens]

### I want to see what the **most frequent words** are to determine if I need to add any other words to the standard English stop words

In [19]:
# Clean, tokenize and stem every post body; one token list per post
processed_posts = list(map(tokenize_and_stem, reddit['selftext']))

In [27]:
# Custom stop words: frequent tokens in this corpus that I've judged unhelpful.
# Kept in the original order; grouped per line only for readability.
stop_words = [
    'https', 'com', 'www', 'amp',
    'like', 'just',
    'spotify', 'because',
    'song', 'music', 'album',
    'want', 'would', 'make', 'know',
    'becau',  # NOTE(review): presumably a truncated form of 'because' - confirm
]

In [28]:
# NLTK's English stop words first, followed by my custom additions
custom_sw = [*stopwords.words('english'), *stop_words]

In [29]:
# Run the stop words through the same tokenizer/stemmer as the posts so that
# they are compared in the same (stemmed) form as the post tokens
stop_word_text = ' '.join(custom_sw)
processed_sw = tokenize_and_stem(stop_word_text)

In [30]:
# Flatten all posts into one list of stems, leaving out the stop words.
# The lookup uses a set: `word not in <list>` rescans the whole stop-word list
# for every token, whereas a set membership test is constant-time. The
# resulting list is identical.
stop_word_lookup = set(processed_sw)
processed_words = [
    word
    for post in processed_posts
    for word in post
    if word not in stop_word_lookup
]

In [31]:
# Tally how often each remaining (non-stop-word) stem occurs across all posts
c = Counter(processed_words)

In [32]:
# Show the 20 most frequent stems to spot candidates for the custom stop-word list
c.most_common(20)

[('punk', 2972),
 ('band', 2161),
 ('get', 918),
 ('pop', 894),
 ('one', 838),
 ('new', 807),
 ('listen', 776),
 ('think', 771),
 ('time', 750),
 ('peopl', 740),
 ('realli', 718),
 ('love', 696),
 ('year', 694),
 ('go', 676),
 ('look', 671),
 ('anyon', 582),
 ('find', 523),
 ('guy', 496),
 ('show', 495),
 ('good', 489)]

Here are our most frequently used words. We also now have a function, `tokenize_and_stem`, that we can plug into `CountVectorizer()` and `TfidfVectorizer()` as the tokenizer.

## What's next?

Now we can vectorize and model our data to find out which model works best.