In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from collections import Counter
from nltk.corpus import stopwords

In [3]:
# Load the Reddit posts from reddit_clean.csv into a DataFrame
# (the path is relative to the notebook's working directory)
reddit = pd.read_csv('./data/reddit_clean.csv')

In [5]:
# DataFrame.rename returns a new frame instead of renaming in place, so the
# result has to be assigned back for the 'id' column name to actually stick.
reddit = reddit.rename(columns={'Unnamed: 0': 'id'})

0

In [11]:
# Defining a tokenizer/stemmer to use in my CountVectorizer/TfidfVectorizer in the next notebook
def tokenize_and_stem(text):
    """Clean a raw post and return its Porter-stemmed word tokens.

    Steps, in order: lowercase the text and drop every whitespace-delimited
    word that contains 'http'; strip HTML markup with BeautifulSoup; split
    the remaining text into runs of word characters; stem each token.

    Parameters
    ----------
    text : str
        Raw post text. Any non-string value (such as None or NaN) is treated
        as an empty post instead of raising on ``.lower()``.

    Returns
    -------
    list of str
        Stemmed tokens in the order they appear in the cleaned text.
    """
    if not isinstance(text, str):
        return []

    # Getting rid of links: any word containing 'http' is dropped whole
    words = [word for word in text.lower().split() if 'http' not in word]
    text = ' '.join(words)

    # Remove HTML artifacts. The parser is named explicitly so the result does
    # not depend on which optional parsers are installed and so BeautifulSoup
    # does not emit its "no parser was explicitly specified" warning.
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Tokenize clean text by separating out all word characters.
    # Raw string: '\w' in a normal string literal is an invalid escape sequence.
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Stem the tokens
    p_stemmer = PorterStemmer()
    return [p_stemmer.stem(token) for token in tokens]

#### Next, I look at the most frequent words to decide whether any of them should be added to the standard English stop-word list

In [12]:
# Tokenize and stem every post body; yields one list of tokens per row of 'selftext'
processed_posts = list(map(tokenize_and_stem, reddit['selftext']))

In [13]:
# Spot-check the token list produced for a single post (position 8)
processed_posts[8]

['here',
 's',
 'a',
 'playlist',
 'for',
 'you',
 'left',
 'of',
 'the',
 'dial',
 'clean',
 'sound',
 'fun',
 'by',
 'left',
 'of',
 'the',
 'dial']

In [40]:
# Custom stop words: frequent tokens I've judged unhelpful, listed here in
# their raw (unstemmed) form. Splitting one whitespace-separated string keeps
# the list compact while producing the same entries in the same order.
stop_words = (
    'https com www amp '
    'like just spotify because '
    'song music album '
    'want would make know'
).split()

In [41]:
# Full stop-word list: NLTK's English stop words followed by my custom additions
custom_sw = [*stopwords.words('english'), *stop_words]

In [42]:
# Run the stop words through the same tokenizer/stemmer as the posts, so each
# stop word is in the same stemmed form as the post tokens it is compared against.
# The words are joined into one string because tokenize_and_stem takes a single text.
processed_sw = tokenize_and_stem(' '.join(custom_sw))

In [43]:
# Flatten every post's tokens into one list, leaving out the processed stop words.
# processed_sw is a list, so `in` would rescan it for every single token; a set
# built once gives constant-time lookups and exactly the same filtered result.
sw_lookup = frozenset(processed_sw)
processed_words = [
    word
    for post in processed_posts
    for word in post
    if word not in sw_lookup
]

In [44]:
# Count how often each remaining (non-stop-word) token occurs across all posts
c = Counter(processed_words)

In [45]:
# Show the 20 most frequent tokens with their counts, highest count first
c.most_common(20)

[('punk', 2972),
 ('band', 2161),
 ('get', 918),
 ('pop', 894),
 ('one', 838),
 ('new', 807),
 ('listen', 776),
 ('think', 771),
 ('time', 750),
 ('peopl', 740),
 ('realli', 718),
 ('love', 696),
 ('year', 694),
 ('go', 676),
 ('look', 671),
 ('anyon', 582),
 ('find', 523),
 ('guy', 496),
 ('show', 495),
 ('good', 489)]